commit 10bb08d: [Minor] Various fixes for display link detection

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Mar 5 17:21:07 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-03-05 16:50:48 +0000
URL: https://github.com/rspamd/rspamd/commit/10bb08dd2d8484d0d1d2ae507b94aaa24f48b61b

[Minor] Various fixes for display link detection

---
 src/controller.c                 |  4 ++--
 src/libserver/html.c             | 24 ++++++++++++++++++------
 src/libserver/http/http_router.c |  2 +-
 src/libserver/http/http_util.c   |  2 +-
 src/libserver/http/http_util.h   |  2 +-
 src/libserver/url.c              |  2 +-
 src/libutil/str_util.c           |  2 +-
 src/libutil/str_util.h           |  2 +-
 8 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/controller.c b/src/controller.c
index 174382879..0ecaf860d 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -2764,7 +2764,7 @@ rspamd_controller_handle_custom (struct rspamd_http_connection_entry *conn_ent,
 	http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
 
 	if (u.field_set & (1 << UF_PATH)) {
-		guint unnorm_len;
+		gsize unnorm_len;
 		lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
 		lookup.len = u.field_data[UF_PATH].len;
 
@@ -2971,7 +2971,7 @@ rspamd_controller_handle_lua_plugin (struct rspamd_http_connection_entry *conn_e
 	http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
 
 	if (u.field_set & (1 << UF_PATH)) {
-		guint unnorm_len;
+		gsize unnorm_len;
 		lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
 		lookup.len = u.field_data[UF_PATH].len;
 
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 5b3aafca0..401c55f31 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1452,7 +1452,8 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 	gsize decoded_len;
 	const gchar *p, *s, *prefix = "http://";
 	gchar *d;
-	guint i, dlen;
+	guint i;
+	gsize dlen;
 	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
 	static const gchar hexdigests[16] = "0123456789abcdef";
 
@@ -2588,8 +2589,11 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
 	struct rspamd_url *turl;
 	gboolean url_found = FALSE;
 	struct rspamd_process_exception *ex;
+	enum rspamd_normalise_result norm_res;
+	guint saved_flags = 0;
+	gsize dlen;
 
-	if (href_offset <= 0) {
+	if (href_offset < 0) {
 		/* No dispalyed url, just some text within <a> tag */
 		return;
 	}
@@ -2597,15 +2601,23 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
 	url->visible_part = rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
 	rspamd_strlcpy (url->visible_part, dest->data + href_offset,
 			dest->len - href_offset + 1);
-	g_strstrip (url->visible_part);
+	dlen = dest->len - href_offset;
+	url->visible_part =
+			(gchar *)rspamd_string_len_strip (url->visible_part, &dlen, " \t\v\r\n");
+
+	norm_res = rspamd_normalise_unicode_inplace (pool, url->visible_part, &dlen);
+
+	if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
+		saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+	}
 
 	rspamd_html_url_is_phished (pool, url,
-			dest->data + href_offset,
-			dest->len - href_offset,
+			url->visible_part,
+			dlen,
 			&url_found, &displayed_url);
 
 	if (url_found) {
-		url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+		url->flags |= saved_flags|RSPAMD_URL_FLAG_DISPLAY_URL;
 	}
 
 	if (exceptions && url_found) {
diff --git a/src/libserver/http/http_router.c b/src/libserver/http/http_router.c
index a5b960e72..960df0ce3 100644
--- a/src/libserver/http/http_router.c
+++ b/src/libserver/http/http_router.c
@@ -291,7 +291,7 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn,
 			http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
 
 			if (u.field_set & (1 << UF_PATH)) {
-				guint unnorm_len;
+				gsize unnorm_len;
 
 				pathbuf = g_malloc (u.field_data[UF_PATH].len);
 				memcpy (pathbuf, msg->url->str + u.field_data[UF_PATH].off,
diff --git a/src/libserver/http/http_util.c b/src/libserver/http/http_util.c
index ec9d9fa58..fd5adb3c1 100644
--- a/src/libserver/http/http_util.c
+++ b/src/libserver/http/http_util.c
@@ -302,7 +302,7 @@ rspamd_http_date_format (gchar *buf, gsize len, time_t time)
 }
 
 void
-rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen)
+rspamd_http_normalize_path_inplace (gchar *path, guint len, gsize *nlen)
 {
 	const gchar *p, *end, *slash = NULL, *dot = NULL;
 	gchar *o;
diff --git a/src/libserver/http/http_util.h b/src/libserver/http/http_util.h
index 7a22ffb16..19b497f30 100644
--- a/src/libserver/http/http_util.h
+++ b/src/libserver/http/http_util.h
@@ -47,7 +47,7 @@ glong rspamd_http_date_format (gchar *buf, gsize len, time_t time);
  * @param len
  * @param nlen
  */
-void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen);
+void rspamd_http_normalize_path_inplace (gchar *path, guint len, gsize *nlen);
 
 #ifdef  __cplusplus
 }
diff --git a/src/libserver/url.c b/src/libserver/url.c
index d83c1988f..a5de7ebdf 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2166,7 +2166,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 	gchar *p;
 	const gchar *end;
 	guint i, complen, ret, flags = 0;
-	guint unquoted_len = 0;
+	gsize unquoted_len = 0;
 
 	memset (uri, 0, sizeof (*uri));
 	memset (&u, 0, sizeof (u));
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 5a44ed311..00774d588 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -3023,7 +3023,7 @@ rspamd_get_unicode_normalizer (void)
 
 enum rspamd_normalise_result
 rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
-		guint *len)
+		gsize *len)
 {
 #if U_ICU_VERSION_MAJOR_NUM >= 44
 	UErrorCode uc_err = U_ZERO_ERROR;
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 0e66d0ed1..427d6b94e 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -491,7 +491,7 @@ enum rspamd_normalise_result {
  * @return TRUE if a string has been normalised
  */
 enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
-															   gchar *start, guint *len);
+															   gchar *start, gsize *len);
 
 enum rspamd_regexp_escape_flags {
 	RSPAMD_REGEXP_ESCAPE_ASCII = 0,


More information about the Commits mailing list