commit b7467f9: [Fix] Urls: Fix processing of html urls when it comes to the flags

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Mar 6 23:56:28 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-03-06 23:49:16 +0000
URL: https://github.com/rspamd/rspamd/commit/b7467f9d294faf54f25d2b2fc32255f613097416 (HEAD -> master)

[Fix] Urls: Fix processing of html urls when it comes to the flags
Issue: #3664

---
 src/libserver/html.c | 11 +++++++---
 src/libserver/url.c  | 58 +++++++++++++++++++++++++++++++++++++---------------
 src/libserver/url.h  |  5 +++--
 src/lua/lua_task.c   |  2 +-
 4 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 974b59129..aa1cdf6cc 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1764,7 +1764,7 @@ rspamd_html_url_query_callback (struct rspamd_url *url, gsize start_offset,
 
 	url->flags |= RSPAMD_URL_FLAG_QUERY;
 
-	if (rspamd_url_set_add_or_increase (cbd->url_set, url) && cbd->part_urls) {
+	if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
 		g_ptr_array_add (cbd->part_urls, url);
 	}
 
@@ -1903,7 +1903,7 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 						if (img->url) {
 							img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
 
-							if (rspamd_url_set_add_or_increase (url_set, img->url) &&
+							if (rspamd_url_set_add_or_increase(url_set, img->url, false) &&
 								part_urls) {
 								g_ptr_array_add (part_urls, img->url);
 							}
@@ -3245,10 +3245,15 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 						if (url != NULL) {
 
 							if (url_set != NULL) {
-								if (rspamd_url_set_add_or_increase (url_set, url)) {
+								struct rspamd_url *maybe_existing =
+										rspamd_url_set_add_or_return (url_set, url);
+								if (maybe_existing == url) {
 									rspamd_process_html_url (pool, url, url_set,
 											part_urls);
 								}
+								else {
+									url = maybe_existing;
+								}
 							}
 
 							href_offset = dest->len;
diff --git a/src/libserver/url.c b/src/libserver/url.c
index a5de7ebdf..8183213b6 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -3377,7 +3377,7 @@ rspamd_url_query_callback (struct rspamd_url *url, gsize start_offset,
 	url->flags |= RSPAMD_URL_FLAG_QUERY;
 
 
-	if (rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url)) {
+	if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false)) {
 		if (cbd->part && cbd->part->mime_part->urls) {
 			g_ptr_array_add (cbd->part->mime_part->urls, url);
 		}
@@ -3433,8 +3433,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 
 	url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
 
-	if (rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url) &&
-			cbd->part->mime_part->urls) {
+	if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false) &&
+		cbd->part->mime_part->urls) {
 		g_ptr_array_add (cbd->part->mime_part->urls, url);
 	}
 
@@ -3592,7 +3592,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
 		}
 	}
 
-	rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);
+	rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false);
 
 	/* We also search the query for additional url inside */
 	if (url->querylen > 0) {
@@ -3622,8 +3622,8 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
 					}
 				}
 
-				rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls),
-						query_url);
+				rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls),
+						query_url, false);
 			}
 		}
 	}
@@ -4044,21 +4044,44 @@ rspamd_url_protocol_from_string (const gchar *str)
 
 
 bool
-rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
-									 struct rspamd_url *u)
+rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set,
+							   struct rspamd_url *u,
+							   bool enforce_replace)
 {
 	khiter_t k;
 	gint r;
 
-	k = kh_put (rspamd_url_hash, set, u, &r);
+	k = kh_get (rspamd_url_hash, set, u);
 
-	if (r == 0) {
+	if (k != kh_end (set)) {
+		/* Existing url */
 		struct rspamd_url *ex = kh_key (set, k);
-
-		ex->count ++;
+#define SUSPICIOUS_URL_FLAGS (RSPAMD_URL_FLAG_PHISHED|RSPAMD_URL_FLAG_OBSCURED|RSPAMD_URL_FLAG_ZW_SPACES)
+		if (enforce_replace) {
+			kh_key (set, k) = u;
+			u->count++;
+		}
+		else {
+			if (u->flags & SUSPICIOUS_URL_FLAGS) {
+				if (!(ex->flags & SUSPICIOUS_URL_FLAGS)) {
+					/* Propagate new url to an old one */
+					kh_key (set, k) = u;
+					u->count++;
+				}
+				else {
+					ex->count++;
+				}
+			}
+			else {
+				ex->count++;
+			}
+		}
 
 		return false;
 	}
+	else {
+		k = kh_put (rspamd_url_hash, set, u, &r);
+	}
 
 	return true;
 }
@@ -4071,12 +4094,15 @@ rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
 	gint r;
 
 	if (set) {
-		k = kh_put (rspamd_url_hash, set, u, &r);
+		k = kh_get (rspamd_url_hash, set, u);
 
-		if (r == 0) {
-			struct rspamd_url *ex = kh_key (set, k);
+		if (k != kh_end (set)) {
+			return kh_key (set, k);
+		}
+		else {
+			k = kh_put (rspamd_url_hash, set, u, &r);
 
-			return ex;
+			return kh_key (set, k);
 		}
 	}
 
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 567cdd137..59485ab9a 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -296,8 +296,9 @@ KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char);
  * @param u
  * @return true if a new url has been added
  */
-bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
-		struct rspamd_url *u);
+bool rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set,
+									struct rspamd_url *u,
+									bool enforce_replace);
 
 /**
  * Same as rspamd_url_set_add_or_increase but returns the existing url if found
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index d2bd17aba..579f04fb9 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -2507,7 +2507,7 @@ lua_task_inject_url (lua_State * L)
 	}
 
 	if (task && task->message && url && url->url) {
-		if (rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url->url)) {
+		if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url->url, false)) {
 			if (mpart && mpart->urls) {
 				/* Also add url to the mime part */
 				g_ptr_array_add (mpart->urls, url->url);


More information about the Commits mailing list