commit a4977e1: [Rework] Fix various url extraction issues

Vsevolod Stakhov <vsevolod at highsecure.ru>
Thu Mar 19 18:35:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-03-19 18:33:16 +0000
URL: https://github.com/rspamd/rspamd/commit/a4977e18de67905b2863514feeff7a77025d4087 (HEAD -> master)

[Rework] Fix various url extraction issues

---
 src/libserver/url.c        | 14 ++++++++------
 src/libutil/multipattern.c | 17 +++++++++--------
 test/lua/unit/url.lua      |  4 +++-
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/libserver/url.c b/src/libserver/url.c
index ff8c30819..2e0991406 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -496,12 +496,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
 			rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
 					static_matchers[i].pattern,
 					RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
-							RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+							RSPAMD_MULTIPATTERN_RE);
 		}
 		else {
 			rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
 					static_matchers[i].pattern,
-					RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+					RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
 		}
 	}
 
@@ -513,12 +513,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
 				rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
 						static_matchers[i].pattern,
 						RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
-						RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+						RSPAMD_MULTIPATTERN_RE);
 			}
 			else {
 				rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
 						static_matchers[i].pattern,
-						RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+						RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
 			}
 		}
 		g_array_append_vals (sc->matchers_full, static_matchers, n);
@@ -558,14 +558,14 @@ rspamd_url_init (const gchar *tld_file)
 			sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers));
 	url_scanner->search_trie_strict = rspamd_multipattern_create_sized (
 			G_N_ELEMENTS (static_matchers),
-			RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+			RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
 
 	if (tld_file) {
 		/* Reserve larger multipattern */
 		url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE,
 				sizeof (struct url_matcher), 13000);
 		url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000,
-				RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+				RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
 	}
 	else {
 		url_scanner->matchers_full = NULL;
@@ -3173,6 +3173,8 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
 	}
 	else {
 		cb->url_str = NULL;
+		/* Continue search if no pattern has been found */
+		return 0;
 	}
 
 	/* Continue search if required (return 0 means continue) */
diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c
index f1295a9e4..547762d26 100644
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -103,7 +103,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
 	/*
 	 * We understand the following cases
 	 * 1) blah -> .blah\b
-	 * 2) *.blah -> ..*\\.blah\b
+	 * 2) *.blah -> ..*\\.blah\b|$
 	 * 3) ???
 	 */
 
@@ -127,7 +127,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
 		len = slen + strlen (prefix);
 	}
 
-	suffix = "\\b";
+	suffix = "(:?\\b|$)";
 	len += strlen (suffix);
 
 	res = g_malloc (len + 1);
@@ -329,26 +329,27 @@ rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp,
 	if (rspamd_hs_check ()) {
 		gchar *np;
 		gint fl = HS_FLAG_SOM_LEFTMOST;
+		gint adjusted_flags = mp->flags | flags;
 
-		if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
+		if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) {
 			fl |= HS_FLAG_CASELESS;
 		}
-		if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
-			if (mp->flags & RSPAMD_MULTIPATTERN_TLD) {
+		if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) {
+			if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) {
 				fl |= HS_FLAG_UTF8;
 			}
 			else {
 				fl |= HS_FLAG_UTF8 | HS_FLAG_UCP;
 			}
 		}
-		if (mp->flags & RSPAMD_MULTIPATTERN_DOTALL) {
+		if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) {
 			fl |= HS_FLAG_DOTALL;
 		}
-		if (mp->flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
+		if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
 			fl |= HS_FLAG_SINGLEMATCH;
 			fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */
 		}
-		if (mp->flags & RSPAMD_MULTIPATTERN_NO_START) {
+		if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
 			fl &= ~HS_FLAG_SOM_LEFTMOST;
 		}
 
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua
index 24c354960..9647db79b 100644
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -83,7 +83,9 @@ context("URL check functions", function()
     {"http:/\\www.google.com/foo?bar=baz#", true, {
       host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com'
     }},
-    {"http://[www.google.com]/", false},
+    {"http://[www.google.com]/", true, {
+      host = 'www.google.com',
+    }},
     {"<test.com", true, {
       host = 'test.com', tld = 'test.com',
     }},


More information about the Commits mailing list