commit cca8d7a: [Rework] Use multiple search tries for different url extraction types

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Mar 19 16:42:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-03-19 16:40:51 +0000
URL: https://github.com/rspamd/rspamd/commit/cca8d7a476cf60dac291466647d220f3ecae32d7 (HEAD -> master)

[Rework] Use multiple search tries for different url extraction types

---
 src/libserver/url.c | 184 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 139 insertions(+), 45 deletions(-)

diff --git a/src/libserver/url.c b/src/libserver/url.c
index 6b2ecdfde..ff8c30819 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -235,6 +235,7 @@ struct url_callback_data {
 	enum rspamd_url_find_type how;
 	gboolean prefix_added;
 	guint newline_idx;
+	GArray *matchers;
 	GPtrArray *newlines;
 	const gchar *start;
 	const gchar *fin;
@@ -245,8 +246,10 @@ struct url_callback_data {
 };
 
 struct url_match_scanner {
-	GArray *matchers;
-	struct rspamd_multipattern *search_trie;
+	GArray *matchers_full;
+	GArray *matchers_strict;
+	struct rspamd_multipattern *search_trie_full;
+	struct rspamd_multipattern *search_trie_strict;
 };
 
 struct url_match_scanner *url_scanner = NULL;
@@ -469,12 +472,12 @@ rspamd_url_parse_tld_file (const gchar *fname,
 		}
 
 		m.flags = flags;
-		rspamd_multipattern_add_pattern (url_scanner->search_trie, p,
+		rspamd_multipattern_add_pattern (url_scanner->search_trie_full, p,
 				RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
-		m.pattern = rspamd_multipattern_get_pattern (url_scanner->search_trie,
-				rspamd_multipattern_get_npatterns (url_scanner->search_trie) - 1);
+		m.pattern = rspamd_multipattern_get_pattern (url_scanner->search_trie_full,
+				rspamd_multipattern_get_npatterns (url_scanner->search_trie_full) - 1);
 
-		g_array_append_val (url_scanner->matchers, m);
+		g_array_append_val (url_scanner->matchers_full, m);
 	}
 
 	free (linebuf);
@@ -490,27 +493,49 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
 
 	for (i = 0; i < n; i++) {
 		if (static_matchers[i].flags & URL_FLAG_REGEXP) {
-			rspamd_multipattern_add_pattern (url_scanner->search_trie,
+			rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
 					static_matchers[i].pattern,
 					RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
-							RSPAMD_MULTIPATTERN_RE);
+							RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
 		}
 		else {
-			rspamd_multipattern_add_pattern (url_scanner->search_trie,
+			rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
 					static_matchers[i].pattern,
-					RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+					RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
 		}
 	}
 
-	g_array_append_vals (sc->matchers, static_matchers, n);
+	g_array_append_vals (sc->matchers_strict, static_matchers, n);
+
+	if (sc->matchers_full) {
+		for (i = 0; i < n; i++) {
+			if (static_matchers[i].flags & URL_FLAG_REGEXP) {
+				rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
+						static_matchers[i].pattern,
+						RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
+						RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+			}
+			else {
+				rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
+						static_matchers[i].pattern,
+						RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+			}
+		}
+		g_array_append_vals (sc->matchers_full, static_matchers, n);
+	}
 }
 
 void
 rspamd_url_deinit (void)
 {
 	if (url_scanner != NULL) {
-		rspamd_multipattern_destroy (url_scanner->search_trie);
-		g_array_free (url_scanner->matchers, TRUE);
+		if (url_scanner->search_trie_full) {
+			rspamd_multipattern_destroy (url_scanner->search_trie_full);
+			g_array_free (url_scanner->matchers_full, TRUE);
+		}
+
+		rspamd_multipattern_destroy (url_scanner->search_trie_strict);
+		g_array_free (url_scanner->matchers_strict, TRUE);
 		g_free (url_scanner);
 
 		url_scanner = NULL;
@@ -529,18 +554,22 @@ rspamd_url_init (const gchar *tld_file)
 
 	url_scanner = g_malloc (sizeof (struct url_match_scanner));
 
+	url_scanner->matchers_strict = g_array_sized_new (FALSE, TRUE,
+			sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers));
+	url_scanner->search_trie_strict = rspamd_multipattern_create_sized (
+			G_N_ELEMENTS (static_matchers),
+			RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+
 	if (tld_file) {
 		/* Reserve larger multipattern */
-		url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
+		url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE,
 				sizeof (struct url_matcher), 13000);
-		url_scanner->search_trie = rspamd_multipattern_create_sized (13000,
+		url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000,
 				RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
 	}
 	else {
-		url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
-				sizeof (struct url_matcher), 128);
-		url_scanner->search_trie = rspamd_multipattern_create_sized (128,
-				RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+		url_scanner->matchers_full = NULL;
+		url_scanner->search_trie_full = NULL;
 	}
 
 	rspamd_url_add_static_matchers (url_scanner);
@@ -549,27 +578,36 @@ rspamd_url_init (const gchar *tld_file)
 		ret = rspamd_url_parse_tld_file (tld_file, url_scanner);
 	}
 
-	if (url_scanner->matchers->len > 1000) {
+	if (url_scanner->matchers_full && url_scanner->matchers_full->len > 1000) {
 		msg_info ("start compiling of %d TLD suffixes; it might take a long time",
-				url_scanner->matchers->len);
+				url_scanner->matchers_full->len);
 	}
 
-	if (!rspamd_multipattern_compile (url_scanner->search_trie, &err)) {
-		msg_err ("cannot compile tld patterns, url matching will be "
-				 "broken completely: %e", err);
-		g_error_free (err);
-		ret = FALSE;
+	if (!rspamd_multipattern_compile (url_scanner->search_trie_strict, &err)) {
+		msg_err ("cannot compile url matcher static patterns, fatal error: %e", err);
+		g_abort ();
+	}
+
+	if (url_scanner->search_trie_full) {
+		if (!rspamd_multipattern_compile (url_scanner->search_trie_full, &err)) {
+			msg_err ("cannot compile tld patterns, url matching will be "
+					 "broken completely: %e", err);
+			g_error_free (err);
+			ret = FALSE;
+		}
 	}
 
 	if (tld_file != NULL) {
 		if (ret) {
 			msg_info ("initialized %ud url match suffixes from '%s'",
-					url_scanner->matchers->len, tld_file);
+					url_scanner->matchers_full->len - url_scanner->matchers_strict->len,
+					tld_file);
 		}
 		else {
 			msg_err ("failed to initialize url tld suffixes from '%s', "
 					 "use %ud internal match suffixes",
-					tld_file, url_scanner->matchers->len);
+					tld_file,
+					url_scanner->matchers_strict->len);
 		}
 	}
 }
@@ -1557,7 +1595,7 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
 	struct rspamd_url *url = context;
 	gint ndots;
 
-	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
+	matcher = &g_array_index (url_scanner->matchers_full, struct url_matcher,
 			strnum);
 	ndots = 1;
 
@@ -2243,9 +2281,11 @@ rspamd_url_parse (struct rspamd_url *uri,
 
 	if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
 		/* Find TLD part */
-		rspamd_multipattern_lookup (url_scanner->search_trie,
-				rspamd_url_host_unsafe (uri), uri->hostlen,
-				rspamd_tld_trie_callback, uri, NULL);
+		if (url_scanner->search_trie_full)  {
+			rspamd_multipattern_lookup (url_scanner->search_trie_full,
+					rspamd_url_host_unsafe (uri), uri->hostlen,
+					rspamd_tld_trie_callback, uri, NULL);
+		}
 
 		if (uri->tldlen == 0) {
 			if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
@@ -2325,7 +2365,7 @@ rspamd_tld_trie_find_callback (struct rspamd_multipattern *mp,
 	struct tld_trie_cbdata *cbdata = context;
 	gint ndots = 1;
 
-	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
+	matcher = &g_array_index (url_scanner->matchers_full, struct url_matcher,
 			strnum);
 
 	if (matcher->flags & URL_FLAG_STAR_MATCH) {
@@ -2384,8 +2424,10 @@ rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out)
 	cbdata.out = out;
 	out->len = 0;
 
-	rspamd_multipattern_lookup (url_scanner->search_trie, in, inlen,
-			rspamd_tld_trie_find_callback, &cbdata, NULL);
+	if (url_scanner->search_trie_full) {
+		rspamd_multipattern_lookup (url_scanner->search_trie_full, in, inlen,
+				rspamd_tld_trie_find_callback, &cbdata, NULL);
+	}
 
 	if (out->len > 0) {
 		return TRUE;
@@ -2880,7 +2922,7 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
 		return 0;
 	}
 
-	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
+	matcher = &g_array_index (cb->matchers, struct url_matcher,
 			strnum);
 
 	if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
@@ -2971,8 +3013,26 @@ rspamd_url_find (rspamd_mempool_t *pool,
 	cb.how = how;
 	cb.pool = pool;
 
-	ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
-			rspamd_url_trie_callback, &cb, NULL);
+	if (how == RSPAMD_URL_FIND_ALL) {
+		if (url_scanner->search_trie_full) {
+			cb.matchers = url_scanner->matchers_full;
+			ret = rspamd_multipattern_lookup (url_scanner->search_trie_full,
+					begin, len,
+					rspamd_url_trie_callback, &cb, NULL);
+		}
+		else {
+			cb.matchers = url_scanner->matchers_strict;
+			ret = rspamd_multipattern_lookup (url_scanner->search_trie_strict,
+					begin, len,
+					rspamd_url_trie_callback, &cb, NULL);
+		}
+	}
+	else {
+		cb.matchers = url_scanner->matchers_strict;
+		ret = rspamd_multipattern_lookup (url_scanner->search_trie_strict,
+				begin, len,
+				rspamd_url_trie_callback, &cb, NULL);
+	}
 
 	if (ret) {
 		if (url_str) {
@@ -3018,7 +3078,7 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
 		return 0;
 	}
 
-	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
+	matcher = &g_array_index (cb->matchers, struct url_matcher,
 			strnum);
 	pool = cb->pool;
 
@@ -3292,9 +3352,26 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool,
 	cb.func = func;
 	cb.newlines = nlines;
 
-	rspamd_multipattern_lookup (url_scanner->search_trie, in,
-			inlen,
-			rspamd_url_trie_generic_callback_multiple, &cb, NULL);
+	if (how == RSPAMD_URL_FIND_ALL) {
+		if (url_scanner->search_trie_full) {
+			cb.matchers = url_scanner->matchers_full;
+			rspamd_multipattern_lookup (url_scanner->search_trie_full,
+					in, inlen,
+					rspamd_url_trie_generic_callback_multiple, &cb, NULL);
+		}
+		else {
+			cb.matchers = url_scanner->matchers_strict;
+			rspamd_multipattern_lookup (url_scanner->search_trie_strict,
+					in, inlen,
+					rspamd_url_trie_generic_callback_multiple, &cb, NULL);
+		}
+	}
+	else {
+		cb.matchers = url_scanner->matchers_strict;
+		rspamd_multipattern_lookup (url_scanner->search_trie_strict,
+				in, inlen,
+				rspamd_url_trie_generic_callback_multiple, &cb, NULL);
+	}
 }
 
 void
@@ -3322,9 +3399,26 @@ rspamd_url_find_single (rspamd_mempool_t *pool,
 	cb.funcd = ud;
 	cb.func = func;
 
-	rspamd_multipattern_lookup (url_scanner->search_trie, in,
-			inlen,
-			rspamd_url_trie_generic_callback_single, &cb, NULL);
+	if (how == RSPAMD_URL_FIND_ALL) {
+		if (url_scanner->search_trie_full) {
+			cb.matchers = url_scanner->matchers_full;
+			rspamd_multipattern_lookup (url_scanner->search_trie_full,
+					in, inlen,
+					rspamd_url_trie_generic_callback_single, &cb, NULL);
+		}
+		else {
+			cb.matchers = url_scanner->matchers_strict;
+			rspamd_multipattern_lookup (url_scanner->search_trie_strict,
+					in, inlen,
+					rspamd_url_trie_generic_callback_single, &cb, NULL);
+		}
+	}
+	else {
+		cb.matchers = url_scanner->matchers_strict;
+		rspamd_multipattern_lookup (url_scanner->search_trie_strict,
+				in, inlen,
+				rspamd_url_trie_generic_callback_single, &cb, NULL);
+	}
 }
 
 


More information about the Commits mailing list