commit d0974f0: [Fix] Fix trie code when there are regexps and Hyperscan is absent

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Sep 10 11:42:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-09-10 12:34:52 +0100
URL: https://github.com/rspamd/rspamd/commit/d0974f01f91da985d7646f6ef64fed1e053c64b2 (HEAD -> master)

[Fix] Fix trie code when there are regexps and Hyperscan is absent

---
 src/libutil/multipattern.c | 74 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 11 deletions(-)

diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c
index b2cdc6645..0fc028969 100644
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -25,6 +25,7 @@
 #include "hs.h"
 #endif
 #include "acism.h"
+#include "libutil/regexp.h"
 #include <stdalign.h>
 
 #define MAX_SCRATCH 4
@@ -51,6 +52,7 @@ struct RSPAMD_ALIGNED(64) rspamd_multipattern {
 #endif
 	ac_trie_t *t;
 	GArray *pats;
+	GArray *res;
 
 	gboolean compiled;
 	guint cnt;
@@ -192,14 +194,14 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
 		gsize *dst_len)
 {
 	gchar *ret = NULL;
-#ifdef WITH_HYPERSCAN
-	if (rspamd_hs_check ()) {
-		gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
+	gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
 
-		if (flags & RSPAMD_MULTIPATTERN_UTF8) {
-			gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
-		}
+	if (flags & RSPAMD_MULTIPATTERN_UTF8) {
+		gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
+	}
 
+#ifdef WITH_HYPERSCAN
+	if (rspamd_hs_check ()) {
 		if (flags & RSPAMD_MULTIPATTERN_TLD) {
 			gchar *tmp;
 			gsize tlen;
@@ -228,6 +230,14 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
 	if (flags & RSPAMD_MULTIPATTERN_TLD) {
 		ret = rspamd_multipattern_escape_tld_acism (pattern, len, dst_len);
 	}
+	else if (flags & RSPAMD_MULTIPATTERN_RE) {
+		ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags |
+															   RSPAMD_REGEXP_ESCAPE_RE);
+	}
+	else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
+		ret = rspamd_str_regexp_escape (pattern, len, dst_len,
+				gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
+	}
 	else {
 		ret = malloc (len + 1);
 		*dst_len = rspamd_strlcpy (ret, pattern, len + 1);
@@ -496,7 +506,30 @@ rspamd_multipattern_compile (struct rspamd_multipattern *mp, GError **err)
 #endif
 
 	if (mp->cnt > 0) {
-		mp->t = acism_create ((const ac_trie_pat_t *)mp->pats->data, mp->cnt);
+
+		if (mp->flags & (RSPAMD_MULTIPATTERN_GLOB|RSPAMD_MULTIPATTERN_RE)) {
+			/* Fallback to pcre... */
+			rspamd_regexp_t *re;
+			mp->res = g_array_sized_new (FALSE, TRUE,
+					sizeof (rspamd_regexp_t *), mp->cnt);
+
+			for (guint i = 0; i < mp->cnt; i ++) {
+				const ac_trie_pat_t *pat;
+
+				pat = &g_array_index (mp->pats, ac_trie_pat_t, i);
+
+				re = rspamd_regexp_new (pat->ptr, NULL, err);
+
+				if (re == NULL) {
+					return FALSE;
+				}
+
+				g_array_append_val (mp->res, re);
+			}
+		}
+		else {
+			mp->t = acism_create ((const ac_trie_pat_t *) mp->pats->data, mp->cnt);
+		}
 	}
 
 	mp->compiled = TRUE;
@@ -617,11 +650,30 @@ rspamd_multipattern_lookup (struct rspamd_multipattern *mp,
 
 	gint state = 0;
 
-	ret = acism_lookup (mp->t, in, len, rspamd_multipattern_acism_cb, &cbd,
-			&state, mp->flags & RSPAMD_MULTIPATTERN_ICASE);
+	if (mp->flags & (RSPAMD_MULTIPATTERN_GLOB|RSPAMD_MULTIPATTERN_RE)) {
+		/* Terribly inefficient, but who cares - just use hyperscan */
+		for (guint i = 0; i < mp->cnt; i ++) {
+			rspamd_regexp_t *re = g_array_index (mp->res, rspamd_regexp_t *, i);
+			const gchar *start = NULL, *end = NULL;
+
+			while (rspamd_regexp_search (re,
+					in,
+					len,
+					&start,
+					&end,
+					TRUE,
+					NULL)) {
+				ret = rspamd_multipattern_acism_cb (i, end - in, &cbd);
+			}
+		}
+	}
+	else {
+		ret = acism_lookup (mp->t, in, len, rspamd_multipattern_acism_cb, &cbd,
+				&state, mp->flags & RSPAMD_MULTIPATTERN_ICASE);
 
-	if (pnfound) {
-		*pnfound = cbd.nfound;
+		if (pnfound) {
+			*pnfound = cbd.nfound;
+		}
 	}
 
 	return ret;


More information about the Commits mailing list