commit 788ca58: [Feature] Composites: Improve composite atoms parser

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Nov 4 12:21:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-11-04 12:17:59 +0000
URL: https://github.com/rspamd/rspamd/commit/788ca58f26cee86c985ef45a53b78d81cb13fcdc (HEAD -> master)

[Feature] Composites: Improve composite atoms parser

---
 src/libserver/composites.c | 281 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 233 insertions(+), 48 deletions(-)

diff --git a/src/libserver/composites.c b/src/libserver/composites.c
index c1ee471f8..9f7f548ab 100644
--- a/src/libserver/composites.c
+++ b/src/libserver/composites.c
@@ -108,82 +108,267 @@ static rspamd_expression_atom_t *
 rspamd_composite_expr_parse (const gchar *line, gsize len,
 		rspamd_mempool_t *pool, gpointer ud, GError **err)
 {
-	gsize clen;
+	gsize clen = 0;
 	rspamd_expression_atom_t *res;
 	struct rspamd_composite_atom *atom;
+	const gchar *p, *end;
+	enum composite_expr_state {
+		comp_state_read_symbol = 0,
+		comp_state_read_obrace,
+		comp_state_read_option,
+		comp_state_read_regexp,
+		comp_state_read_regexp_end,
+		comp_state_read_comma,
+		comp_state_read_ebrace,
+		comp_state_read_end
+	} state = comp_state_read_symbol;
+
+	end = line + len;
+	p = line;
+
+	/* Find length of the atom using a reduced state machine */
+	while (p < end) {
+		if (state == comp_state_read_end) {
+			break;
+		}
 
-	/*
-	 * Composites are just sequences of symbols
-	 */
-	clen = strcspn (line, "; \t()><!|&\n");
-	if (clen == 0) {
-		/* Invalid composite atom */
-		g_set_error (err, rspamd_composites_quark (), 100, "Invalid composite: %s",
-				line);
+		switch (state) {
+		case comp_state_read_symbol:
+			clen = rspamd_memcspn (p, "[; \t()><!|&\n", len);
+			p += clen;
+
+			if (*p == '[') {
+				state = comp_state_read_obrace;
+			}
+			else {
+				state = comp_state_read_end;
+			}
+			break;
+		case comp_state_read_obrace:
+			p ++;
+
+			if (*p == '/') {
+				p ++;
+				state = comp_state_read_regexp;
+			}
+			else {
+				state = comp_state_read_option;
+			}
+			break;
+		case comp_state_read_regexp:
+			if (*p == '\\' && p + 1 < end) {
+				/* Escaping */
+				p ++;
+			}
+			else if (*p == '/') {
+				/* End of regexp, possible flags */
+				state = comp_state_read_regexp_end;
+			}
+			p ++;
+			break;
+		case comp_state_read_option:
+		case comp_state_read_regexp_end:
+			if (*p == ',') {
+				p ++;
+				state = comp_state_read_comma;
+			}
+			else if (*p == ']') {
+				state = comp_state_read_ebrace;
+			}
+			else {
+				p ++;
+			}
+			break;
+		case comp_state_read_comma:
+			if (!g_ascii_isspace (*p)) {
+				if (*p == '/') {
+					state = comp_state_read_regexp;
+				}
+				else if (*p == ']') {
+					state = comp_state_read_ebrace;
+				}
+				else {
+					state = comp_state_read_option;
+				}
+			}
+			else {
+				/* Skip spaces after comma */
+				p ++;
+			}
+			break;
+		case comp_state_read_ebrace:
+			p ++;
+			state = comp_state_read_end;
+			break;
+		case comp_state_read_end:
+			g_assert_not_reached ();
+		}
+	}
+
+	if (state != comp_state_read_end) {
+		g_set_error (err, rspamd_composites_quark (), 100, "invalid composite: %s;"
+														   "parser stopped in state %d",
+				line, state);
 		return NULL;
 	}
 
+	clen = p - line;
+	p = line;
+	state = comp_state_read_symbol;
+
+	atom = rspamd_mempool_alloc0 (pool, sizeof (*atom));
 	res = rspamd_mempool_alloc0 (pool, sizeof (*res));
 	res->len = clen;
 	res->str = line;
 
-	atom = rspamd_mempool_alloc0 (pool, sizeof (*atom));
+	/* Full state machine to fill a composite atom */
+	const gchar *opt_start = NULL;
+
+	while (p < end) {
+		struct rspamd_composite_option_match *opt_match;
+
+		if (state == comp_state_read_end) {
+			break;
+		}
+
+		switch (state) {
+		case comp_state_read_symbol:
+			clen = rspamd_memcspn (p, "[; \t()><!|&\n", len);
+			p += clen;
+
+			if (*p == '[') {
+				state = comp_state_read_obrace;
+			}
+			else {
+				state = comp_state_read_end;
+			}
+
+			atom->symbol = rspamd_mempool_alloc (pool, clen + 1);
+			rspamd_strlcpy (atom->symbol, line, clen + 1);
+
+			break;
+		case comp_state_read_obrace:
+			p ++;
 
-	/* Now check for options combinations */
-	const gchar *obrace, *ebrace;
+			if (*p == '/') {
+				opt_start = p;
+				p ++; /* Starting slash */
+				state = comp_state_read_regexp;
+			}
+			else {
+				state = comp_state_read_option;
+				opt_start = p;
+			}
+
+			break;
+		case comp_state_read_regexp:
+			if (*p == '\\' && p + 1 < end) {
+				/* Escaping */
+				p ++;
+			}
+			else if (*p == '/') {
+				/* End of regexp, possible flags */
+				state = comp_state_read_regexp_end;
+			}
+			p ++;
+			break;
+		case comp_state_read_option:
+			if (*p == ',' || *p == ']') {
+				opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match));
+				/* Plain match */
+				gchar *opt_buf;
+				gint opt_len = p - opt_start;
 
-	if ((obrace = memchr (line, '[', clen)) != NULL && obrace > line) {
-		atom->symbol = rspamd_mempool_alloc (pool, obrace - line + 1);
-		rspamd_strlcpy (atom->symbol, line, obrace - line + 1);
-		ebrace = memchr (line, ']', clen);
+				opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
+				rspamd_strlcpy (opt_buf, opt_start, opt_len + 1);
 
-		if (ebrace != NULL && ebrace > obrace) {
-			/* We can make a list of options */
-			gchar **opts = rspamd_string_len_split (obrace + 1,
-					ebrace - obrace - 1, ",", -1, pool);
+				opt_match->data.match = opt_buf;
+				opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN;
 
-			for (guint i = 0; opts[i] != NULL; i ++) {
-				struct rspamd_composite_option_match *opt_match;
+				DL_APPEND (atom->opts, opt_match);
 
+				if (*p == ',') {
+					p++;
+					state = comp_state_read_comma;
+				}
+				else {
+					state = comp_state_read_ebrace;
+				}
+			}
+			else {
+				p ++;
+			}
+			break;
+		case comp_state_read_regexp_end:
+			if (*p == ',' || *p == ']') {
 				opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match));
+				/* Plain match */
+				gchar *opt_buf;
+				gint opt_len = p - opt_start;
 
-				if (opts[i][0] == '/' && strchr (opts[i] + 1, '/') != NULL) {
-					/* Regexp */
-					rspamd_regexp_t *re;
-					GError *re_err = NULL;
+				opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
+				rspamd_strlcpy (opt_buf, opt_start, opt_len + 1);
 
-					re = rspamd_regexp_new (opts[i], NULL, &re_err);
+				rspamd_regexp_t *re;
+				GError *re_err = NULL;
 
-					if (re == NULL) {
-						msg_err_pool ("cannot create regexp from string %s: %e",
-								opts[i], re_err);
+				re = rspamd_regexp_new (opt_buf, NULL, &re_err);
 
-						g_error_free (re_err);
-					}
-					else {
-						rspamd_mempool_add_destructor (pool,
-								(rspamd_mempool_destruct_t)rspamd_regexp_unref,
-								re);
-						opt_match->data.re = re;
-						opt_match->type = RSPAMD_COMPOSITE_OPTION_RE;
-
-						DL_APPEND (atom->opts, opt_match);
-					}
+				if (re == NULL) {
+					msg_err_pool ("cannot create regexp from string %s: %e",
+							opt_buf, re_err);
+
+					g_error_free (re_err);
 				}
 				else {
-					/* Plain match */
-					opt_match->data.match = opts[i];
-					opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN;
+					rspamd_mempool_add_destructor (pool,
+							(rspamd_mempool_destruct_t)rspamd_regexp_unref,
+							re);
+					opt_match->data.re = re;
+					opt_match->type = RSPAMD_COMPOSITE_OPTION_RE;
 
 					DL_APPEND (atom->opts, opt_match);
 				}
+
+				if (*p == ',') {
+					p++;
+					state = comp_state_read_comma;
+				}
+				else {
+					state = comp_state_read_ebrace;
+				}
+			}
+			else {
+				p ++;
 			}
+			break;
+		case comp_state_read_comma:
+			if (!g_ascii_isspace (*p)) {
+				if (*p == '/') {
+					state = comp_state_read_regexp;
+					opt_start = p;
+				}
+				else if (*p == ']') {
+					state = comp_state_read_ebrace;
+				}
+				else {
+					opt_start = p;
+					state = comp_state_read_option;
+				}
+			}
+			else {
+				/* Skip spaces after comma */
+				p ++;
+			}
+			break;
+		case comp_state_read_ebrace:
+			p ++;
+			state = comp_state_read_end;
+			break;
+		case comp_state_read_end:
+			g_assert_not_reached ();
 		}
 	}
-	else {
-		atom->symbol = rspamd_mempool_alloc (pool, clen + 1);
-		rspamd_strlcpy (atom->symbol, line, clen + 1);
-	}
 
 	res->data = atom;
 


More information about the Commits mailing list