commit 8a9452e: [Minor] Fix `www.` prefix matcher

Vsevolod Stakhov vsevolod at rspamd.com
Thu Aug 24 15:28:03 UTC 2023


Author: Vsevolod Stakhov
Date: 2023-08-24 16:24:50 +0100
URL: https://github.com/rspamd/rspamd/commit/8a9452eb0ddfe9cabcfd79dbcb12dd03158c7116 (HEAD -> master)

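[Minor] Fix `www.` prefix matcher

The static `www.` matcher in src/libserver/url.c is changed from a plain
string to a regexp matcher (`www\.[0-9a-z]`, flagged with
URL_MATCHER_FLAG_REGEXP and added case-insensitively), so the prefix is only
recognised when it is immediately followed by an ASCII letter or digit.
The matcher flag macros are also renamed from URL_FLAG_* to
URL_MATCHER_FLAG_*.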

---
 src/libmime/scan_result.c |  1 +
 src/libmime/scan_result.h | 18 +++++++++++++++++-
 src/libserver/url.c       | 30 +++++++++++++++---------------
 3 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/src/libmime/scan_result.c b/src/libmime/scan_result.c
index 080fc0d51..e84d92494 100644
--- a/src/libmime/scan_result.c
+++ b/src/libmime/scan_result.c
@@ -52,6 +52,7 @@ rspamd_scan_result_dtor(gpointer d)
 			kh_destroy(rspamd_options_hash, sres->options);
 		}
 	});
+
 	kh_destroy(rspamd_symbols_hash, r->symbols);
 	kh_destroy(rspamd_symbols_group_hash, r->sym_groups);
 }
diff --git a/src/libmime/scan_result.h b/src/libmime/scan_result.h
index da3fb1608..46c2de8fa 100644
--- a/src/libmime/scan_result.h
+++ b/src/libmime/scan_result.h
@@ -1,3 +1,19 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 /**
  * @file scan_result.h
  * Scan result holder
@@ -43,7 +59,7 @@ struct rspamd_symbol_result {
 	gssize opts_len;           /**< total size of all options (negative if truncated option is added) */
 	guint nshots;
 	int flags;
-	struct rspamd_symbol_result *next;
+	struct rspamd_symbol_result *next; /**< for shadow results */
 };
 
 
diff --git a/src/libserver/url.c b/src/libserver/url.c
index ca0b2072e..0842a1ebd 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -38,10 +38,10 @@ typedef struct url_match_s {
 	gchar st;
 } url_match_t;
 
-#define URL_FLAG_NOHTML (1u << 0u)
-#define URL_FLAG_TLD_MATCH (1u << 1u)
-#define URL_FLAG_STAR_MATCH (1u << 2u)
-#define URL_FLAG_REGEXP (1u << 3u)
+#define URL_MATCHER_FLAG_NOHTML (1u << 0u)
+#define URL_MATCHER_FLAG_TLD_MATCH (1u << 1u)
+#define URL_MATCHER_FLAG_STAR_MATCH (1u << 2u)
+#define URL_MATCHER_FLAG_REGEXP (1u << 3u)
 
 struct url_callback_data;
 
@@ -163,8 +163,8 @@ struct url_matcher static_matchers[] = {
 	 0},
 	{"sip:", "", url_web_start, url_web_end,
 	 0},
-	{"www.", "http://", url_web_start, url_web_end,
-	 0},
+	{"www\\.[0-9a-z]", "http://", url_web_start, url_web_end,
+	 URL_MATCHER_FLAG_REGEXP},
 	{"ftp.", "ftp://", url_web_start, url_web_end,
 	 0},
 	/* Likely emails */
@@ -449,10 +449,10 @@ rspamd_url_parse_tld_file(const gchar *fname,
 			continue;
 		}
 
-		flags = URL_FLAG_NOHTML | URL_FLAG_TLD_MATCH;
+		flags = URL_MATCHER_FLAG_NOHTML | URL_MATCHER_FLAG_TLD_MATCH;
 
 		if (linebuf[0] == '*') {
-			flags |= URL_FLAG_STAR_MATCH;
+			flags |= URL_MATCHER_FLAG_STAR_MATCH;
 			p = strchr(linebuf, '.');
 
 			if (p == NULL) {
@@ -486,7 +486,7 @@ rspamd_url_add_static_matchers(struct url_match_scanner *sc)
 	gint n = G_N_ELEMENTS(static_matchers), i;
 
 	for (i = 0; i < n; i++) {
-		if (static_matchers[i].flags & URL_FLAG_REGEXP) {
+		if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) {
 			rspamd_multipattern_add_pattern(url_scanner->search_trie_strict,
 											static_matchers[i].pattern,
 											RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 |
@@ -503,7 +503,7 @@ rspamd_url_add_static_matchers(struct url_match_scanner *sc)
 
 	if (sc->matchers_full) {
 		for (i = 0; i < n; i++) {
-			if (static_matchers[i].flags & URL_FLAG_REGEXP) {
+			if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) {
 				rspamd_multipattern_add_pattern(url_scanner->search_trie_full,
 												static_matchers[i].pattern,
 												RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 |
@@ -1664,7 +1664,7 @@ rspamd_tld_trie_callback(struct rspamd_multipattern *mp,
 							 strnum);
 	ndots = 1;
 
-	if (matcher->flags & URL_FLAG_STAR_MATCH) {
+	if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) {
 		/* Skip one more tld component */
 		ndots++;
 	}
@@ -2595,7 +2595,7 @@ rspamd_tld_trie_find_callback(struct rspamd_multipattern *mp,
 	matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher,
 							 strnum);
 
-	if (matcher->flags & URL_FLAG_STAR_MATCH) {
+	if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) {
 		/* Skip one more tld component */
 		ndots = 2;
 	}
@@ -3107,7 +3107,7 @@ static gboolean
 rspamd_url_trie_is_match(struct url_matcher *matcher, const gchar *pos,
 						 const gchar *end, const gchar *newline_pos)
 {
-	if (matcher->flags & URL_FLAG_TLD_MATCH) {
+	if (matcher->flags & URL_MATCHER_FLAG_TLD_MATCH) {
 		/* Immediately check pos for valid chars */
 		if (pos < end) {
 			if (pos != newline_pos && !g_ascii_isspace(*pos) && *pos != '/' && *pos != '?' &&
@@ -3156,7 +3156,7 @@ rspamd_url_trie_callback(struct rspamd_multipattern *mp,
 	matcher = &g_array_index(cb->matchers, struct url_matcher,
 							 strnum);
 
-	if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
+	if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
 		/* Do not try to match non-html like urls in html texts */
 		return 0;
 	}
@@ -3313,7 +3313,7 @@ rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp,
 							 strnum);
 	pool = cb->pool;
 
-	if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
+	if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
 		/* Do not try to match non-html like urls in html texts, continue matching */
 		return 0;
 	}


More information about the Commits mailing list