commit a4977e1: [Rework] Fix various url extraction issues
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Mar 19 18:35:06 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-03-19 18:33:16 +0000
URL: https://github.com/rspamd/rspamd/commit/a4977e18de67905b2863514feeff7a77025d4087 (HEAD -> master)
[Rework] Fix various url extraction issues
---
src/libserver/url.c | 14 ++++++++------
src/libutil/multipattern.c | 17 +++++++++--------
test/lua/unit/url.lua | 4 +++-
3 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/src/libserver/url.c b/src/libserver/url.c
index ff8c30819..2e0991406 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -496,12 +496,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
- RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+ RSPAMD_MULTIPATTERN_RE);
}
else {
rspamd_multipattern_add_pattern (url_scanner->search_trie_strict,
static_matchers[i].pattern,
- RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
}
}
@@ -513,12 +513,12 @@ rspamd_url_add_static_matchers (struct url_match_scanner *sc)
rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
static_matchers[i].pattern,
RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
- RSPAMD_MULTIPATTERN_RE|RSPAMD_MULTIPATTERN_TLD);
+ RSPAMD_MULTIPATTERN_RE);
}
else {
rspamd_multipattern_add_pattern (url_scanner->search_trie_full,
static_matchers[i].pattern,
- RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|RSPAMD_MULTIPATTERN_TLD);
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
}
}
g_array_append_vals (sc->matchers_full, static_matchers, n);
@@ -558,14 +558,14 @@ rspamd_url_init (const gchar *tld_file)
sizeof (struct url_matcher), G_N_ELEMENTS (static_matchers));
url_scanner->search_trie_strict = rspamd_multipattern_create_sized (
G_N_ELEMENTS (static_matchers),
- RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
if (tld_file) {
/* Reserve larger multipattern */
url_scanner->matchers_full = g_array_sized_new (FALSE, TRUE,
sizeof (struct url_matcher), 13000);
url_scanner->search_trie_full = rspamd_multipattern_create_sized (13000,
- RSPAMD_MULTIPATTERN_TLD|RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
}
else {
url_scanner->matchers_full = NULL;
@@ -3173,6 +3173,8 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
}
else {
cb->url_str = NULL;
+ /* Continue search if no pattern has been found */
+ return 0;
}
/* Continue search if required (return 0 means continue) */
diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c
index f1295a9e4..547762d26 100644
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -103,7 +103,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
/*
* We understand the following cases
* 1) blah -> .blah\b
- * 2) *.blah -> ..*\\.blah\b
+ * 2) *.blah -> ..*\\.blah\b|$
* 3) ???
*/
@@ -127,7 +127,7 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
len = slen + strlen (prefix);
}
- suffix = "\\b";
+ suffix = "(:?\\b|$)";
len += strlen (suffix);
res = g_malloc (len + 1);
@@ -329,26 +329,27 @@ rspamd_multipattern_add_pattern_len (struct rspamd_multipattern *mp,
if (rspamd_hs_check ()) {
gchar *np;
gint fl = HS_FLAG_SOM_LEFTMOST;
+ gint adjusted_flags = mp->flags | flags;
- if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) {
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) {
fl |= HS_FLAG_CASELESS;
}
- if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) {
- if (mp->flags & RSPAMD_MULTIPATTERN_TLD) {
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) {
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) {
fl |= HS_FLAG_UTF8;
}
else {
fl |= HS_FLAG_UTF8 | HS_FLAG_UCP;
}
}
- if (mp->flags & RSPAMD_MULTIPATTERN_DOTALL) {
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) {
fl |= HS_FLAG_DOTALL;
}
- if (mp->flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) {
fl |= HS_FLAG_SINGLEMATCH;
fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */
}
- if (mp->flags & RSPAMD_MULTIPATTERN_NO_START) {
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
fl &= ~HS_FLAG_SOM_LEFTMOST;
}
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua
index 24c354960..9647db79b 100644
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -83,7 +83,9 @@ context("URL check functions", function()
{"http:/\\www.google.com/foo?bar=baz#", true, {
host = 'www.google.com', path = 'foo', query = 'bar=baz', tld = 'google.com'
}},
- {"http://[www.google.com]/", false},
+ {"http://[www.google.com]/", true, {
+ host = 'www.google.com',
+ }},
{"<test.com", true, {
host = 'test.com', tld = 'test.com',
}},
More information about the Commits mailing list