commit db4ef54: [Minor] Fix parsing of some bogus urls
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed May 12 13:42:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-12 14:39:09 +0100
URL: https://github.com/rspamd/rspamd/commit/db4ef545172db3417684695df1ecca3de001c93f (HEAD -> master)
[Minor] Fix parsing of some bogus urls
---
src/libserver/url.c | 29 +++++++++++++++++++++++++++--
test/lua/unit/url.lua | 3 +++
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/src/libserver/url.c b/src/libserver/url.c
index eb663519d..8a33b4915 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1113,10 +1113,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
if (t != '/' && t != '\\') {
c = p;
- st = parse_domain_start;
slash = p;
+ st = parse_domain_start;
+
+ /*
+ * Unfortunately, due to brain damage of the RFC 3986 authors,
+ * we have to distinguish two possibilities here:
+ * authority = [ userinfo "@" ] host [ ":" port ]
+ * So if we have @ somewhere before hostname then we must process
+ * with the username state. Otherwise, we have to process via
+ * the hostname state. Unfortunately, there is no way to distinguish
+ * them aside of running NFA or two DFA or performing lookahead.
+ * Lookahead approach looks easier to implement.
+ */
+
+ const char *tp = p;
+ while (tp < last) {
+ if (*tp == '@') {
+ user_seen = TRUE;
+ st = parse_user;
+ break;
+ }
+ else if (*tp == '/' || *tp == '#' || *tp == '?') {
+ st = parse_domain_start;
+ }
+
+ tp ++;
+ }
- if (*p == '[') {
+ if (st == parse_domain_start && *p == '[') {
st = parse_ipv6;
p++;
c = p;
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua
index 97eda91c6..40d684bfc 100644
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -133,6 +133,9 @@ context("URL check functions", function()
{"http://hehe。example。com#test", true, {
host = 'hehe.example.com', fragment = 'test'
}},
+ {"http:////$%^&****((@example.org//#f@f", true, {
+ user = '$%^&****((', host = 'example.org', fragment = 'f@f'
+ }},
}
-- Some cases from https://code.google.com/p/google-url/source/browse/trunk/src/url_canon_unittest.cc
More information about the Commits
mailing list