commit db4ef54: [Minor] Fix parsing of some bogus urls

Vsevolod Stakhov vsevolod at highsecure.ru
Wed May 12 13:42:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-12 14:39:09 +0100
URL: https://github.com/rspamd/rspamd/commit/db4ef545172db3417684695df1ecca3de001c93f (HEAD -> master)

[Minor] Fix parsing of some bogus urls

---
 src/libserver/url.c   | 29 +++++++++++++++++++++++++++--
 test/lua/unit/url.lua |  3 +++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/libserver/url.c b/src/libserver/url.c
index eb663519d..8a33b4915 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1113,10 +1113,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 
 			if (t != '/' && t != '\\') {
 				c = p;
-				st = parse_domain_start;
 				slash = p;
+				st = parse_domain_start;
+
+				/*
+				 * Unfortunately, due to brain damage of the RFC 3986 authors,
+				 * we have to distinguish two possibilities here:
+				 * authority = [ userinfo "@" ] host [ ":" port ]
+				 * So if we have @ somewhere before hostname then we must process
+				 * with the username state. Otherwise, we have to process via
+				 * the hostname state. Unfortunately, there is no way to distinguish
+				 * them aside of running NFA or two DFA or performing lookahead.
+				 * Lookahead approach looks easier to implement.
+				 */
+
+				const char *tp = p;
+				while (tp < last) {
+					if (*tp == '@') {
+						user_seen = TRUE;
+						st = parse_user;
+						break;
+					}
+					else if (*tp == '/' || *tp == '#' || *tp == '?') {
+						st = parse_domain_start;
+					}
+
+					tp ++;
+				}
 
-				if (*p == '[') {
+				if (st == parse_domain_start && *p == '[') {
 					st = parse_ipv6;
 					p++;
 					c = p;
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua
index 97eda91c6..40d684bfc 100644
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -133,6 +133,9 @@ context("URL check functions", function()
     {"http://hehe。example。com#test", true, {
       host = 'hehe.example.com', fragment = 'test'
     }},
+    {"http:////$%^&****((@example.org//#f@f", true, {
+      user = '$%^&****((', host = 'example.org', fragment = 'f@f'
+    }},
   }
 
   -- Some cases from https://code.google.com/p/google-url/source/browse/trunk/src/url_canon_unittest.cc


More information about the Commits mailing list