commit 7085305: [Fix] Find suspicious url encodings that could break url extraction

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Nov 17 21:56:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-11-17 21:48:00 +0000
URL: https://github.com/rspamd/rspamd/commit/708530514c4656b84bdbcf700084de011b8aaa04

[Fix] Find suspicious url encodings that could break url extraction

---
 src/libserver/html/html_url.cxx | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx
index d073a25d9..8f721b3e8 100644
--- a/src/libserver/html/html_url.cxx
+++ b/src/libserver/html/html_url.cxx
@@ -376,6 +376,8 @@ html_process_url(rspamd_mempool_t *pool, std::string_view &input)
 	/*
 	 * We also need to remove all internal newlines, spaces
 	 * and encode unsafe characters
+	 * Another obfuscation found in the wild is percent-encoding of SAFE url
+	 * characters, including essential delimiters (e.g. '@', ':', '/', '?')
 	 */
 	for (auto i = 0; i < sz; i++) {
 		if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
@@ -388,6 +390,43 @@ html_process_url(rspamd_mempool_t *pool, std::string_view &input)
 			*d++ = hexdigests[s[i] & 0xf];
 			has_bad_chars = TRUE;
 		}
+		else if (G_UNLIKELY (s[i] == '%')) {
+			if (i + 2 < sz) {
+				auto [c1, c2] = std::tuple(s[i + 1], s[i + 2]);
+
+				if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
+					auto codepoint = 0;
+
+					if      (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
+					else if (c1 >= 'A' && c1 <= 'F') codepoint = c1 - 'A' + 10;
+					else if (c1 >= 'a' && c1 <= 'f') codepoint = c1 - 'a' + 10;
+
+					codepoint <<= 4;
+
+					if      (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
+					else if (c2 >= 'A' && c2 <= 'F') codepoint += c2 - 'A' + 10;
+					else if (c2 >= 'a' && c2 <= 'f') codepoint += c2 - 'a' + 10;
+
+					/* Now check for 'interesting' codepoints */
+					if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
+						codepoint == '?' || codepoint == '\\' || codepoint == '/') {
+						/* Replace it back */
+						*d++ = (char)(codepoint & 0xff);
+						i += 2;
+						has_bad_chars = TRUE;
+					}
+					else {
+						*d++ = s[i];
+					}
+				}
+				else {
+					*d++ = s[i];
+				}
+			}
+			else {
+				*d++ = s[i];
+			}
+		}
 		else {
 			*d++ = s[i];
 		}


More information about the Commits mailing list