commit f1e6d84: [Minor] More heuristics in HTML urls detection

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Feb 21 13:49:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-02-21 13:42:05 +0000
URL: https://github.com/rspamd/rspamd/commit/f1e6d84387a1faf4048b2faf5dc268bc255b1152 (HEAD -> master)

[Minor] More heuristics in HTML urls detection

---
 src/libserver/html.c | 72 ++++++++++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 33 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 8ade5a61e..3353db7b7 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1301,7 +1301,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 	gchar *decoded;
 	gint rc;
 	gsize decoded_len;
-	const gchar *p, *s;
+	const gchar *p, *s, *prefix = "http://";
 	gchar *d;
 	guint i, dlen;
 	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
@@ -1347,44 +1347,50 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 	}
 
 	if (rspamd_substring_search (start, len, "://", 3) == -1) {
-		/* We have no prefix */
-		dlen += sizeof ("http://") - 1;
-		no_prefix = TRUE;
+		if (len >= sizeof ("mailto:") &&
+				(memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
+						memcmp (start, "tel:", sizeof ("tel:") - 1) == 0)) {
+			/* Exclusion, has valid but 'strange' prefix */
+		}
+		else {
+			for (i = 0; i < len; i ++) {
+				if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
+					if (i == 0 && len > 2 && s[i] == '/'  && s[i + 1] == '/') {
+						prefix = "http:";
+						dlen += sizeof ("http:") - 1;
+						no_prefix = TRUE;
+					}
+					else if (s[i] == '@') {
+						/* Likely email prefix */
+						prefix = "mailto://";
+						dlen += sizeof ("mailto://") - 1;
+						no_prefix = TRUE;
+					}
+					else {
+						if (i == 0) {
+							/* No valid data */
+							return NULL;
+						}
+					}
+
+					break;
+				}
+			}
+
+			if (!no_prefix) {
+				no_prefix = TRUE;
+				dlen += strlen (prefix);
+			}
+		}
 	}
 
 	decoded = rspamd_mempool_alloc (pool, dlen + 1);
 	d = decoded;
 
 	if (no_prefix) {
-		if (s[0] == '/' && (len > 2 && s[1] == '/')) {
-			/* //bla case */
-			memcpy (d, "http:", sizeof ("http:") - 1);
-			d += sizeof ("http:") - 1;
-		}
-		else if (s[0] == '\\' && (len > 2 && s[1] == '\\')) {
-			/* Likely SMB share, ignore */
-			return NULL;
-		}
-		else {
-			if (s[0] == '.') {
-				/*
-				 * We have relative URL without base URL:
-				 * the former is covered by caller function which
-				 * checks for the base URL.
-				 *
-				 * In the most cases, it is caused by a broken client
-				 */
-				return NULL;
-			}
-			else if ((s[0] & 0x80) || g_ascii_isalnum (s[0])) {
-				memcpy (d, "http://", sizeof ("http://") - 1);
-				d += sizeof ("http://") - 1;
-			}
-			else {
-				/* Some crap */
-				return NULL;
-			}
-		}
+		gsize plen = strlen (prefix);
+		memcpy (d, prefix, plen);
+		d += plen;
 	}
 
 	/*


More information about the Commits mailing list