commit d01de01: [Fix] One more fix to skip images that are not urls

Fri May 1 12:14:10 UTC 2020

Author: Vsevolod Stakhov
Date: 2020-05-01 13:12:11 +0100
URL: https://github.com/rspamd/rspamd/commit/d01de01be22e81accd84a7f346b27269c3c91990 (HEAD -> master)

[Fix] One more fix to skip images that are not urls

---
 src/libserver/html.c | 17 ++++++++++-------
 src/libserver/url.c  |  9 ++++++++-
 src/libserver/url.h  |  1 +
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 80fa3479b..f8c43bdd5 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -191,8 +191,7 @@ khash_t(color_by_name) *html_color_by_name;
 
 static struct rspamd_url *rspamd_html_process_url (rspamd_mempool_t *pool,
 												   const gchar *start, guint len,
-												   struct html_tag_component *comp,
-												   bool is_image);
+												   struct html_tag_component *comp);
 
 static void
 rspamd_html_library_init (void)
@@ -1362,7 +1361,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
 
 struct rspamd_url *
 rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
-		struct html_tag_component *comp, bool is_image)
+		struct html_tag_component *comp)
 {
 	struct rspamd_url *url;
 	guint saved_flags = 0;
@@ -1506,8 +1505,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 		}
 	}
 
-	rc = rspamd_url_parse (url, decoded, dlen, pool,
-			is_image ? RSPAMD_URL_PARSE_TEXT :RSPAMD_URL_PARSE_HREF);
+	rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
 
 	/* Filter some completely damaged urls */
 	if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
@@ -1520,6 +1518,11 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 
 		if (no_prefix) {
 			url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+
+			if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
+				/* Ignore URLs that have neither a schema nor a TLD */
+				return NULL;
+			}
 		}
 
 		decoded = url->string;
@@ -1606,7 +1609,7 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 				}
 			}
 
-			url = rspamd_html_process_url (pool, start, len, comp, false);
+			url = rspamd_html_process_url (pool, start, len, comp);
 
 			if (url && tag->extra == NULL) {
 				tag->extra = url;
@@ -1771,7 +1774,7 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 					if (img->src) {
 
 						img->url = rspamd_html_process_url (pool,
-								img->src, fstr.len, NULL, true);
+								img->src, fstr.len, NULL);
 
 						if (img->url) {
 							img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 195727c13..a47d732f7 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -240,7 +240,8 @@ struct rspamd_url_flag_name {
 		{"url_displayed", RSPAMD_URL_FLAG_DISPLAY_URL, -1},
 		{"image", RSPAMD_URL_FLAG_IMAGE, -1},
 		{"query", RSPAMD_URL_FLAG_QUERY, -1},
-		{"content", RSPAMD_URL_FLAG_CONTENT, -1}
+		{"content", RSPAMD_URL_FLAG_CONTENT, -1},
+		{"no_tld", RSPAMD_URL_FLAG_NO_TLD, -1},
 };
 
 
@@ -2348,6 +2349,12 @@ rspamd_url_parse (struct rspamd_url *uri,
 					uri->tldshift = uri->hostshift;
 					uri->tldlen = uri->hostlen;
 				}
+				else if (uri->flags & RSPAMD_URL_FLAG_SCHEMALESS) {
+					/* Ignore URLs that have neither a schema nor a TLD */
+					return URI_ERRNO_TLD_MISSING;
+				}
+
+				uri->flags |= RSPAMD_URL_FLAG_NO_TLD;
 			}
 		}
 
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 2a5892fc5..7fddd07ef 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -37,6 +37,7 @@ enum rspamd_url_flags {
 	RSPAMD_URL_FLAG_IMAGE = 1u << 19u,
 	RSPAMD_URL_FLAG_QUERY = 1u << 20u,
 	RSPAMD_URL_FLAG_CONTENT = 1u << 21u,
+	RSPAMD_URL_FLAG_NO_TLD = 1u << 22u,
 };
 
 struct rspamd_url_tag {