commit 882ea33: [Rework] Html: Further html urls rework

Thu May 27 14:07:07 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-05-25 12:15:30 +0100
URL: https://github.com/rspamd/rspamd/commit/882ea3395af8bfb9929ff5308ecc8cfd959fd761

[Rework] Html: Further html urls rework

---
 src/libserver/html/html.cxx     | 403 +++++++---------------------------------
 src/libserver/html/html.h       |  19 +-
 src/libserver/html/html_url.cxx | 220 +++++++++++++++++++++-
 src/libserver/html/html_url.hxx |  24 +++
 4 files changed, 322 insertions(+), 344 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index c167b004f..c384a9023 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -30,6 +30,7 @@
 #include "html_tag_defs.hxx"
 #include "html_entities.hxx"
 #include "html_tag.hxx"
+#include "html_url.hxx"
 
 #include <vector>
 #include <frozen/unordered_map.h>
@@ -633,273 +634,76 @@ parse_tag_content(rspamd_mempool_t *pool,
 	parser_env.cur_state = state;
 }
 
-}
-
-/* Unconverted C part */
-
-static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
-												  const gchar *start, guint len,
-												  struct html_tag_component *comp);
-
-
-
-
-struct rspamd_url *
-rspamd_html_process_url(rspamd_mempool_t *pool, const gchar *start, guint len,
-						struct html_tag_component *comp) {
-	struct rspamd_url *url;
-	guint saved_flags = 0;
-	gchar *decoded;
-	gint rc;
-	gsize decoded_len;
-	const gchar *p, *s, *prefix = "http://";
-	gchar *d;
-	guint i;
-	gsize dlen;
-	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
-	static const gchar hexdigests[] = "0123456789abcdef";
-
-	p = start;
-
-	/* Strip spaces from the url */
-	/* Head spaces */
-	while (p < start + len && g_ascii_isspace (*p)) {
-		p++;
-		start++;
-		len--;
-	}
-
-	if (comp) {
-		comp->start = (guchar *)p;
-		comp->len = len;
-	}
-
-	/* Trailing spaces */
-	p = start + len - 1;
-
-	while (p >= start && g_ascii_isspace (*p)) {
-		p--;
-		len--;
-
-		if (comp) {
-			comp->len--;
-		}
-	}
+/*
+ * Run html_process_url() on the tag's href, resolving relative hrefs against
+ * hc->base_url when one is set; a parsed url is stored in tag->extra if unset.
+ */
+static auto
+html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					 struct html_content *hc) -> std::optional<struct rspamd_url *>
+{
+	auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
 
-	s = start;
-	dlen = 0;
+	if (found_href_it != tag->parameters.end()) {
+		/* Check base url */
+		auto &href_value = found_href_it->second;
 
-	for (i = 0; i < len; i++) {
-		if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
-			dlen += 3;
-		}
-		else {
-			dlen++;
-		}
-	}
+		if (hc && hc->base_url && href_value.size() > 2) {
+			/*
+			 * Hrefs without `://` are appended to the base url, except `data:` ones
+			 */
 
-	if (rspamd_substring_search(start, len, "://", 3) == -1) {
-		if (len >= sizeof("mailto:") &&
-			(memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
-			 memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
-			 memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
-			/* Exclusion, has valid but 'strange' prefix */
-		}
-		else {
-			for (i = 0; i < len; i++) {
-				if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
-					if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
-						prefix = "http:";
-						dlen += sizeof("http:") - 1;
-						no_prefix = TRUE;
-					}
-					else if (s[i] == '@') {
-						/* Likely email prefix */
-						prefix = "mailto://";
-						dlen += sizeof("mailto://") - 1;
-						no_prefix = TRUE;
-					}
-					else if (s[i] == ':' && i != 0) {
-						/* Special case */
-						no_prefix = FALSE;
-					}
-					else {
-						if (i == 0) {
-							/* No valid data */
-							return NULL;
-						}
-						else {
-							no_prefix = TRUE;
-							dlen += strlen(prefix);
-						}
-					}
+			if (rspamd_substring_search(href_value.data(), href_value.size(), "://", 3) == -1) {
 
-					break;
+				if (href_value.size() >= sizeof("data:") &&
+					g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
+					/* Image data url, never insert as url */
+					return std::nullopt;
 				}
-			}
-		}
-	}
-
-	decoded = (char *)rspamd_mempool_alloc (pool, dlen + 1);
-	d = decoded;
-
-	if (no_prefix) {
-		gsize plen = strlen(prefix);
-		memcpy(d, prefix, plen);
-		d += plen;
-	}
-
-	/*
-	 * We also need to remove all internal newlines, spaces
-	 * and encode unsafe characters
-	 */
-	for (i = 0; i < len; i++) {
-		if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
-			continue;
-		}
-		else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
-			/* URL encode */
-			*d++ = '%';
-			*d++ = hexdigests[(s[i] >> 4) & 0xf];
-			*d++ = hexdigests[s[i] & 0xf];
-			has_bad_chars = TRUE;
-		}
-		else {
-			*d++ = s[i];
-		}
-	}
-
-	*d = '\0';
-	dlen = d - decoded;
 
-	url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+				/* Assume relative url */
+				auto need_slash = false;
 
-	rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
+				auto orig_len = href_value.size();
+				auto len = orig_len + hc->base_url->urllen;
 
-	rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
-	/* Filter some completely damaged urls */
-	if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
-		!((url->protocol & PROTOCOL_UNKNOWN))) {
-		url->flags |= saved_flags;
-
-		if (has_bad_chars) {
-			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
-		}
-
-		if (no_prefix) {
-			url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+				if (hc->base_url->datalen == 0) {
+					need_slash = true;
+					len++;
+				}
 
-			if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
-				/* Ignore urls with both no schema and no tld */
-				return NULL;
+				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+				auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1,
+						"%*s%s%*s",
+						hc->base_url->urllen, hc->base_url->string,
+						need_slash ? "/" : "",
+						(gint) orig_len, href_value.data());
+				href_value = {buf, nlen};
+			}
+			else if (href_value[0] == '/' && href_value[1] != '/') {
+				/* Relative to the hostname */
+				auto orig_len = href_value.size();
+				auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
+					   4 /* for :// and the '/' emitted by the format below */;
+				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+				auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
+						hc->base_url->protocollen, hc->base_url->string,
+						hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
+						(gint)orig_len, href_value.data());
+				href_value = {buf, nlen};
 			}
 		}
 
-		decoded = url->string;
-		decoded_len = url->urllen;
+		auto url = html_process_url(pool, href_value);
 
-		if (comp) {
-			comp->start = (guchar *)decoded;
-			comp->len = decoded_len;
-		}
-		/* Spaces in href usually mean an attempt to obfuscate URL */
-		/* See https://github.com/vstakhov/rspamd/issues/593 */
-#if 0
-		if (has_spaces) {
-			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+		if (url && tag->extra == nullptr) {
+			tag->extra = url.value();
 		}
-#endif
 
 		return url;
 	}
 
-	return NULL;
-}
-
-static struct rspamd_url *
-rspamd_html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag,
-							struct html_content *hc) {
-	struct html_tag_component *comp;
-	GList *cur;
-	struct rspamd_url *url;
-	const gchar *start;
-	gsize len;
-
-	cur = tag->params->head;
-
-	while (cur) {
-		comp = (struct html_tag_component *)cur->data;
-
-		if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
-			start = (char *)comp->start;
-			len = comp->len;
-
-			/* Check base url */
-			if (hc && hc->base_url && comp->len > 2) {
-				/*
-				 * Relative url cannot start from the following:
-				 * schema://
-				 * data:
-				 * slash
-				 */
-				gchar *buf;
-				gsize orig_len;
-
-				if (rspamd_substring_search(start, len, "://", 3) == -1) {
-
-					if (len >= sizeof("data:") &&
-						g_ascii_strncasecmp(start, "data:", sizeof("data:") - 1) == 0) {
-						/* Image data url, never insert as url */
-						return NULL;
-					}
-
-					/* Assume relative url */
-
-					gboolean need_slash = FALSE;
-
-					orig_len = len;
-					len += hc->base_url->urllen;
-
-					if (hc->base_url->datalen == 0) {
-						need_slash = TRUE;
-						len++;
-					}
-
-					buf = (char *)rspamd_mempool_alloc (pool, len + 1);
-					rspamd_snprintf(buf, len + 1, "%*s%s%*s",
-							hc->base_url->urllen, hc->base_url->string,
-							need_slash ? "/" : "",
-							(gint) orig_len, start);
-					start = buf;
-				}
-				else if (start[0] == '/' && start[1] != '/') {
-					/* Relative to the hostname */
-					orig_len = len;
-					len += hc->base_url->hostlen + hc->base_url->protocollen +
-						   3 /* for :// */;
-					buf = (char *)rspamd_mempool_alloc (pool, len + 1);
-					rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
-							hc->base_url->protocollen, hc->base_url->string,
-							hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
-							(gint) orig_len, start);
-					start = buf;
-				}
-			}
-
-			url = rspamd_html_process_url(pool, start, len, comp);
-
-			if (url && tag->extra == NULL) {
-				tag->extra = url;
-			}
-
-			return url;
-		}
-
-		cur = g_list_next (cur);
-	}
-
-	return NULL;
+	return std::nullopt;
 }
 
 struct rspamd_html_url_query_cbd {
@@ -910,8 +714,9 @@ struct rspamd_html_url_query_cbd {
 };
 
 static gboolean
-rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
-							   gsize end_offset, gpointer ud) {
+html_url_query_callback(struct rspamd_url *url, gsize start_offset,
+							   gsize end_offset, gpointer ud)
+{
 	struct rspamd_html_url_query_cbd *cbd =
 			(struct rspamd_html_url_query_cbd *) ud;
 	rspamd_mempool_t *pool;
@@ -939,9 +744,10 @@ rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
 }
 
 static void
-rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
-						khash_t (rspamd_url_hash) *url_set,
-						GPtrArray *part_urls) {
+process_html_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
+					   khash_t (rspamd_url_hash) *url_set,
+					   GPtrArray *part_urls)
+{
 	if (url->querylen > 0) {
 		struct rspamd_html_url_query_cbd qcbd;
 
@@ -953,7 +759,7 @@ rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
 		rspamd_url_find_multiple(pool,
 				rspamd_url_query_unsafe (url), url->querylen,
 				RSPAMD_URL_FIND_ALL, NULL,
-				rspamd_html_url_query_callback, &qcbd);
+				html_url_query_callback, &qcbd);
 	}
 
 	if (part_urls) {
@@ -1013,10 +819,12 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
 }
 
 static void
-rspamd_html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
-							struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
-							GPtrArray *part_urls,
-							GByteArray *dest) {
+html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					 struct html_content *hc,
+					 khash_t (rspamd_url_hash) *url_set,
+					 GPtrArray *part_urls,
+					 GByteArray *dest)
+{
 	struct html_tag_component *comp;
 	struct html_image *img;
 	rspamd_ftok_t fstr;
@@ -1205,6 +1013,10 @@ rspamd_html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	}
 }
 
+}
+
+/* Unconverted C part */
+
 static void
 rspamd_html_process_color(const gchar *line, guint len, struct html_color *cl)
 {
@@ -1764,80 +1576,7 @@ rspamd_html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	tag->extra = bl;
 }
 
-static void
-rspamd_html_check_displayed_url(rspamd_mempool_t *pool,
-								GList **exceptions,
-								khash_t (rspamd_url_hash) *url_set,
-								GByteArray *dest,
-								gint href_offset,
-								struct rspamd_url *url) {
-	struct rspamd_url *displayed_url = NULL;
-	struct rspamd_url *turl;
-	gboolean url_found = FALSE;
-	struct rspamd_process_exception *ex;
-	guint saved_flags = 0;
-	gsize dlen;
-
-	if (href_offset < 0) {
-		/* No dispalyed url, just some text within <a> tag */
-		return;
-	}
-
-	url->visible_part = (gchar *)rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
-	rspamd_strlcpy(url->visible_part,
-			reinterpret_cast<const gchar *>(dest->data + href_offset),
-			dest->len - href_offset + 1);
-	dlen = dest->len - href_offset;
-
-	/* Strip unicode spaces from the start and the end */
-	url->visible_part = rspamd_string_unicode_trim_inplace(url->visible_part,
-			&dlen);
-	rspamd_html_url_is_phished(pool, url,
-			reinterpret_cast<const guchar *>(url->visible_part),
-			dlen,
-			&url_found, &displayed_url);
-
-	if (url_found) {
-		url->flags |= saved_flags | RSPAMD_URL_FLAG_DISPLAY_URL;
-	}
-
-	if (exceptions && url_found) {
-		ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
-		ex->pos = href_offset;
-		ex->len = dest->len - href_offset;
-		ex->type = RSPAMD_EXCEPTION_URL;
-		ex->ptr = url;
-
-		*exceptions = g_list_prepend(*exceptions,
-				ex);
-	}
-
-	if (displayed_url && url_set) {
-		turl = rspamd_url_set_add_or_return(url_set,
-				displayed_url);
 
-		if (turl != NULL) {
-			/* Here, we assume the following:
-			 * if we have a URL in the text part which
-			 * is the same as displayed URL in the
-			 * HTML part, we assume that it is also
-			 * hint only.
-			 */
-			if (turl->flags &
-				RSPAMD_URL_FLAG_FROM_TEXT) {
-				turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
-				turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
-			}
-
-			turl->count++;
-		}
-		else {
-			/* Already inserted by `rspamd_url_set_add_or_return` */
-		}
-	}
-
-	rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
-}
 
 static gboolean
 rspamd_html_propagate_lengths(GNode *node, gpointer _unused) {
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 14217b2c9..afa46eb06 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -46,7 +46,6 @@ extern "C" {
 
 
 struct rspamd_image;
-struct html_tag;
 
 struct html_image {
 	guint height;
@@ -55,7 +54,7 @@ struct html_image {
 	gchar *src;
 	struct rspamd_url *url;
 	struct rspamd_image *embedded_image;
-	struct html_tag *tag;
+	void *tag;
 };
 
 struct html_color {
@@ -79,7 +78,7 @@ struct html_color {
 };
 
 struct html_block {
-	struct html_tag *tag;
+	void *tag;
 	struct html_color font_color;
 	struct html_color background_color;
 	//struct html_tag_component style;
@@ -101,8 +100,6 @@ struct html_block {
 #define FL_HREF         (1 << 29)
 #define FL_IMAGE        (1 << 30)
 
-
-
 /* Forwarded declaration */
 struct rspamd_task;
 
@@ -122,13 +119,13 @@ struct html_content {
 /*
  * Decode HTML entitles in text. Text is modified in place.
  */
-guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
+guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
 
-GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
+GByteArray *rspamd_html_process_part(rspamd_mempool_t *pool,
 									  struct html_content *hc,
 									  GByteArray *in);
 
-GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
+GByteArray *rspamd_html_process_part_full(rspamd_mempool_t *pool,
 										   struct html_content *hc,
 										   GByteArray *in, GList **exceptions,
 										   khash_t (rspamd_url_hash) *url_set,
@@ -138,21 +135,21 @@ GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
 /*
  * Returns true if a specified tag has been seen in a part
  */
-gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
+gboolean rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname);
 
 /**
  * Returns name for the specified tag id
  * @param id
  * @return
  */
-const gchar *rspamd_html_tag_by_id (gint id);
+const gchar *rspamd_html_tag_by_id(gint id);
 
 /**
  * Returns HTML tag id by name
  * @param name
  * @return
  */
-gint rspamd_html_tag_by_name (const gchar *name);
+gint rspamd_html_tag_by_name(const gchar *name);
 
 /**
  * Extract URL from HTML tag component and sets component elements if needed
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx
index 93728119b..5c4fb8d56 100644
--- a/src/libserver/html/html_url.cxx
+++ b/src/libserver/html/html_url.cxx
@@ -18,6 +18,7 @@
 #include "libutil/str_util.h"
 #include "libserver/url.h"
 #include "libserver/logger.h"
+#include "rspamd.h"
 
 #include <unicode/idna.h>
 
@@ -137,7 +138,7 @@ html_url_is_phished(rspamd_mempool_t *pool,
 	if (text_data.size() > 4 &&
 		rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
 				RSPAMD_URL_FIND_ALL,
-				&url_pos, NULL) && url_str != NULL) {
+				&url_pos, NULL) && url_str != nullptr) {
 
 		text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
 		auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
@@ -197,4 +198,221 @@ html_url_is_phished(rspamd_mempool_t *pool,
 	return std::nullopt;
 }
 
+/*
+ * Copy the text shown inside an <a> element into url->visible_part, pass it to
+ * html_url_is_phished() and, if that yields a displayed url, add a text
+ * exception (when `exceptions` is set) and add that url to `url_set` (when set).
+ */
+void
+html_check_displayed_url(rspamd_mempool_t *pool, GList **exceptions,
+						 void *url_set, std::string_view visible_part,
+						 goffset href_offset, struct rspamd_url *url)
+{
+	struct rspamd_url *displayed_url = nullptr;
+	struct rspamd_url *turl;
+	struct rspamd_process_exception *ex;
+	gsize dlen;
+
+	if (visible_part.empty()) {
+		/* No displayed url, just some text within <a> tag */
+		return;
+	}
+
+	url->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
+	/* The size argument counts the trailing '\0'; without `+ 1` the last byte is cut */
+	rspamd_strlcpy(url->visible_part, visible_part.data(),
+			visible_part.size() + 1);
+	dlen = visible_part.size();
+
+	/* Strip unicode spaces from the start and the end */
+	url->visible_part = const_cast<char *>(
+			rspamd_string_unicode_trim_inplace(url->visible_part,
+			&dlen));
+	auto maybe_url = html_url_is_phished(pool, url,
+			{url->visible_part, dlen});
+
+	if (maybe_url) {
+		url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+		displayed_url = maybe_url.value();
+	}
+
+	if (exceptions && displayed_url != nullptr) {
+		ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
+		ex->pos = href_offset;
+		ex->len = dlen;
+		ex->type = RSPAMD_EXCEPTION_URL;
+		ex->ptr = url;
+
+		*exceptions = g_list_prepend(*exceptions, ex);
+	}
+
+	if (displayed_url && url_set) {
+		turl = rspamd_url_set_add_or_return((khash_t (rspamd_url_hash) *)url_set, displayed_url);
+
+		if (turl != nullptr) {
+			/* Here, we assume the following:
+			 * if we have a URL in the text part which
+			 * is the same as displayed URL in the
+			 * HTML part, we assume that it is also
+			 * hint only.
+			 */
+			if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+				turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+				turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+			}
+
+			turl->count++;
+		}
+		else {
+			/* Already inserted by `rspamd_url_set_add_or_return` */
+		}
+	}
+
+	rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
+}
+
+auto
+html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+	-> std::optional<struct rspamd_url *>
+{
+	struct rspamd_url *url;
+	guint saved_flags = 0;
+	gint rc;
+	const gchar *s, *prefix = "http://";
+	gchar *d;
+	gsize dlen;
+	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
+	static const gchar hexdigests[] = "0123456789abcdef";
+
+	auto sz = input.length();
+	const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
+	input = {trimmed, sz};
+
+	const auto *start = input.data();
+	s = start;
+	dlen = 0;
+
+	for (auto i = 0; i < sz; i++) {
+		if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+			dlen += 3;
+		}
+		else {
*** OUTPUT TRUNCATED, 164 LINES SKIPPED ***