commit b6a4d1a: [Rework] Html: Move images processing stuff

Vsevolod Stakhov vsevolod at highsecure.ru
Thu May 27 14:07:08 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-25 15:33:37 +0100
URL: https://github.com/rspamd/rspamd/commit/b6a4d1a2edc0ea72d305b67519142174c93f084e

[Rework] Html: Move images processing stuff

---
 src/libserver/html/html.cxx | 239 ++++++++++++++++++++++----------------------
 src/libutil/cxx/util.hxx    |   2 +-
 2 files changed, 120 insertions(+), 121 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index c384a9023..8a615ced6 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -767,10 +767,11 @@ process_html_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
 	}
 }
 
-static void
-rspamd_html_process_data_image(rspamd_mempool_t *pool,
-							   struct html_image *img,
-							   struct html_tag_component *src) {
+static auto
+html_process_data_image(rspamd_mempool_t *pool,
+						struct html_image *img,
+						std::string_view input) -> void
+{
 	/*
 	 * Here, we do very basic processing of the data:
 	 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
@@ -778,11 +779,10 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
 	 * We ignore content type so far
 	 */
 	struct rspamd_image *parsed_image;
-	const gchar *semicolon_pos = NULL, *end = (gchar *)src->start + src->len;
-
-	semicolon_pos = (gchar *)src->start;
+	const gchar *semicolon_pos = input.data(),
+			*end = input.data() + input.size();
 
-	while ((semicolon_pos = (gchar *)memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
+	if ((semicolon_pos = (const gchar *)memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
 		if (end - semicolon_pos > sizeof("base64,")) {
 			if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) {
 				const gchar *data_pos = semicolon_pos + sizeof("base64,");
@@ -791,7 +791,7 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
 				rspamd_ftok_t inp;
 
 				decoded_len = (encoded_len / 4 * 3) + 12;
-				decoded = (gchar *)rspamd_mempool_alloc (pool, decoded_len);
+				decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
 				rspamd_cryptobox_base64_decode(data_pos, encoded_len,
 						reinterpret_cast<guchar *>(decoded), &decoded_len);
 				inp.begin = decoded;
@@ -806,69 +806,63 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
 					img->embedded_image = parsed_image;
 				}
 			}
-
-			break;
 		}
 		else {
 			/* Nothing useful */
 			return;
 		}
-
-		semicolon_pos++;
 	}
 }
 
 static void
-html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+html_process_img_tag(rspamd_mempool_t *pool,
+					 struct html_tag *tag,
 					 struct html_content *hc,
 					 khash_t (rspamd_url_hash) *url_set,
 					 GPtrArray *part_urls,
 					 GByteArray *dest)
 {
-	struct html_tag_component *comp;
 	struct html_image *img;
-	rspamd_ftok_t fstr;
-	const guchar *p;
-	GList *cur;
-	gulong val;
-	gboolean seen_width = FALSE, seen_height = FALSE;
-	goffset pos;
 
-	cur = tag->params->head;
 	img = rspamd_mempool_alloc0_type (pool, struct html_image);
 	img->tag = tag;
 	tag->flags |= FL_IMAGE;
 
-	while (cur) {
-		comp = static_cast<html_tag_component *>(cur->data);
+	auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
 
-		if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
-			fstr.begin = (gchar *) comp->start;
-			fstr.len = comp->len;
+	if (found_href_it != tag->parameters.end()) {
+		/* Check base url */
+		const auto &href_value = found_href_it->second;
+
+		if (href_value.size() > 0) {
+			rspamd_ftok_t fstr;
+			fstr.begin = href_value.data();
+			fstr.len = href_value.size();
 			img->src = rspamd_mempool_ftokdup (pool, &fstr);
 
-			if (comp->len > sizeof("cid:") - 1 && memcmp(comp->start,
+			if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
 					"cid:", sizeof("cid:") - 1) == 0) {
 				/* We have an embedded image */
 				img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
 			}
 			else {
-				if (comp->len > sizeof("data:") - 1 && memcmp(comp->start,
+				if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
 						"data:", sizeof("data:") - 1) == 0) {
 					/* We have an embedded image in HTML tag */
 					img->flags |=
 							(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
-					rspamd_html_process_data_image(pool, img, comp);
+					html_process_data_image(pool, img, href_value);
 					hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
 				}
 				else {
 					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
 					if (img->src) {
 
-						img->url = rspamd_html_process_url(pool,
-								img->src, fstr.len, NULL);
+						std::string_view cpy{href_value};
+						auto maybe_url = html_process_url(pool, cpy);
 
-						if (img->url) {
+						if (maybe_url) {
+							img->url = maybe_url.value();
 							struct rspamd_url *existing;
 
 							img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
@@ -892,95 +886,109 @@ html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 				}
 			}
 		}
-		else if (comp->type == RSPAMD_HTML_COMPONENT_HEIGHT) {
-			rspamd_strtoul(reinterpret_cast<const gchar *>(comp->start), comp->len, &val);
-			img->height = val;
-			seen_height = TRUE;
-		}
-		else if (comp->type == RSPAMD_HTML_COMPONENT_WIDTH) {
-			rspamd_strtoul(reinterpret_cast<const gchar *>(comp->start), comp->len, &val);
-			img->width = val;
-			seen_width = TRUE;
-		}
-		else if (comp->type == RSPAMD_HTML_COMPONENT_STYLE) {
-			/* Try to search for height= or width= in style tag */
-			if (!seen_height && comp->len > 0) {
-				pos = rspamd_substring_search_caseless(reinterpret_cast<const gchar *>(comp->start),
-						comp->len,
-						"height", sizeof("height") - 1);
-
-				if (pos != -1) {
-					p = comp->start + pos + sizeof("height") - 1;
-
-					while (p < comp->start + comp->len) {
-						if (g_ascii_isdigit (*p)) {
-							rspamd_strtoul(reinterpret_cast<const gchar *>(p),
-									comp->len - (p - comp->start), &val);
-							img->height = val;
-							break;
-						}
-						else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
-							/* Fallback */
-							break;
-						}
-						p++;
-					}
-				}
-			}
+	}
 
-			if (!seen_width && comp->len > 0) {
-				pos = rspamd_substring_search_caseless(reinterpret_cast<const gchar *>(comp->start),
-						comp->len,
-						"width", sizeof("width") - 1);
 
-				if (pos != -1) {
-					p = comp->start + pos + sizeof("width") - 1;
+	auto found_height_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT);
+	if (found_height_it != tag->parameters.end()) {
+		unsigned long val;
 
-					while (p < comp->start + comp->len) {
-						if (g_ascii_isdigit (*p)) {
-							rspamd_strtoul(reinterpret_cast<const gchar *>(p),
-									comp->len - (p - comp->start), &val);
-							img->width = val;
-							break;
-						}
-						else if (!g_ascii_isspace (*p) && *p != '=' && *p != ':') {
-							/* Fallback */
-							break;
-						}
-						p++;
+		rspamd_strtoul(found_height_it->second.data(), found_height_it->second.size(), &val);
+		img->height = val;
+	}
+
+	auto found_width_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_WIDTH);
+	if (found_width_it != tag->parameters.end()) {
+		unsigned long val;
+
+		rspamd_strtoul(found_width_it->second.data(), found_width_it->second.size(), &val);
+		img->width = val;
+	}
+
+	/* TODO: rework to css at some time */
+	auto found_style_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_STYLE);
+	if (found_style_it != tag->parameters.end()) {
+		if (found_height_it == tag->parameters.end()) {
+			auto style_st = found_style_it->second;
+			auto pos = rspamd_substring_search_caseless(style_st.data(),
+					style_st.size(),
+					"height", sizeof("height") - 1);
+			if (pos != -1) {
+				auto substr = style_st.substr(pos + sizeof("height") - 1);
+
+				for (auto i = 0; i < substr.size(); i ++) {
+					auto t = substr[i];
+					if (g_ascii_isdigit (t)) {
+						unsigned long val;
+						rspamd_strtoul(substr.data(),
+								substr.size(), &val);
+						img->height = val;
+						break;
+					}
+					else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
+						/* Fallback */
+						break;
 					}
 				}
 			}
 		}
-		else if (comp->type == RSPAMD_HTML_COMPONENT_ALT && comp->len > 0 && dest != NULL) {
-			if (dest->len > 0 && !g_ascii_isspace (dest->data[dest->len - 1])) {
-				/* Add a space */
-				g_byte_array_append(dest, reinterpret_cast<const guint8 *>(" "), 1);
+		if (found_width_it == tag->parameters.end()) {
+			auto style_st = found_style_it->second;
+			auto pos = rspamd_substring_search_caseless(style_st.data(),
+					style_st.size(),
+					"width", sizeof("width") - 1);
+			if (pos != -1) {
+				auto substr = style_st.substr(pos + sizeof("width") - 1);
+
+				for (auto i = 0; i < substr.size(); i ++) {
+					auto t = substr[i];
+					if (g_ascii_isdigit (t)) {
+						unsigned long val;
+						rspamd_strtoul(substr.data(),
+								substr.size(), &val);
+						img->width = val;
+						break;
+					}
+					else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
+						/* Fallback */
+						break;
+					}
+				}
 			}
+		}
+	}
 
-			g_byte_array_append(dest, comp->start, comp->len);
+	auto found_alt_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
 
-			if (!g_ascii_isspace (dest->data[dest->len - 1])) {
-				/* Add a space */
-				g_byte_array_append(dest, reinterpret_cast<const guint8 *>(" "), 1);
-			}
+	if (found_alt_it != tag->parameters.end() && dest != NULL) {
+		if (dest->len > 0 && !g_ascii_isspace (dest->data[dest->len - 1])) {
+			/* Add a space */
+			g_byte_array_append(dest, reinterpret_cast<const guint8 *>(" "), 1);
 		}
 
-		cur = g_list_next (cur);
+		g_byte_array_append(dest,
+				reinterpret_cast<const guint8 *>(found_alt_it->second.data()),
+				found_alt_it->second.size());
+
+		if (!g_ascii_isspace (dest->data[dest->len - 1])) {
+			/* Add a space */
+			g_byte_array_append(dest, reinterpret_cast<const guint8 *>(" "), 1);
+		}
 	}
 
-	if (hc->images == NULL) {
+
+	if (hc->images == nullptr) {
 		hc->images = g_ptr_array_sized_new(4);
-		rspamd_mempool_notify_alloc (pool, 4 * sizeof(gpointer) + sizeof(GPtrArray));
-		rspamd_mempool_add_destructor (pool, rspamd_ptr_array_free_hard,
+		rspamd_mempool_notify_alloc(pool, 4 * sizeof(gpointer) + sizeof(GPtrArray));
+		rspamd_mempool_add_destructor(pool, rspamd_ptr_array_free_hard,
 				hc->images);
 	}
 
 	if (img->embedded_image) {
-		if (!seen_height) {
+		if (img->height == 0) {
 			img->height = img->embedded_image->height;
 		}
-		if (!seen_width) {
+		if (img->width == 0) {
 			img->width = img->embedded_image->width;
 		}
 	}
@@ -989,27 +997,18 @@ html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	tag->extra = img;
 }
 
-static void
-rspamd_html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
-							 struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
-							 GPtrArray *part_urls) {
-	struct html_tag_component *comp;
-	GList *cur;
-
-	cur = tag->params->head;
-
-	while (cur) {
-		comp = static_cast<html_tag_component *>(cur->data);
-
-		if (comp->type == RSPAMD_HTML_COMPONENT_REL && comp->len > 0) {
-			if (comp->len == sizeof("icon") - 1 &&
-				rspamd_lc_cmp(reinterpret_cast<const gchar *>(comp->start), "icon", sizeof("icon") - 1) == 0) {
+static auto
+html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					  struct html_content *hc,
+					  khash_t (rspamd_url_hash) *url_set,
+					  GPtrArray *part_urls) -> void
+{
+	auto found_rel_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_REL);
 
-				rspamd_html_process_img_tag(pool, tag, hc, url_set, part_urls, NULL);
-			}
+	if (found_rel_it != tag->parameters.end()) {
+		if (found_rel_it->second == "icon") {
+			html_process_img_tag(pool, tag, hc, url_set, part_urls, nullptr);
 		}
-
-		cur = g_list_next (cur);
 	}
 }
 
diff --git a/src/libutil/cxx/util.hxx b/src/libutil/cxx/util.hxx
index 2b8ddfe3d..3eeb6d20d 100644
--- a/src/libutil/cxx/util.hxx
+++ b/src/libutil/cxx/util.hxx
@@ -20,6 +20,7 @@
 
 #include <memory>
 #include <array>
+#include <string_view>
 
 /*
  * Common C++ utilities
@@ -63,7 +64,6 @@ constexpr auto array_of(T&&... t) -> std::array<V, sizeof...(T)>
 	return {{ std::forward<T>(t)... }};
 }
 
-
 }
 
 #endif //RSPAMD_UTIL_HXX


More information about the Commits mailing list