commit 7d0bb5c: [Rework] Html: Make parameters as a vector again

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jun 7 21:00:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-07 21:57:36 +0100
URL: https://github.com/rspamd/rspamd/commit/7d0bb5ce599c01ed4da6e4204acc63d32bfca853 (HEAD -> master)

[Rework] Html: Make parameters as a vector again

---
 src/libserver/html/html.cxx     | 305 +++++++++++++++++++---------------------
 src/libserver/html/html_tag.hxx |  30 +++-
 2 files changed, 171 insertions(+), 164 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index f041f45b7..7ae748dff 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -300,16 +300,12 @@ html_parse_tag_content(rspamd_mempool_t *pool,
 			in > parser_env.saved_p) {
 
 			/* We ignore repeated attributes */
-			auto found_it = tag->parameters.find(parser_env.cur_component.value());
-
-			if (found_it == tag->parameters.end()) {
 				auto sz = (std::size_t)(in - parser_env.saved_p);
 				auto *s = rspamd_mempool_alloc_buffer(pool, sz);
 				memcpy(s, parser_env.saved_p, sz);
 				sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p);
-				tag->parameters.emplace(parser_env.cur_component.value(),
+				tag->parameters.emplace_back(parser_env.cur_component.value(),
 						std::string_view{s, sz});
-			}
 		}
 
 		parser_env.saved_p = nullptr;
@@ -635,11 +631,11 @@ html_process_url_tag(rspamd_mempool_t *pool,
 					 struct html_tag *tag,
 					 struct html_content *hc) -> std::optional<struct rspamd_url *>
 {
-	auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
+	auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
 
-	if (found_href_it != tag->parameters.end()) {
+	if (found_href_maybe) {
 		/* Check base url */
-		auto &href_value = found_href_it->second;
+		auto &href_value = found_href_maybe.value();
 
 		if (hc && hc->base_url && href_value.size() > 2) {
 			/*
@@ -823,148 +819,145 @@ html_process_img_tag(rspamd_mempool_t *pool,
 	img->tag = tag;
 	tag->flags |= FL_IMAGE;
 
-	auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
 
-	if (found_href_it != tag->parameters.end()) {
-		/* Check base url */
-		const auto &href_value = found_href_it->second;
+	for (const auto &param : tag->parameters) {
 
-		if (href_value.size() > 0) {
-			rspamd_ftok_t fstr;
-			fstr.begin = href_value.data();
-			fstr.len = href_value.size();
-			img->src = rspamd_mempool_ftokdup (pool, &fstr);
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
+			/* Check base url */
+			const auto &href_value = param.value;
 
-			if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
-					"cid:", sizeof("cid:") - 1) == 0) {
-				/* We have an embedded image */
-				img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
-			}
-			else {
-				if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
-						"data:", sizeof("data:") - 1) == 0) {
-					/* We have an embedded image in HTML tag */
-					img->flags |=
-							(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
-					html_process_data_image(pool, img, href_value);
-					hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+			if (href_value.size() > 0) {
+				rspamd_ftok_t fstr;
+				fstr.begin = href_value.data();
+				fstr.len = href_value.size();
+				img->src = rspamd_mempool_ftokdup (pool, &fstr);
+
+				if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
+						"cid:", sizeof("cid:") - 1) == 0) {
+					/* We have an embedded image */
+					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
 				}
 				else {
-					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
-					if (img->src) {
+					if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
+							"data:", sizeof("data:") - 1) == 0) {
+						/* We have an embedded image in HTML tag */
+						img->flags |=
+								(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
+						html_process_data_image(pool, img, href_value);
+						hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+					}
+					else {
+						img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
+						if (img->src) {
 
-						std::string_view cpy{href_value};
-						auto maybe_url = html_process_url(pool, cpy);
+							std::string_view cpy{href_value};
+							auto maybe_url = html_process_url(pool, cpy);
 
-						if (maybe_url) {
-							img->url = maybe_url.value();
-							struct rspamd_url *existing;
+							if (maybe_url) {
+								img->url = maybe_url.value();
+								struct rspamd_url *existing;
 
-							img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
-							existing = rspamd_url_set_add_or_return(url_set, img->url);
+								img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
+								existing = rspamd_url_set_add_or_return(url_set, img->url);
 
-							if (existing != img->url) {
-								/*
-								 * We have some other URL that could be
-								 * found, e.g. from another part. However,
-								 * we still want to set an image flag on it
-								 */
-								existing->flags |= img->url->flags;
-								existing->count++;
-							}
-							else if (part_urls) {
-								/* New url */
-								g_ptr_array_add(part_urls, img->url);
+								if (existing != img->url) {
+									/*
+									 * We have some other URL that could be
+									 * found, e.g. from another part. However,
+									 * we still want to set an image flag on it
+									 */
+									existing->flags |= img->url->flags;
+									existing->count++;
+								}
+								else if (part_urls) {
+									/* New url */
+									g_ptr_array_add(part_urls, img->url);
+								}
 							}
 						}
 					}
 				}
 			}
 		}
-	}
 
 
-	auto found_height_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT);
-	if (found_height_it != tag->parameters.end()) {
-		unsigned long val;
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
+			unsigned long val;
 
-		rspamd_strtoul(found_height_it->second.data(), found_height_it->second.size(), &val);
-		img->height = val;
-	}
+			rspamd_strtoul(param.value.data(), param.value.size(), &val);
+			img->height = val;
+		}
 
-	auto found_width_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_WIDTH);
-	if (found_width_it != tag->parameters.end()) {
-		unsigned long val;
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
+			unsigned long val;
 
-		rspamd_strtoul(found_width_it->second.data(), found_width_it->second.size(), &val);
-		img->width = val;
-	}
+			rspamd_strtoul(param.value.data(), param.value.size(), &val);
+			img->width = val;
+		}
 
-	/* TODO: rework to css at some time */
-	auto found_style_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_STYLE);
-	if (found_style_it != tag->parameters.end()) {
-		if (found_height_it == tag->parameters.end()) {
-			auto style_st = found_style_it->second;
-			auto pos = rspamd_substring_search_caseless(style_st.data(),
-					style_st.size(),
-					"height", sizeof("height") - 1);
-			if (pos != -1) {
-				auto substr = style_st.substr(pos + sizeof("height") - 1);
-
-				for (auto i = 0; i < substr.size(); i ++) {
-					auto t = substr[i];
-					if (g_ascii_isdigit (t)) {
-						unsigned long val;
-						rspamd_strtoul(substr.data(),
-								substr.size(), &val);
-						img->height = val;
-						break;
-					}
-					else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
-						/* Fallback */
-						break;
+		/* TODO: rework to css at some time */
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
+			if (img->height == 0) {
+				auto style_st = param.value;
+				auto pos = rspamd_substring_search_caseless(style_st.data(),
+						style_st.size(),
+						"height", sizeof("height") - 1);
+				if (pos != -1) {
+					auto substr = style_st.substr(pos + sizeof("height") - 1);
+
+					for (auto i = 0; i < substr.size(); i++) {
+						auto t = substr[i];
+						if (g_ascii_isdigit (t)) {
+							unsigned long val;
+							rspamd_strtoul(substr.data(),
+									substr.size(), &val);
+							img->height = val;
+							break;
+						}
+						else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
+							/* Fallback */
+							break;
+						}
 					}
 				}
 			}
-		}
-		if (found_width_it == tag->parameters.end()) {
-			auto style_st = found_style_it->second;
-			auto pos = rspamd_substring_search_caseless(style_st.data(),
-					style_st.size(),
-					"width", sizeof("width") - 1);
-			if (pos != -1) {
-				auto substr = style_st.substr(pos + sizeof("width") - 1);
-
-				for (auto i = 0; i < substr.size(); i ++) {
-					auto t = substr[i];
-					if (g_ascii_isdigit (t)) {
-						unsigned long val;
-						rspamd_strtoul(substr.data(),
-								substr.size(), &val);
-						img->width = val;
-						break;
-					}
-					else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
-						/* Fallback */
-						break;
+			if (img->width == 0) {
+				auto style_st = param.value;
+				auto pos = rspamd_substring_search_caseless(style_st.data(),
+						style_st.size(),
+						"width", sizeof("width") - 1);
+				if (pos != -1) {
+					auto substr = style_st.substr(pos + sizeof("width") - 1);
+
+					for (auto i = 0; i < substr.size(); i++) {
+						auto t = substr[i];
+						if (g_ascii_isdigit (t)) {
+							unsigned long val;
+							rspamd_strtoul(substr.data(),
+									substr.size(), &val);
+							img->width = val;
+							break;
+						}
+						else if (!g_ascii_isspace (t) && t != '=' && t != ':') {
+							/* Fallback */
+							break;
+						}
 					}
 				}
 			}
 		}
-	}
-
-	auto found_alt_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
 
-	if (found_alt_it != tag->parameters.end()) {
-		if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
-			/* Add a space */
-			hc->parsed += ' ';
-		}
-		hc->parsed.append(found_alt_it->second);
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_ALT) {
+			if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) {
+				/* Add a space */
+				hc->parsed += ' ';
+			}
+			hc->parsed.append(param.value);
 
-		if (!g_ascii_isspace (hc->parsed.back())) {
-			/* Add a space */
-			hc->parsed += ' ';
+			if (!g_ascii_isspace (hc->parsed.back())) {
+				/* Add a space */
+				hc->parsed += ' ';
+			}
 		}
 	}
 
@@ -987,10 +980,10 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 					  khash_t (rspamd_url_hash) *url_set,
 					  GPtrArray *part_urls) -> void
 {
-	auto found_rel_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_REL);
+	auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
 
-	if (found_rel_it != tag->parameters.end()) {
-		if (found_rel_it->second == "icon") {
+	if (found_rel_maybe) {
+		if (found_rel_maybe.value() == "icon") {
 			html_process_img_tag(pool, tag, hc, url_set, part_urls);
 		}
 	}
@@ -1484,45 +1477,41 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	bl->font_size = (guint) -1;
 	bl->font_color.d.comp.alpha = 255;
 
-	auto found_color_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_COLOR);
-
-	if (found_color_it != tag->parameters.end()) {
-		html_process_color(found_color_it->second, &bl->font_color);
-		msg_debug_html ("tag %*s; got color: %xd",
-				(int)tag->name.size(), tag->name.data(),
-				bl->font_color.d.val);
-	}
-
-	auto found_bgcolor_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR);
+	for (const auto &param : tag->parameters) {
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
+			html_process_color(param.value, &bl->font_color);
+			msg_debug_html ("tag %*s; got color: %xd",
+					(int) tag->name.size(), tag->name.data(),
+					bl->font_color.d.val);
+		}
 
-	if (found_bgcolor_it != tag->parameters.end()) {
-		html_process_color(found_bgcolor_it->second, &bl->background_color);
-		msg_debug_html ("tag %*s; got bgcolor: %xd",
-				(int)tag->name.size(), tag->name.data(),
-				bl->background_color.d.val);
-		if (tag->id == Tag_BODY) {
-			/* Set global background color */
-			memcpy(&hc->bgcolor, &bl->background_color,
-					sizeof(hc->bgcolor));
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
+			html_process_color(param.value, &bl->background_color);
+			msg_debug_html ("tag %*s; got bgcolor: %xd",
+					(int) tag->name.size(), tag->name.data(),
+					bl->background_color.d.val);
+			if (tag->id == Tag_BODY) {
+				/* Set global background color */
+				memcpy(&hc->bgcolor, &bl->background_color,
+						sizeof(hc->bgcolor));
+			}
 		}
-	}
 
-	auto found_style_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_STYLE);
-	if (found_style_it != tag->parameters.end()) {
-		html_process_style(pool, bl, hc, found_style_it->second);
-		msg_debug_html ("tag: %*s; got style: %*s",
-				(int)tag->name.size(), tag->name.data(),
-				(int) bl->style.len, bl->style.begin);
-	}
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
+			html_process_style(pool, bl, hc, param.value);
+			msg_debug_html ("tag: %*s; got style: %*s",
+					(int) tag->name.size(), tag->name.data(),
+					(int) bl->style.len, bl->style.begin);
+		}
 
-	auto found_class_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_CLASS);
-	if (found_class_it != tag->parameters.end()) {
-		rspamd_ftok_t fstr;
-		fstr.begin = found_class_it->second.data();
-		fstr.len = found_class_it->second.size();
-		bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
-		msg_debug_html ("tag: %*s; got class: %s",
-				(int)tag->name.size(), tag->name.data(), bl->html_class);
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
+			rspamd_ftok_t fstr;
+			fstr.begin = param.value.data();
+			fstr.len = param.value.size();
+			bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
+			msg_debug_html ("tag: %*s; got class: %s",
+					(int) tag->name.size(), tag->name.data(), bl->html_class);
+		}
 	}
 
 	hc->blocks.push_back(bl);
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 4aba9af41..251ba148c 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -22,7 +22,7 @@
 #include <string_view>
 #include <variant>
 #include <vector>
-#include <contrib/robin-hood/robin_hood.h>
+#include <optional>
 
 namespace rspamd::html {
 
@@ -41,20 +41,38 @@ enum class html_component_type : std::uint8_t {
 };
 
 using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>;
+struct html_tag_component {
+	html_component_type type;
+	std::string_view value;
+
+	html_tag_component(html_component_type type, std::string_view value)
+		: type(type), value(value) {}
+};
 
 struct html_tag {
-	gint id = -1;
-	gint flags = 0;
-	mutable guint content_length = 0; /* Allow content length propagation */
-	goffset content_offset = 0;
+	int id = -1;
+	unsigned int flags = 0;
+	mutable unsigned int content_length = 0; /* Allow content length propagation */
+	unsigned int content_offset = 0;
 
 	std::string_view name;
-	robin_hood::unordered_flat_map<html_component_type, std::string_view> parameters;
+	std::vector<html_tag_component> parameters;
 
 	html_tag_extra_t extra;
 	struct html_block *block = nullptr; /* TODO: temporary, must be handled by css */
 	std::vector<struct html_tag *> children;
 	struct html_tag *parent;
+
+	auto find_component(html_component_type what) const -> std::optional<std::string_view>
+	{
+		for (const auto &comp : parameters) {
+			if (comp.type == what) {
+				return comp.value;
+			}
+		}
+
+		return std::nullopt;
+	}
 };
 
 }


More information about the Commits mailing list