commit 909a594: [Rework] Save invisible content to a separate buffer

Vsevolod Stakhov <vsevolod at highsecure.ru>
Tue Sep 7 14:07:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-09-07 15:00:38 +0100
URL: https://github.com/rspamd/rspamd/commit/909a594f8d003dbe89462ad02d1634371f42bec3

[Rework] Save invisible content to a separate buffer

---
 src/libserver/html/html.cxx     | 115 ++++++++++++++++++++++++++++------------
 src/libserver/html/html.hxx     |   1 +
 src/libserver/html/html_tag.hxx |  16 ++----
 src/lua/lua_html.cxx            |   6 +--
 4 files changed, 89 insertions(+), 49 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e4cc137b4..97009749f 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -985,12 +985,15 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 }
 
 static inline auto
-html_append_parsed(struct html_content *hc, std::string_view data, bool transparent,
-		std::size_t input_len) -> std::size_t
+html_append_parsed(struct html_content *hc,
+				   std::string_view data,
+				   bool transparent,
+				   std::size_t input_len,
+				   std::string &dest) -> std::size_t
 {
-	auto cur_offset = hc->parsed.size();
+	auto cur_offset = dest.size();
 
-	if (hc->parsed.size() > input_len) {
+	if (dest.size() > input_len) {
 		/* Impossible case, refuse to append */
 		return 0;
 	}
@@ -999,9 +1002,9 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar
 		/* Handle multiple spaces at the begin */
 
 		if (cur_offset > 0) {
-			auto last = hc->parsed.back();
+			auto last = dest.back();
 			if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
-				hc->parsed.append(" ");
+				dest.append(" ");
 				data = {data.data() + 1, data.size() - 1};
 				cur_offset++;
 			}
@@ -1020,24 +1023,24 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar
 				}
 			};
 
-			hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD"));
-			replace_zero_func(data, hc->parsed);
+			dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
+			replace_zero_func(data, dest);
 			hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
 		}
 		else {
-			hc->parsed.append(data);
+			dest.append(data);
 		}
 	}
 
-	auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset,
-			hc->parsed.size() - cur_offset, true);
+	auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
+			dest.size() - cur_offset, true);
 
-	hc->parsed.resize(nlen + cur_offset);
+	dest.resize(nlen + cur_offset);
 
 	if (transparent) {
 		/* Replace all visible characters with spaces */
-		auto start = std::next(hc->parsed.begin(), cur_offset);
-		std::replace_if(start, std::end(hc->parsed), [](const auto c) {
+		auto start = std::next(dest.begin(), cur_offset);
+		std::replace_if(start, std::end(dest), [](const auto c) {
 			return !g_ascii_isspace(c);
 		}, ' ');
 	}
@@ -1076,11 +1079,18 @@ html_append_tag_content(rspamd_mempool_t *pool,
 {
 	auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
 	goffset next_tag_offset = tag->closing.end,
-			initial_dest_offset = hc->parsed.size();
+			initial_parsed_offset = hc->parsed.size(),
+			initial_invisible_offset = hc->invisible.size();
 
-	auto calculate_final_tag_offsets = [&tag, initial_dest_offset, hc]() -> void {
-		tag->content_offset = initial_dest_offset;
-		tag->closing.start = hc->parsed.size();
+	auto calculate_final_tag_offsets = [&]() -> void {
+		if (is_visible) {
+			tag->content_offset = initial_parsed_offset;
+			tag->closing.start = hc->parsed.size();
+		}
+		else {
+			tag->content_offset = initial_invisible_offset;
+			tag->closing.start = hc->invisible.size();
+		}
 	};
 
 	if (tag->closing.end == -1) {
@@ -1098,17 +1108,18 @@ html_append_tag_content(rspamd_mempool_t *pool,
 	}
 
 	auto append_margin = [&](char c) -> void {
+		/* We only care about margins for visible content */
 		if (is_visible) {
 			if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
 				if (hc->parsed.back() == ' ') {
 					/* We also strip extra spaces at the end, but limiting the start */
-					auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_dest_offset);
+					auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
 					auto first = std::find_if(hc->parsed.rbegin(), last,
 							[](auto ch) -> auto {
 								return ch != ' ';
 							});
 					hc->parsed.erase(first.base(), hc->parsed.end());
-					g_assert(hc->parsed.size() >= initial_dest_offset);
+					g_assert(hc->parsed.size() >= initial_parsed_offset);
 				}
 				hc->parsed.push_back(c);
 			}
@@ -1177,10 +1188,17 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		auto enclosed_start = cld->tag_start;
 		goffset initial_part_len = enclosed_start - cur_offset;
 
-		if (is_visible && initial_part_len > 0) {
-			html_append_parsed(hc,
-					{start + cur_offset, std::size_t(initial_part_len)},
-					is_transparent, len);
+		if (initial_part_len > 0) {
+			if (is_visible) {
+				html_append_parsed(hc,
+						{start + cur_offset, std::size_t(initial_part_len)},
+						is_transparent, len, hc->parsed);
+			}
+			else {
+				html_append_parsed(hc,
+						{start + cur_offset, std::size_t(initial_part_len)},
+						is_transparent, len, hc->invisible);
+			}
 		}
 
 		auto next_offset = html_append_tag_content(pool, start, len,
@@ -1195,11 +1213,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
 	if (cur_offset < tag->closing.start) {
 		goffset final_part_len = tag->closing.start - cur_offset;
 
-		if (is_visible && final_part_len > 0) {
-			html_append_parsed(hc,
-					{start + cur_offset, std::size_t(final_part_len)},
-					 is_transparent,
-					 len);
+		if (final_part_len > 0) {
+			if (is_visible) {
+				html_append_parsed(hc,
+						{start + cur_offset, std::size_t(final_part_len)},
+						is_transparent,
+						len,
+						hc->parsed);
+			}
+			else {
+				html_append_parsed(hc,
+						{start + cur_offset, std::size_t(final_part_len)},
+						is_transparent,
+						len,
+						hc->invisible);
+			}
 		}
 	}
 	if (is_block) {
@@ -1211,11 +1239,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
 
 	if (is_visible) {
 		if (tag->id == Tag_A) {
-			auto written_len = hc->parsed.size() - initial_dest_offset;
+			auto written_len = hc->parsed.size() - initial_parsed_offset;
 			html_process_displayed_href_tag(pool, hc,
-					{hc->parsed.data() + initial_dest_offset, written_len},
+					{hc->parsed.data() + initial_parsed_offset, written_len},
 					tag, exceptions,
-					url_set, initial_dest_offset);
+					url_set, initial_parsed_offset);
 		}
 		else if (tag->id == Tag_IMG) {
 			/* Process ALT if presented */
@@ -1997,7 +2025,7 @@ html_process_input(rspamd_mempool_t *pool,
 		break;
 	case tags_limit_overflow:
 		html_append_parsed(hc, {c, (std::size_t) (end - c)},
-				false, end - start);
+				false, end - start, hc->parsed);
 		break;
 	default:
 		/* Do nothing */
@@ -2084,6 +2112,27 @@ auto html_tag_by_name(const std::string_view &name)
 	return std::nullopt;
 }
 
+auto
+html_tag::get_content(const struct html_content *hc) const -> std::string_view
+{
+	const std::string *dest = &hc->parsed;
+
+	if (block && !block->is_visible()) {
+		dest = &hc->invisible;
+	}
+	const auto clen = get_content_length();
+	if (content_offset < dest->size()) {
+		if (dest->size() - content_offset >= clen) {
+			return std::string_view{*dest}.substr(content_offset, clen);
+		}
+		else {
+			return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
+		}
+	}
+
+	return std::string_view{};
+}
+
 }
 
 void *
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index 7e63bedce..5c16d085a 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -47,6 +47,7 @@ struct html_content {
 	std::vector<html_image *> images;
 	std::vector<std::unique_ptr<struct html_tag>> all_tags;
 	std::string parsed;
+	std::string invisible;
 	std::shared_ptr<css::css_style_sheet> css_style;
 
 	/* Preallocate and reserve all internal structures */
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index b6fc73120..5971ca179 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -31,6 +31,8 @@ struct html_image;
 
 namespace rspamd::html {
 
+struct html_content; /* Forward declaration */
+
 enum class html_component_type : std::uint8_t {
 	RSPAMD_HTML_COMPONENT_NAME = 0,
 	RSPAMD_HTML_COMPONENT_HREF,
@@ -141,19 +143,7 @@ struct html_tag {
 		return 0;
 	}
 
-	constexpr auto get_content(std::string_view parsed) const -> std::string_view {
-		const auto clen = get_content_length();
-		if (content_offset < parsed.size()) {
-			if (parsed.size() - content_offset >= clen) {
-				return parsed.substr(content_offset, clen);
-			}
-			else {
-				return parsed.substr(content_offset, parsed.size() - content_offset);
-			}
-		}
-
-		return std::string_view{};
-	}
+	auto get_content(const struct html_content *hc) const -> std::string_view;
 };
 
 static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY);
diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx
index 848bfbdf3..8767d1a11 100644
--- a/src/lua/lua_html.cxx
+++ b/src/lua/lua_html.cxx
@@ -448,7 +448,7 @@ lua_html_foreach_tag (lua_State *L)
 				auto *ltag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag)));
 				ltag->tag = tag;
 				ltag->html = hc;
-				auto ct = ltag->tag->get_content(hc->parsed);
+				auto ct = ltag->tag->get_content(hc);
 				rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
 				lua_pushinteger (L, ct.size());
 
@@ -582,7 +582,7 @@ lua_html_tag_get_content (lua_State *L)
 	if (ltag) {
 
 		if (ltag->html) {
-			auto ct = ltag->tag->get_content(ltag->html->parsed);
+			auto ct = ltag->tag->get_content(ltag->html);
 			if (ct.size() > 0) {
 				t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
 				rspamd_lua_setclass(L, "rspamd{text}", -1);
@@ -613,7 +613,7 @@ lua_html_tag_get_content_length (lua_State *L)
 
 	if (ltag) {
 		if (ltag->html) {
-			auto ct = ltag->tag->get_content(ltag->html->parsed);
+			auto ct = ltag->tag->get_content(ltag->html);
 			lua_pushinteger (L, ct.size());
 		}
 		else {


More information about the Commits mailing list