commit 5e99aef: [Project] Html: Rework tags placement

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jun 22 16:21:06 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-21 23:46:26 +0100
URL: https://github.com/rspamd/rspamd/commit/5e99aef6caea1bc206825c20d51491a6571aec52

[Project] Html: Rework tags placement

---
 src/libserver/html/html.cxx     | 110 +++++++++++++++-------------------------
 src/libserver/html/html_tag.hxx |   5 +-
 2 files changed, 45 insertions(+), 70 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 00dcebad6..925735f41 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -78,7 +78,8 @@ INIT_LOG_MODULE(html)
 static auto
 html_check_balance(struct html_tag *tag,
 				   struct html_tag *parent,
-				   std::vector<html_tag *> &tags_stack) -> bool
+				   std::vector<html_tag *> &tags_stack,
+				   goffset tag_start_offset) -> bool
 {
 
 	if (tag->flags & FL_CLOSING) {
@@ -89,7 +90,19 @@ html_check_balance(struct html_tag *tag,
 				});
 
 		if (found_opening != tags_stack.rend()) {
-			(*found_opening)->flags |= FL_CLOSED;
+			auto *opening_tag = (*found_opening);
+			opening_tag->flags |= FL_CLOSED;
+
+			/* Adjust size */
+			auto opening_content_offset = opening_tag->content_offset;
+
+			if (opening_content_offset <= tag_start_offset) {
+				opening_tag->content_length =
+						tag_start_offset - opening_content_offset;
+			}
+			else {
+				opening_tag->content_length = 0;
+			}
 
 			if (found_opening == tags_stack.rbegin()) {
 				tags_stack.pop_back();
@@ -100,7 +113,7 @@ html_check_balance(struct html_tag *tag,
 				/* Move to front */
 				std::iter_swap(found_opening, tags_stack.rbegin());
 				tags_stack.pop_back();
-				return true;
+				return false;
 			}
 		}
 		else {
@@ -117,7 +130,8 @@ static auto
 html_process_tag(rspamd_mempool_t *pool,
 				 struct html_content *hc,
 				 struct html_tag *tag,
-				 std::vector<html_tag *> &tags_stack) -> bool
+				 std::vector<html_tag *> &tags_stack,
+				 goffset tag_start_offset) -> bool
 {
 	struct html_tag *parent;
 
@@ -141,7 +155,7 @@ html_process_tag(rspamd_mempool_t *pool,
 
 	tag->parent = parent;
 
-	if (!(tag->flags & (CM_INLINE | CM_EMPTY))) {
+	if (!(tag->flags & (CM_EMPTY))) {
 		/* Block tag */
 		if ((tag->flags & (FL_CLOSING | FL_CLOSED))) {
 			/* Closed block tag */
@@ -151,7 +165,7 @@ html_process_tag(rspamd_mempool_t *pool,
 			}
 
 			if (hc->total_tags < rspamd::html::max_tags) {
-				if (!html_check_balance(tag, parent, tags_stack)) {
+				if (!html_check_balance(tag, parent, tags_stack, tag_start_offset)) {
 					msg_debug_html (
 							"mark part as unbalanced as it has not pairable closing tags");
 					hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
@@ -168,7 +182,7 @@ html_process_tag(rspamd_mempool_t *pool,
 				}
 
 				if (!(tag->flags & FL_CLOSED) &&
-					!(parent->flags & FL_BLOCK)) {
+					(parent->flags & CM_EMPTY)) {
 					/* We likely have some bad nesting */
 					if (parent->id == tag->id) {
 						/* Something like <a>bla<a>foo... */
@@ -197,6 +211,13 @@ html_process_tag(rspamd_mempool_t *pool,
 			}
 			else {
 				hc->root_tag = tag;
+				if (hc->total_tags < rspamd::html::max_tags) {
+					if ((tag->flags & FL_CLOSED) == 0) {
+						tags_stack.push_back(tag);
+					}
+
+					hc->total_tags++;
+				}
 			}
 
 			if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) {
@@ -204,7 +225,6 @@ html_process_tag(rspamd_mempool_t *pool,
 
 				return false;
 			}
-
 		}
 	}
 	else {
@@ -1062,8 +1082,8 @@ html_process_input(rspamd_mempool_t *pool,
 	gboolean closing = FALSE;
 	guint obrace = 0, ebrace = 0;
 	struct rspamd_url *url = NULL;
-	gint len, href_offset = -1;
-	struct html_tag *cur_tag = NULL, *content_tag = NULL;
+	gint href_offset = -1;
+	struct html_tag *cur_tag = NULL;
 	std::vector<html_tag *> tags_stack;
 	struct tag_content_parser_state content_parser_env;
 
@@ -1121,6 +1141,7 @@ html_process_input(rspamd_mempool_t *pool,
 		case tag_begin:
 			switch (t) {
 			case '<':
+				c = p;
 				p ++;
 				closing = FALSE;
 				break;
@@ -1148,6 +1169,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 				hc->all_tags.emplace_back(std::make_unique<html_tag>());
 				cur_tag = hc->all_tags.back().get();
+				cur_tag->tag_start = c - start;
 				break;
 			}
 
@@ -1314,6 +1336,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 		case tag_content:
 			html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
+
 			if (t == '>') {
 				if (closing) {
 					cur_tag->flags |= FL_CLOSING;
@@ -1338,8 +1361,10 @@ html_process_input(rspamd_mempool_t *pool,
 			if (cur_tag != nullptr) {
 				state = html_text_content;
 
-				cur_tag->content_offset = p - start;
-				if (!html_process_tag(pool, hc, cur_tag, tags_stack)) {
+				cur_tag->content_offset = p - start + 1;
+
+				if (!html_process_tag(pool, hc, cur_tag, tags_stack,
+						c - start)) {
 					if (cur_tag->id == Tag_STYLE) {
 						state = content_style;
 					}
@@ -1355,59 +1380,6 @@ html_process_input(rspamd_mempool_t *pool,
 					hc->tags_seen[cur_tag->id] = true;
 				}
 
-				if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
-					content_tag = cur_tag;
-				}
-
-				/* Handle newlines */
-				if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
-					if (!hc->parsed.empty() &&
-						hc->parsed.back() != '\n') {
-
-						hc->parsed += "\r\n";
-
-						if (content_tag) {
-							if (content_tag->content_length == 0) {
-								/*
-								 * Special case
-								 * we have a \r\n at the beginning but
-								 * we have no set content_offset
-								 * so we need to do it here
-								 */
-								content_tag->content_offset = hc->parsed.size();
-							}
-							else {
-								content_tag->content_length += 2;
-							}
-						}
-					}
-				}
-
-				if ((cur_tag->id == Tag_P ||
-					 cur_tag->id == Tag_TR ||
-					 cur_tag->id == Tag_DIV)) {
-					if (!hc->parsed.empty() &&
-						hc->parsed.back() != '\n') {
-
-						hc->parsed += "\r\n";
-
-						if (content_tag) {
-							if (content_tag->content_length == 0) {
-								/*
-								 * Special case
-								 * we have a \r\n at the beginning but
-								 * we have no set content_offset
-								 * so we need to get it here
-								 */
-								content_tag->content_offset = hc->parsed.size();
-							}
-							else {
-								content_tag->content_length += 2;
-							}
-						}
-					}
-				}
-
 				/* XXX: uncomment when styles parsing is not so broken */
 				if (cur_tag->flags & FL_HREF /* && !(cur_tag->flags & FL_IGNORE) */) {
 					if (!(cur_tag->flags & (FL_CLOSING))) {
@@ -1502,7 +1474,7 @@ html_process_input(rspamd_mempool_t *pool,
 							part_urls);
 				}
 
-				if (cur_tag->flags & FL_BLOCK) {
+				if (!(cur_tag->flags & CM_EMPTY)) {
 
 					if (!(cur_tag->flags & FL_CLOSING)) {
 						html_process_block_tag(pool, cur_tag, hc);
@@ -1520,6 +1492,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 	/* Summarize content length from children */
 	hc->traverse_block_tags([](const html_tag *tag) -> bool {
+
 		for (const auto *cld_tag : tag->children) {
 			tag->content_length += cld_tag->content_length;
 		}
@@ -1716,13 +1689,14 @@ TEST_CASE("html text extraction")
 {
 
 	const std::vector<std::pair<std::string, std::string>> cases{
+			{"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
+			{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
 			{"test", "test"},
 			{"test   ", "test "},
 			{"test   foo,   bar", "test foo, bar"},
 			{"<p>text</p>", "text"},
 			{"olo<p>text</p>lolo", "olo\ntext\nlolo"},
-			{"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
-			{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
+
 			{"foo<br>baz", "foo\nbaz"},
 			{"<div>foo</div><div>bar</div>", "foo\nbar"},
 	};
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 72ae6d616..f6442bdc3 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -57,10 +57,11 @@ struct html_tag_component {
 };
 
 struct html_tag {
-	int id = -1;
-	unsigned int flags = 0;
+	unsigned int tag_start = 0;
 	mutable unsigned int content_length = 0; /* Allow content length propagation */
 	unsigned int content_offset = 0;
+	std::uint32_t flags = 0;
+	std::int16_t id = -1;
 
 	std::string_view name;
 	std::vector<html_tag_component> parameters;


More information about the Commits mailing list