commit eae0edd: [Project] Html: Another rework of the tags structure

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jun 30 20:28:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-30 12:50:43 +0100
URL: https://github.com/rspamd/rspamd/commit/eae0eddc9625e66d47aabcc5d854a73c5366661c

[Project] Html: Another rework of the tags structure

---
 src/libserver/css/css.cxx       |   2 +-
 src/libserver/html/html.cxx     | 507 ++++++++++++++++------------------------
 src/libserver/html/html.hxx     |   2 +-
 src/libserver/html/html_tag.hxx |  45 ++--
 src/lua/lua_html.cxx            |  15 +-
 5 files changed, 241 insertions(+), 330 deletions(-)

diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 51f537b5a..9e26eb42f 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -114,7 +114,7 @@ css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) ->
 	}
 
 	/* First, find id in a tag and a class */
-	for (const auto &param : tag->parameters) {
+	for (const auto &param : tag->components) {
 		if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) {
 			id_comp = param.value;
 		}
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index f0315434a..9d1531f04 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -75,259 +75,91 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
 
 INIT_LOG_MODULE(html)
 
+/*
+ * This function is expected to be called on a closing tag to fill up all tags
+ * and return the current parent (meaning unclosed) tag
+ */
 static auto
 html_check_balance(struct html_content *hc,
 				   struct html_tag *tag,
-				   struct html_tag *parent,
-				   std::vector<html_tag *> &tags_stack,
 				   goffset tag_start_offset,
-				   goffset tag_end_offset) -> bool
+				   goffset tag_end_offset) -> html_tag *
 {
+	/* As agreed, the closing tag has the last opening at the parent ptr */
+	auto *opening_tag = tag->parent;
 
-	auto calculate_content_length = [tag_start_offset](html_tag *t) {
+	auto calculate_content_length = [tag_start_offset,tag_end_offset](html_tag *t) {
 		auto opening_content_offset = t->content_offset;
 
 		if (opening_content_offset <= tag_start_offset) {
-			t->content_length = tag_start_offset - opening_content_offset;
+			t->closing.start = tag_start_offset;
+			t->closing.end = tag_end_offset;
 		}
 		else {
-			t->content_length = 0;
+
+			t->closing.start = t->content_offset;
+			t->closing.end = tag_end_offset;
 		}
 	};
 
-	auto balance_tag = [&]() -> void {
-		auto it = tags_stack.rbegin();
+	auto balance_tag = [&]() -> html_tag * {
+		auto it = tag->parent;
 
-		for (auto end_it = tags_stack.rend(); it != end_it; ++it) {
-			if ((*it)->id == tag->id && !((*it)->flags & FL_CLOSING)) {
+		for (; it != nullptr; it = it->parent) {
+			if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
 				break;
 			}
 			/* Insert a virtual closing tag for all tags that are not closed */
-			auto &&vtag = std::make_unique<html_tag>();
-			vtag->id = (*it)->id;
-			vtag->flags = FL_VIRTUAL|FL_CLOSING;
-			vtag->tag_start = tag->tag_start;
-			vtag->content_offset = tag->content_offset;
-			vtag->content_length = 0;
-			vtag->parent = (*it)->parent;
-			calculate_content_length(*it);
-			(*it)->flags |= FL_CLOSED;
-			hc->all_tags.emplace_back(std::move(vtag));
+			calculate_content_length(it);
+			it->flags |= FL_CLOSED;
 		}
 
 		/* Remove tags */
-		tags_stack.erase(it.base(), std::end(tags_stack));
+		return it;
 	};
 
-	if (tag->flags & FL_CLOSING) {
-		if (!tags_stack.empty()) {
-			auto *last_tag = tags_stack.back();
+	if (opening_tag) {
 
-			if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) {
-				last_tag->flags |= FL_CLOSED;
+		if (opening_tag->id == tag->id) {
+			opening_tag->flags |= FL_CLOSED;
 
-				calculate_content_length(last_tag);
-				tags_stack.pop_back();
-				/* All good */
-				return true;
-			}
-			else {
-				balance_tag();
-
-				return false;
-			}
+			calculate_content_length(opening_tag);
+			/* All good */
+			return opening_tag->parent;
 		}
 		else {
-			/*
-			 * We have no opening tags in the stack, so we need to assume that there
-			 * is an opening tag at the beginning of the document.
-			 * There are two possibilities:
-			 *
-			 * 1) We have some block tag in hc->all_tags;
-			 * 2) We have no tags
-			 */
-
-			if (hc->all_tags.empty()) {
-				auto &&vtag = std::make_unique<html_tag>();
-				vtag->id = tag->id;
-				vtag->flags = FL_VIRTUAL|FL_CLOSED;
-				vtag->tag_start = 0;
-				vtag->content_offset = 0;
-				calculate_content_length(vtag.get());
-
-
-				if (!hc->root_tag) {
-					hc->root_tag = vtag.get();
-				}
-				else {
-					vtag->parent = hc->root_tag;
-				}
-				hc->all_tags.emplace_back(std::move(vtag));
-			}
-			else {
-				auto found_closing = std::find_if(hc->all_tags.rbegin(),
-						hc->all_tags.rend(),
-						[&](const auto &t) {
-							constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING;
-							return (t->flags & expect_flags) == (expect_flags) &&
-									t.get() != tag &&
-									t->parent != nullptr;
-						});
-
-				if (found_closing != hc->all_tags.rend()) {
-					auto *closing_tag = (*found_closing).get();
-					auto &&vtag = std::make_unique<html_tag>();
-					vtag->id = tag->id;
-					vtag->flags = FL_VIRTUAL|FL_CLOSED;
-					vtag->tag_start = closing_tag->content_offset - 1;
-					vtag->content_offset = vtag->tag_start + 1;
-					vtag->parent = closing_tag->parent;
-					vtag->content_length = tag->tag_start - vtag->content_offset;
-					hc->all_tags.emplace_back(std::move(vtag));
-				}
-				else {
-					auto &&vtag = std::make_unique<html_tag>();
-					vtag->id = tag->id;
-					vtag->flags = FL_VIRTUAL|FL_CLOSED;
-					vtag->tag_start = 0;
-					vtag->content_offset = 0;
-					calculate_content_length(vtag.get());
-
-
-					if (!hc->root_tag) {
-						hc->root_tag = vtag.get();
-					}
-					else {
-						vtag->parent = hc->root_tag;
-					}
-					hc->all_tags.emplace_back(std::move(vtag));
-				}
-			}
+			return balance_tag();
 		}
-
-		return false;
-	}
-
-	/* Misuse */
-	RSPAMD_UNREACHABLE;
-}
-
-static auto
-html_process_tag(rspamd_mempool_t *pool,
-				 struct html_content *hc,
-				 struct html_tag *tag,
-				 std::vector<html_tag *> &tags_stack,
-				 goffset tag_start_offset,
-				 goffset tag_end_offset) -> bool
-{
-	struct html_tag *parent;
-
-	if (hc->total_tags > rspamd::html::max_tags) {
-		hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
-	}
-
-	if (tag->id == -1) {
-		/* Ignore unknown tags */
-		hc->total_tags++;
-		return false;
-	}
-
-
-	if (tags_stack.empty()) {
-		parent = hc->root_tag;
 	}
 	else {
-		parent = tags_stack.back();
-	}
-
-	tag->parent = parent;
-
-	if (!(tag->flags & (CM_EMPTY))) {
-		/* Block tag */
-		if (tag->flags & FL_CLOSING) {
-			/* Closed block tag */
-			if (parent == nullptr) {
-				msg_debug_html ("bad parent node");
-				return false;
-			}
-
-			if (!html_check_balance(hc, tag, parent, tags_stack,
-					tag_start_offset, tag_end_offset)) {
-				msg_debug_html (
-						"mark part as unbalanced as it has not pairable closing tags");
-				hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
-			}
-		}
-		else {
-			/* Opening block tag */
-			if (parent) {
-				if ((parent->flags & FL_IGNORE)) {
-					tag->flags |= FL_IGNORE;
-				}
-
-				if (!(tag->flags & FL_CLOSED) &&
-					(parent->flags & CM_EMPTY)) {
-					/* We likely have some bad nesting */
-					if (parent->id == tag->id) {
-						/* Something like <a>bla<a>foo... */
-						hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
-						tag->parent = parent->parent;
-
-						if (hc->total_tags < rspamd::html::max_tags) {
-							parent->children.push_back(tag);
-							tags_stack.push_back(tag);
-							hc->total_tags++;
-						}
-
-						return true;
-					}
-				}
-
-				if (hc->total_tags < rspamd::html::max_tags) {
-					parent->children.push_back(tag);
+		/*
+		 * We have no opening tag
+		 * There are two possibilities:
+		 *
+		 * 1) We have some block tag in hc->all_tags;
+		 * 2) We have no tags
+		 */
+
+		if (hc->all_tags.empty()) {
+			auto &&vtag = std::make_unique<html_tag>();
+			vtag->id = tag->id;
+			vtag->flags = FL_VIRTUAL|FL_CLOSED;
+			vtag->tag_start = 0;
+			vtag->content_offset = 0;
+			calculate_content_length(vtag.get());
 
-					if ((tag->flags & FL_CLOSED) == 0) {
-						tags_stack.push_back(tag);
-					}
 
-					hc->total_tags++;
-				}
+			if (!hc->root_tag) {
+				hc->root_tag = vtag.get();
 			}
 			else {
-				hc->root_tag = tag;
-				if (hc->total_tags < rspamd::html::max_tags) {
-					if ((tag->flags & FL_CLOSED) == 0) {
-						tags_stack.push_back(tag);
-					}
-
-					hc->total_tags++;
-				}
-			}
-
-			if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) {
-				tag->flags |= FL_IGNORE;
-
-				return false;
-			}
-		}
-	}
-	else {
-		/* Inline tag */
-		if (parent) {
-			if (hc->total_tags < rspamd::html::max_tags) {
-				parent->children.push_back(tag);
-
-				hc->total_tags++;
-			}
-			if ((parent->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE))) {
-				tag->flags |= FL_IGNORE;
-
-				return false;
+				vtag->parent = hc->root_tag;
 			}
+			hc->all_tags.emplace_back(std::move(vtag));
 		}
 	}
 
-	return true;
+	return nullptr;
 }
 
 auto
@@ -404,7 +236,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
 		spaces_before_eq,
 		spaces_after_eq,
 		spaces_after_param,
-		ignore_bad_tag
+		ignore_bad_tag,
 	} state;
 	gboolean store = FALSE;
 
@@ -425,7 +257,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
 				auto *s = rspamd_mempool_alloc_buffer(pool, sz);
 				memcpy(s, parser_env.saved_p, sz);
 				sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p);
-				tag->parameters.emplace_back(parser_env.cur_component.value(),
+				tag->components.emplace_back(parser_env.cur_component.value(),
 						std::string_view{s, sz});
 		}
 
@@ -941,7 +773,7 @@ html_process_img_tag(rspamd_mempool_t *pool,
 	img = rspamd_mempool_alloc0_type (pool, struct html_image);
 	img->tag = tag;
 
-	for (const auto &param : tag->parameters) {
+	for (const auto &param : tag->components) {
 
 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
 			/* Check base url */
@@ -1103,7 +935,7 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 {
 	std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
 
-	for (const auto &param : tag->parameters) {
+	for (const auto &param : tag->components) {
 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
 			maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
 		}
@@ -1173,7 +1005,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 						khash_t (rspamd_url_hash) *url_set) -> goffset
 {
 	auto is_visible = true, is_block = false;
-	goffset next_tag_offset = tag->content_length + tag->content_offset,
+	goffset next_tag_offset = tag->closing.end,
 			initial_dest_offset = hc->parsed.size();
 
 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
@@ -1182,7 +1014,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		return tag->content_offset;
 	}
 
-	if ((tag->flags & (FL_COMMENT|FL_XML))) {
+	if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE))) {
 		is_visible = false;
 	}
 	else {
@@ -1213,8 +1045,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		if (!enclosed_tags.empty()) {
 			next_enclosed = enclosed_tags.back();
 			enclosed_start = next_enclosed->tag_start;
-			enclosed_end = next_enclosed->content_length +
-						   next_enclosed->content_offset;
+			enclosed_end = next_enclosed->closing.end;
 
 			if (enclosed_end > next_tag_offset) {
 				next_tag_offset = enclosed_end;
@@ -1270,7 +1101,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		}
 	}
 
-	if (is_visible && !(tag->flags & FL_CLOSING)) {
+	if (is_visible) {
 		if (tag->id == Tag_A) {
 			auto written_len = hc->parsed.size() - initial_dest_offset;
 			html_process_displayed_href_tag(pool, hc,
@@ -1313,7 +1144,7 @@ html_append_tags_content(rspamd_mempool_t *pool,
 	for (auto i = 0; i < hc->all_tags.size();) {
 		const auto &tag = hc->all_tags[i];
 		html_tag *next_tag = nullptr;
-		auto next_offset = tag->content_offset + tag->content_length;
+		auto next_offset = tag->closing.end;
 
 		auto j = i + 1;
 		while (j < hc->all_tags.size()) {
@@ -1321,9 +1152,9 @@ html_append_tags_content(rspamd_mempool_t *pool,
 
 			if (next_tag->content_offset <= next_offset) {
 				enclosed_tags_stack.push_back(next_tag);
-				if (next_tag->content_offset + next_tag->content_length > next_offset) {
+				if (next_tag->closing.end > next_offset) {
 					/* Tag spans over its parent */
-					next_offset = next_tag->content_offset + next_tag->content_length;
+					next_offset = next_tag->closing.end;
 				}
 				j ++;
 			}
@@ -1351,10 +1182,9 @@ html_process_input(rspamd_mempool_t *pool,
 	guchar t;
 	auto closing = false, in_head = false;
 	guint obrace = 0, ebrace = 0;
-	struct rspamd_url *url = NULL;
+	struct rspamd_url *url = nullptr;
 	gint href_offset = -1;
-	struct html_tag *cur_tag = NULL;
-	std::vector<html_tag *> tags_stack;
+	struct html_tag *cur_tag = nullptr, cur_closing_tag;
 	struct tag_content_parser_state content_parser_env;
 
 	enum {
@@ -1368,10 +1198,12 @@ html_process_input(rspamd_mempool_t *pool,
 		comment_content,
 		sgml_content,
 		tag_content,
-		tag_end,
+		tag_end_opening,
+		tag_end_closing,
 		html_text_content,
 		xml_tag_end,
 		content_style,
+		tags_limit_overflow,
 	} state = parse_start;
 
 	g_assert (in != NULL);
@@ -1380,6 +1212,32 @@ html_process_input(rspamd_mempool_t *pool,
 	struct html_content *hc = new html_content;
 	rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
 
+	auto new_tag = [&](int flags = 0) -> struct html_tag * {
+
+		if (hc->total_tags > rspamd::html::max_tags) {
+			hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
+
+			return nullptr;
+		}
+
+		auto *parent = cur_tag;
+
+		hc->all_tags.emplace_back(std::make_unique<html_tag>());
+		auto *ntag = hc->all_tags.back().get();
+		ntag->tag_start = c - start;
+		ntag->flags = flags;
+
+		if (parent) {
+			ntag->parent = parent;
+			parent->children.push_back(ntag);
+		}
+		else {
+			hc->root_tag = ntag;
+		}
+
+		return ntag;
+	};
+
 	p = (const char *)in->data;
 	c = p;
 	end = p + in->len;
@@ -1392,14 +1250,21 @@ html_process_input(rspamd_mempool_t *pool,
 		case parse_start:
 			if (t == '<') {
 				state = tag_begin;
+				in_head = true;
 			}
 			else {
 				/* We have no starting tag, so assume that it's content */
 				hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
-				hc->all_tags.emplace_back(std::make_unique<html_tag>());
-				cur_tag = hc->all_tags.back().get();
-				cur_tag->id = Tag_HTML;
-				state = content_before_start;
+				in_head = false;
+				cur_tag = new_tag();
+
+				if (cur_tag) {
+					cur_tag->id = Tag_HTML;
+					state = content_before_start;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
 			}
 			break;
 		case content_before_start:
@@ -1419,37 +1284,56 @@ html_process_input(rspamd_mempool_t *pool,
 				closing = FALSE;
 				break;
 			case '!':
-				state = sgml_tag;
-				hc->all_tags.emplace_back(std::make_unique<html_tag>());
-				cur_tag = hc->all_tags.back().get();
-				cur_tag->tag_start = c - start;
+				cur_tag = new_tag(FL_XML);
+				if (cur_tag) {
+					state = sgml_tag;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
 				p ++;
 				break;
 			case '?':
-				state = xml_tag;
-				hc->all_tags.emplace_back(std::make_unique<html_tag>());
-				cur_tag = hc->all_tags.back().get();
-				cur_tag->tag_start = c - start;
-				cur_tag->flags |= FL_XML;
+				cur_tag = new_tag(FL_XML);
+				if (cur_tag) {
+					state = xml_tag;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
 				hc->flags |= RSPAMD_HTML_FLAG_XML;
 				p ++;
 				break;
 			case '/':
 				closing = TRUE;
+				/* We fill fake closing tag to fill it with the content parser */
+				cur_closing_tag.clear();
+				cur_closing_tag.parent = cur_tag; /* For simplicity */
+				cur_tag = &cur_closing_tag;
 				p ++;
 				break;
 			case '>':
 				/* Empty tag */
 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-				state = tag_end;
+				state = html_text_content;
 				continue;
 			default:
-				state = tag_content;
-				content_parser_env.reset();
+				if (g_ascii_isalpha(t)) {
+					state = tag_content;
+					content_parser_env.reset();
+					cur_tag = new_tag();
 
-				hc->all_tags.emplace_back(std::make_unique<html_tag>());
-				cur_tag = hc->all_tags.back().get();
-				cur_tag->tag_start = c - start;
+					if (cur_tag) {
+						state = tag_content;
+					}
+					else {
+						state = tags_limit_overflow;
+					}
+				}
+				else {
+					/* Wrong bad tag */
+					state = html_text_content;
+				}
 				break;
 			}
 
@@ -1482,7 +1366,7 @@ html_process_input(rspamd_mempool_t *pool,
 			else if (t == '>') {
 				/* Misformed xml tag */
 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-				state = tag_end;
+				state = tag_end_opening;
 				continue;
 			}
 			/* We efficiently ignore xml tags */
@@ -1491,7 +1375,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 		case xml_tag_end:
 			if (t == '>') {
-				state = tag_end;
+				state = tag_end_opening;
 				continue;
 			}
 			else {
@@ -1508,16 +1392,16 @@ html_process_input(rspamd_mempool_t *pool,
 				ebrace ++;
 			}
 			else if (t == '>' && obrace == ebrace) {
-				state = tag_end;
+				state = tag_end_opening;
 				continue;
 			}
 			p ++;
 			break;
 
 		case comment_tag:
-			if (t != '-')  {
+			if (t != '-') {
 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-				state = tag_end;
+				state = tag_end_opening;
 			}
 			else {
 				p++;
@@ -1534,11 +1418,11 @@ html_process_input(rspamd_mempool_t *pool,
 				if (p[0] == '-' && p + 1 < end && p[1] == '>') {
 					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
 					p ++;
-					state = tag_end;
+					state = tag_end_opening;
 				}
 				else if (*p == '>') {
 					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-					state = tag_end;
+					state = tag_end_opening;
 				}
 				else {
 					state = comment_content;
@@ -1551,7 +1435,7 @@ html_process_input(rspamd_mempool_t *pool,
 				ebrace ++;
 			}
 			else if (t == '>' && ebrace >= 2) {
-				state = tag_end;
+				state = tag_end_opening;
 				continue;
 			}
 			else {
@@ -1610,7 +1494,7 @@ html_process_input(rspamd_mempool_t *pool,
 		case sgml_content:
 			/* TODO: parse DOCTYPE here */
 			if (t == '>') {
-				state = tag_end;
+				state = tag_end_closing;
 				/* We don't know a lot about sgml tags, ignore them */
 				cur_tag = nullptr;
 				continue;
@@ -1623,32 +1507,38 @@ html_process_input(rspamd_mempool_t *pool,
 
 			if (t == '>') {
 				if (closing) {
-					cur_tag->flags |= FL_CLOSING;
+					cur_tag->closing.start = c - start;
+					cur_tag->closing.end = p - start + 1;
 
-					if (cur_tag->flags & FL_CLOSED) {
-						/* Bad mix of closed and closing */
-						hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+					closing = FALSE;
+					if (cur_tag->id == Tag_HEAD) {
+						in_head = false;
+					}
+					state = tag_end_closing;
+				}
+				else {
+					cur_tag->content_offset = p - start + 1;
+					if (cur_tag->id == Tag_HEAD) {
+						in_head = true;
+					}
+					else if (cur_tag->id == Tag_BODY) {
+						in_head = false;
 					}
 
-					closing = FALSE;
+					state = tag_end_opening;
 				}
 
-				state = tag_end;
+
 				continue;
 			}
 			p ++;
 			break;
 
-		case tag_end:
+		case tag_end_opening:
 			content_parser_env.reset();
*** OUTPUT TRUNCATED, 280 LINES SKIPPED ***


More information about the Commits mailing list