commit 620d003: [Project] Html: Implement logic for tags pairing

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Jun 26 13:49:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-26 14:30:19 +0100
URL: https://github.com/rspamd/rspamd/commit/620d003e7ab56e8f4cf1fc4a2569fd20f8f98b3f

[Project] Html: Implement logic for tags pairing

---
 src/libserver/html/html.cxx     | 135 ++++++++++++++++++++++++++++++++--------
 src/libserver/html/html_tag.hxx |  15 ++---
 src/libserver/html/html_tags.h  |   7 +--
 3 files changed, 120 insertions(+), 37 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 894b1ee45..8d312b733 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -76,51 +76,136 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
 INIT_LOG_MODULE(html)
 
 static auto
-html_check_balance(struct html_tag *tag,
+html_check_balance(struct html_content *hc,
+				   struct html_tag *tag,
 				   struct html_tag *parent,
 				   std::vector<html_tag *> &tags_stack,
 				   goffset tag_start_offset,
 				   goffset tag_end_offset) -> bool
 {
 
-	if (tag->flags & FL_CLOSING) {
-		/* Find the opening pair if any and check if it is correctly placed */
-		auto found_opening = std::find_if(tags_stack.rbegin(), tags_stack.rend(),
-				[&](const html_tag *t) {
-					return (t->flags & FL_CLOSED) == 0 && t->id == tag->id;
-				});
+	auto calculate_content_length = [tag_start_offset](html_tag *t) {
+		auto opening_content_offset = t->content_offset;
 
-		if (found_opening != tags_stack.rend()) {
-			auto *opening_tag = (*found_opening);
-			opening_tag->flags |= FL_CLOSED;
+		if (opening_content_offset <= tag_start_offset) {
+			t->content_length = tag_start_offset - opening_content_offset;
+		}
+		else {
+			t->content_length = 0;
+		}
+	};
 
-			/* Adjust size */
-			auto opening_content_offset = opening_tag->content_offset;
+	auto balance_tag = [&]() -> void {
+		auto it = tags_stack.rbegin();
 
-			if (opening_content_offset <= tag_start_offset) {
-				opening_tag->content_length =
-						tag_start_offset - opening_content_offset;
-			}
-			else {
-				opening_tag->content_length = 0;
+		for (auto end_it = tags_stack.rend(); it != end_it; ++it) {
+			if ((*it)->id == tag->id && !((*it)->flags & FL_CLOSING)) {
+				break;
 			}
+			/* Insert a virtual closing tag for all tags that are not closed */
+			auto &&vtag = std::make_unique<html_tag>();
+			vtag->id = (*it)->id;
+			vtag->flags = FL_VIRTUAL|FL_CLOSING;
+			vtag->tag_start = tag->tag_start;
+			vtag->content_offset = tag->content_offset;
+			vtag->content_length = 0;
+			vtag->parent = (*it)->parent;
+			calculate_content_length(*it);
+			(*it)->flags |= FL_CLOSED;
+			hc->all_tags.emplace_back(std::move(vtag));
+		}
+
+		/* Remove tags */
+		tags_stack.erase(it.base(), std::end(tags_stack));
+	};
 
-			if (found_opening == tags_stack.rbegin()) {
+	if (tag->flags & FL_CLOSING) {
+		if (!tags_stack.empty()) {
+			auto *last_tag = tags_stack.back();
+
+			if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) {
+				last_tag->flags |= FL_CLOSED;
+
+				calculate_content_length(last_tag);
 				tags_stack.pop_back();
 				/* All good */
 				return true;
 			}
 			else {
-				/* Move to front */
-				std::iter_swap(found_opening, tags_stack.rbegin());
-				tags_stack.pop_back();
+				balance_tag();
+
 				return false;
 			}
 		}
 		else {
-			/* We have unpaired tag */
-			return false;
+			/*
+			 * We have no opening tags in the stack, so we need to assume that there
+			 * is an opening tag at the beginning of the document.
+			 * There are two possibilities:
+			 *
+			 * 1) We have some block tag in hc->all_tags;
+			 * 2) We have no tags
+			 */
+
+			if (hc->all_tags.empty()) {
+				auto &&vtag = std::make_unique<html_tag>();
+				vtag->id = tag->id;
+				vtag->flags = FL_VIRTUAL|FL_CLOSED;
+				vtag->tag_start = 0;
+				vtag->content_offset = 0;
+				calculate_content_length(vtag.get());
+
+
+				if (!hc->root_tag) {
+					hc->root_tag = vtag.get();
+				}
+				else {
+					vtag->parent = hc->root_tag;
+				}
+				hc->all_tags.emplace_back(std::move(vtag));
+			}
+			else {
+				auto found_closing = std::find_if(hc->all_tags.rbegin(),
+						hc->all_tags.rend(),
+						[&](const auto &t) {
+							constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING;
+							return (t->flags & expect_flags) == (expect_flags) &&
+									t.get() != tag &&
+									t->parent != nullptr;
+						});
+
+				if (found_closing != hc->all_tags.rend()) {
+					auto *closing_tag = (*found_closing).get();
+					auto &&vtag = std::make_unique<html_tag>();
+					vtag->id = tag->id;
+					vtag->flags = FL_VIRTUAL|FL_CLOSED;
+					vtag->tag_start = closing_tag->content_offset - 1;
+					vtag->content_offset = vtag->tag_start + 1;
+					vtag->parent = closing_tag->parent;
+					vtag->content_length = tag->tag_start - vtag->content_offset;
+					hc->all_tags.emplace_back(std::move(vtag));
+				}
+				else {
+					auto &&vtag = std::make_unique<html_tag>();
+					vtag->id = tag->id;
+					vtag->flags = FL_VIRTUAL|FL_CLOSED;
+					vtag->tag_start = 0;
+					vtag->content_offset = 0;
+					calculate_content_length(vtag.get());
+
+
+					if (!hc->root_tag) {
+						hc->root_tag = vtag.get();
+					}
+					else {
+						vtag->parent = hc->root_tag;
+					}
+					hc->all_tags.emplace_back(std::move(vtag));
+				}
+			}
 		}
+
+		return false;
 	}
 
 	/* Misuse */
@@ -166,7 +251,7 @@ html_process_tag(rspamd_mempool_t *pool,
 				return false;
 			}
 
-			if (!html_check_balance(tag, parent, tags_stack,
+			if (!html_check_balance(hc, tag, parent, tags_stack,
 					tag_start_offset, tag_end_offset)) {
 				msg_debug_html (
 						"mark part as unbalanced as it has not pairable closing tags");
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 9091b9060..36110c8c7 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -43,16 +43,17 @@ enum class html_component_type : std::uint8_t {
 
 /* Public tags flags */
 /* XML tag */
-#define FL_XML          (1 << 23)
+#define FL_XML          (1 << 22)
 /* Closing tag */
-#define FL_CLOSING      (1 << 24)
+#define FL_CLOSING      (1 << 23)
 /* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED       (1 << 25)
-#define FL_BROKEN       (1 << 26)
-#define FL_IGNORE       (1 << 27)
-#define FL_BLOCK        (1 << 28)
-#define FL_HREF         (1 << 29)
+#define FL_CLOSED       (1 << 24)
+#define FL_BROKEN       (1 << 25)
+#define FL_IGNORE       (1 << 26)
+#define FL_BLOCK        (1 << 27)
+#define FL_HREF         (1 << 28)
 #define FL_COMMENT      (1 << 29)
+#define FL_VIRTUAL      (1 << 30)
 
 /**
  * Returns component type from a string
diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h
index 8f430204e..3f209c08e 100644
--- a/src/libserver/html/html_tags.h
+++ b/src/libserver/html/html_tags.h
@@ -191,13 +191,10 @@ typedef enum {
 #define CM_NO_INDENT    (1 << 18)
 /* Elements that are obsolete (such as "dir", "menu"). */
 #define CM_OBSOLETE     (1 << 19)
-/* User defined elements. Used to determine how attributes without value
-   should be printed. */
-#define CM_NEW          (1 << 20)
 /* Elements that cannot be omitted. */
-#define CM_OMITST       (1 << 21)
+#define CM_OMITST       (1 << 20)
 /* Unique elements */
-#define CM_UNIQUE       (1 << 22)
+#define CM_UNIQUE       (1 << 21)
 
 #ifdef  __cplusplus
 }


More information about the Commits mailing list