commit 9f50cd4: [Project] Html: Another try to fix unbalanced cases

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Jul 1 12:35:05 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-01 13:28:41 +0100
URL: https://github.com/rspamd/rspamd/commit/9f50cd402484da770d91f3d70f1ac1a755bdf2d1 (HEAD -> master)

[Project] Html: Another try to fix unbalanced cases

---
 src/libserver/html/html.cxx | 89 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 14 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index bdd489299..d5b58a7d4 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -94,7 +94,7 @@ html_check_balance(struct html_content *hc,
 		if (t->flags & (CM_EMPTY)) {
 			/* Attach closing tag just at the opening tag */
 			t->closing.start = t->tag_start;
-			t->closing.end = t->content_offset - 1;
+			t->closing.end = t->content_offset;
 		}
 		else {
 
@@ -112,18 +112,47 @@ html_check_balance(struct html_content *hc,
 
 	auto balance_tag = [&]() -> html_tag * {
 		auto it = tag->parent;
+		auto found_pair = false;
 
 		for (; it != nullptr; it = it->parent) {
 			if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
+				found_pair = true;
 				break;
 			}
-			/* Insert a virtual closing tag for all tags that are not closed */
-			calculate_content_length(it);
-			it->flags |= FL_CLOSED;
+
+		}
+
+		/*
+		 * If we have found a closing pair, then we need to close all tags and
+		 * return the top-most tag
+		 */
+		if (found_pair) {
+			for (it = tag->parent; it != nullptr; it = it->parent) {
+				it->flags |= FL_CLOSED;
+				/* Insert a virtual closing tag for all tags that are not closed */
+				calculate_content_length(it);
+				if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
+					break;
+				}
+			}
+
+			return it;
+		}
+		else {
+			/*
+			 * We have not found a pair, so this closing tag is bogus and should
+			 * be ignored completely.
+			 * Unfortunately, it also means that we need to insert another tag,
+			 * as the current closing tag is unusable for that purposes.
+			 *
+			 * We assume that callee will recognise that and reconstruct the
+			 * tag at the tag_end_closing state, so we return nullptr...
+			 */
+
 		}
 
-		/* Remove tags */
-		return it;
+		/* Tag must be ignored and reconstructed */
+		return nullptr;
 	};
 
 	if (opening_tag) {
@@ -156,7 +185,6 @@ html_check_balance(struct html_content *hc,
 			vtag->content_offset = 0;
 			calculate_content_length(vtag.get());
 
-
 			if (!hc->root_tag) {
 				hc->root_tag = vtag.get();
 			}
@@ -164,6 +192,8 @@ html_check_balance(struct html_content *hc,
 				vtag->parent = hc->root_tag;
 			}
 			hc->all_tags.emplace_back(std::move(vtag));
+
+			return vtag.get();
 		}
 	}
 
@@ -1012,7 +1042,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 						khash_t (rspamd_url_hash) *url_set) -> goffset
 {
 	auto is_visible = true, is_block = false;
-	goffset next_tag_offset = tag->closing.end + 1,
+	goffset next_tag_offset = tag->closing.end,
 			initial_dest_offset = hc->parsed.size();
 
 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
@@ -1021,7 +1051,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		return tag->content_offset;
 	}
 	else if (tag->id == Tag_HEAD || tag->id >= N_TAGS) {
-		return tag->closing.end + 1;
+		return tag->closing.end;
 	}
 
 	if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE|CM_HEAD))) {
@@ -1597,14 +1627,45 @@ html_process_input(rspamd_mempool_t *pool,
 			p++;
 			c = p;
 			break;
-		case tag_end_closing:
-			/* cur_tag here is a closing tag */
-			cur_tag = html_check_balance(hc, cur_tag,
-					c - start, p - start);
+		case tag_end_closing: {
+			if (cur_tag) {
+				/* cur_tag here is a closing tag */
+				auto *next_cur_tag = html_check_balance(hc, cur_tag,
+						c - start, p - start + 1);
+
+				if (next_cur_tag != nullptr) {
+					cur_tag = next_cur_tag;
+				}
+				else {
+					/*
+					 * Here, we handle cases like <p>lala</b>...
+					 * So the tag </b> is bogus and unpaired
+					 * However, we need to exclude it from the output of <p> tag
+					 * To do that, we create a fake opening tag and insert that to
+					 * the current opening tag
+					 */
+					auto *cur_opening_tag = cur_tag->parent;
+
+					if (!cur_opening_tag) {
+						cur_opening_tag = hc->root_tag;
+					}
+					auto &&vtag = std::make_unique<html_tag>();
+					vtag->id = cur_tag->id;
+					vtag->flags = FL_VIRTUAL | FL_CLOSED | FL_IGNORE;
+					vtag->tag_start = cur_tag->closing.start;
+					vtag->content_offset = p - start;
+					vtag->closing = cur_tag->closing;
+					vtag->parent = cur_opening_tag;
+					cur_opening_tag->children.push_back(vtag.get());
+					hc->all_tags.emplace_back(std::move(vtag));
+					cur_tag = cur_opening_tag;
+				}
+			} /* if cur_tag != nullptr */
 			state = html_text_content;
-			p ++;
+			p++;
 			c = p;
 			break;
+		}
 		case tags_limit_overflow:
 			msg_warn_pool("tags limit of %d tags is reached at the position %d;"
 				 " ignoring the rest of the HTML content",


More information about the Commits mailing list