commit 3fd7466: [Minor] Html: Fix unbalanced tags hanging issue

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jul 6 20:21:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-06 21:16:24 +0100
URL: https://github.com/rspamd/rspamd/commit/3fd746659d5d7b3dd6eaaa63192245965ea9d78e (HEAD -> master)

[Minor] Html: Fix unbalanced tags hanging issue

---
 src/libserver/html/html.cxx | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index a026326e8..eea957e16 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -193,7 +193,9 @@ html_check_balance(struct html_content *hc,
 			}
 
 			hc->all_tags.emplace_back(std::move(vtag));
+			msg_err("hu6");
 			tag->parent = vtag.get();
+			g_assert(tag->parent != tag);
 
 			/* Recursively call with a virtual <html> tag inserted */
 			return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
@@ -1306,11 +1308,13 @@ html_process_input(rspamd_mempool_t *pool,
 
 		if (parent_tag) {
 			cur_tag->parent = parent_tag;
+			g_assert(cur_tag->parent != cur_tag);
 			parent_tag->children.push_back(cur_tag);
 		}
 		else {
 			if (hc->root_tag) {
 				cur_tag->parent = hc->root_tag;
+				g_assert(cur_tag->parent != cur_tag);
 				hc->root_tag->children.push_back(cur_tag);
 				parent_tag = hc->root_tag;
 			}
@@ -1331,6 +1335,7 @@ html_process_input(rspamd_mempool_t *pool,
 					top_tag->content_offset = 0;
 					top_tag->children.push_back(cur_tag);
 					cur_tag->parent = top_tag;
+					g_assert(cur_tag->parent != cur_tag);
 					hc->root_tag = top_tag;
 					parent_tag = top_tag;
 				}
@@ -1473,7 +1478,44 @@ html_process_input(rspamd_mempool_t *pool,
 				closing = TRUE;
 				/* We fill fake closing tag to fill it with the content parser */
 				cur_closing_tag.clear();
-				cur_closing_tag.parent = cur_tag; /* For simplicity */
+				/*
+				 * For closing tags, we need to find some corresponding opening tag.
+				 * However, at this point we have not even parsed a name, so we
+				 * cannot assume anything about balancing, etc.
+				 *
+				 * So we need to ensure that:
+				 * 1) We have some opening tag in the chain cur_tag->parent...
+				 * 2) cur_tag is nullptr - okay, html is just brain damaged
+				 * 3) cur_tag must NOT be equal to cur_closing_tag. It means that
+				 * we had some poor closing tag but we still need to find an opening
+				 * tag... Somewhere...
+				 */
+
+				if (cur_tag == &cur_closing_tag) {
+					if (parent_tag != &cur_closing_tag) {
+						cur_closing_tag.parent = parent_tag;
+					}
+					else {
+						cur_closing_tag.parent = nullptr;
+					}
+				}
+				else if (cur_tag->flags & FL_CLOSED) {
+					/* Cur tag is already closed, we should find something else */
+					auto *tmp = cur_tag;
+					while (tmp) {
+						tmp = tmp->parent;
+
+						if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
+							break;
+						}
+					}
+
+					cur_closing_tag.parent = tmp;
+				}
+				else {
+					cur_closing_tag.parent = cur_tag;
+				}
+
 				cur_tag = &cur_closing_tag;
 				p ++;
 				break;
@@ -1765,6 +1807,7 @@ html_process_input(rspamd_mempool_t *pool,
 					cur_opening_tag->children.push_back(vtag.get());
 					hc->all_tags.emplace_back(std::move(vtag));
 					cur_tag = cur_opening_tag;
+					parent_tag = cur_tag->parent;
 				}
 			} /* if cur_tag != nullptr */
 			state = html_text_content;


More information about the Commits mailing list