commit 92f5857: [Rework] Html: Start removing of GNode stuff

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jun 7 16:35:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-07 14:24:15 +0100
URL: https://github.com/rspamd/rspamd/commit/92f5857eb807306bd7b1536d9901169bf3e5949b

[Rework] Html: Start removing of GNode stuff

---
 src/libserver/html/html.cxx     | 147 +++++++++++++++++++---------------------
 src/libserver/html/html.hxx     |   3 +-
 src/libserver/html/html_tag.hxx |   4 +-
 3 files changed, 73 insertions(+), 81 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 925fe217b..40ef240d5 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -73,44 +73,50 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
 
 INIT_LOG_MODULE(html)
 
-static gboolean
-html_check_balance(GNode *node, GNode **cur_level)
+static auto
+html_check_balance(struct html_tag *tag,
+				   struct html_tag *parent,
+				   std::vector<html_tag *> &tags_stack) -> bool
 {
-	struct html_tag *arg = (struct html_tag *)node->data, *tmp;
-	GNode *cur;
-
-	if (arg->flags & FL_CLOSING) {
-		/* First of all check whether this tag is closing tag for parent node */
-		cur = node->parent;
-		while (cur && cur->data) {
-			tmp = (struct html_tag *)cur->data;
-			if (tmp->id == arg->id &&
-				(tmp->flags & FL_CLOSED) == 0) {
-				tmp->flags |= FL_CLOSED;
-				/* Destroy current node as we find corresponding parent node */
-				g_node_destroy(node);
-				/* Change level */
-				*cur_level = cur->parent;
-				return TRUE;
-			}
-			cur = cur->parent;
+
+	if (tag->flags & FL_CLOSING) {
+		/* Find the opening pair if any and check if it is correctly placed */
+		auto found_opening = std::find_if(tags_stack.rbegin(), tags_stack.rend(),
+				[&](const html_tag *t) {
+					return (t->flags & FL_CLOSED) == 0 && t->id == tag->id;
+				});
+
+		if (found_opening != tags_stack.rend()) {
+			(*found_opening)->flags |= FL_CLOSED;
+
+			if (found_opening == tags_stack.rbegin()) {
+				tags_stack.pop_back();
+				/* All good */
+				return true;
+			}
+			else {
+				/* Move to front */
+				std::iter_swap(found_opening, tags_stack.rbegin());
+				tags_stack.pop_back();
+				return true;
+			}
+		}
+		else {
+			/* We have unpaired tag */
+			return false;
 		}
-	}
-	else {
-		return TRUE;
 	}
 
-	return FALSE;
+	/* Misuse */
+	return false;
 }
 
-static gboolean
+static auto
 html_process_tag(rspamd_mempool_t *pool,
-						struct html_content *hc,
-						struct html_tag *tag,
-						GNode **cur_level,
-						gboolean *balanced)
+				 struct html_content *hc,
+				 struct html_tag *tag,
+				 std::vector<html_tag *> &tags_stack) -> bool
 {
-	GNode *nnode;
 	struct html_tag *parent;
 
 	if (hc->total_tags > rspamd::html::max_tags) {
@@ -120,43 +126,40 @@ html_process_tag(rspamd_mempool_t *pool,
 	if (tag->id == -1) {
 		/* Ignore unknown tags */
 		hc->total_tags++;
-		return FALSE;
+		return false;
 	}
 
-	if (*cur_level == nullptr) {
-		*cur_level = hc->html_tags;
+
+	if (tags_stack.empty()) {
+		parent = hc->root_tag;
+	}
+	else {
+		parent = tags_stack.back();
 	}
 
-	tag->parent = *cur_level;
+	tag->parent = parent;
 
 	if (!(tag->flags & (CM_INLINE | CM_EMPTY))) {
 		/* Block tag */
-		if (tag->flags & (FL_CLOSING | FL_CLOSED)) {
-			if (!*cur_level) {
+		if ((tag->flags & (FL_CLOSING | FL_CLOSED))) {
+			/* Closed block tag */
+			if (parent == nullptr) {
 				msg_debug_html ("bad parent node");
-				return FALSE;
+				return false;
 			}
 
 			if (hc->total_tags < rspamd::html::max_tags) {
-				nnode = g_node_new(tag);
-				g_node_append (*cur_level, nnode);
-
-				if (!html_check_balance(nnode, cur_level)) {
+				if (!html_check_balance(tag, parent, tags_stack)) {
 					msg_debug_html (
 							"mark part as unbalanced as it has not pairable closing tags");
 					hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
-					*balanced = FALSE;
-				}
-				else {
-					*balanced = TRUE;
 				}
 
 				hc->total_tags++;
 			}
 		}
 		else {
-			parent = (struct html_tag *)(*cur_level)->data;
-
+			/* Opening block tag */
 			if (parent) {
 				if ((parent->flags & FL_IGNORE)) {
 					tag->flags |= FL_IGNORE;
@@ -168,27 +171,24 @@ html_process_tag(rspamd_mempool_t *pool,
 					if (parent->id == tag->id) {
 						/* Something like <a>bla<a>foo... */
 						hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
-						*balanced = FALSE;
 						tag->parent = parent->parent;
 
 						if (hc->total_tags < rspamd::html::max_tags) {
-							nnode = g_node_new(tag);
-							g_node_append (parent->parent, nnode);
-							*cur_level = nnode;
+							parent->children.push_back(tag);
+							tags_stack.push_back(tag);
 							hc->total_tags++;
 						}
 
-						return TRUE;
+						return true;
 					}
 				}
 			}
 
 			if (hc->total_tags < rspamd::html::max_tags) {
-				nnode = g_node_new(tag);
-				g_node_append (*cur_level, nnode);
+				parent->children.push_back(tag);
 
 				if ((tag->flags & FL_CLOSED) == 0) {
-					*cur_level = nnode;
+					tags_stack.push_back(tag);
 				}
 
 				hc->total_tags++;
@@ -197,31 +197,28 @@ html_process_tag(rspamd_mempool_t *pool,
 			if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) {
 				tag->flags |= FL_IGNORE;
 
-				return FALSE;
+				return false;
 			}
 
 		}
 	}
 	else {
 		/* Inline tag */
-		parent = (struct html_tag *)(*cur_level)->data;
-
 		if (parent) {
 			if (hc->total_tags < rspamd::html::max_tags) {
-				nnode = g_node_new(tag);
-				g_node_append (*cur_level, nnode);
+				parent->children.push_back(tag);
 
 				hc->total_tags++;
 			}
 			if ((parent->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE))) {
 				tag->flags |= FL_IGNORE;
 
-				return FALSE;
+				return false;
 			}
 		}
 	}
 
-	return TRUE;
+	return true;
 }
 
 
@@ -1647,14 +1644,13 @@ html_process_part_full (rspamd_mempool_t *pool,
 {
 	const gchar *p, *c, *end;
 	guchar t;
-	gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
-			balanced;
+	gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE;
 	guint obrace = 0, ebrace = 0;
-	GNode *cur_level = NULL;
 	struct rspamd_url *url = NULL;
 	gint len, href_offset = -1;
 	struct html_tag *cur_tag = NULL, *content_tag = NULL;
 	std::vector<html_block *> blocks_stack;
+	std::vector<html_tag *> tags_stack;
 	struct tag_content_parser_state content_parser_env;
 
 	enum {
@@ -1984,7 +1980,7 @@ html_process_part_full (rspamd_mempool_t *pool,
 			else {
 
 				if (allow_css) {
-					GError *err = NULL;
+					GError *err = nullptr;
 					hc->css_style = rspamd_css_parse_style(pool, p, end_style, hc->css_style,
 							&err);
 
@@ -2015,7 +2011,7 @@ html_process_part_full (rspamd_mempool_t *pool,
 			if (t == '>') {
 				state = tag_end;
 				/* We don't know a lot about sgml tags, ignore them */
-				cur_tag = NULL;
+				cur_tag = hc->root_tag;
 				continue;
 			}
 			p ++;
@@ -2044,11 +2040,9 @@ html_process_part_full (rspamd_mempool_t *pool,
 		case tag_end:
 			content_parser_env.reset();
 
-			if (cur_tag != NULL) {
-				balanced = TRUE;
+			if (cur_tag != nullptr) {
 
-				if (html_process_tag (pool, hc, cur_tag, &cur_level,
-						&balanced)) {
+				if (html_process_tag(pool, hc, cur_tag, tags_stack)) {
 					state = content_write;
 					need_decode = FALSE;
 				}
@@ -2153,16 +2147,13 @@ html_process_part_full (rspamd_mempool_t *pool,
 					}
 
 					if (cur_tag->id == Tag_A) {
-						if (!balanced && cur_level && cur_level->prev) {
-							struct html_tag *prev_tag;
-							struct rspamd_url *prev_url;
-
-							prev_tag = static_cast<html_tag *>(cur_level->prev->data);
+						if (tags_stack.size() >= 2) {
+							const auto *prev_tag = tags_stack.back()->parent;
 
-							if (prev_tag->id == Tag_A &&
+							if (prev_tag && prev_tag->id == Tag_A &&
 								!(prev_tag->flags & (FL_CLOSING)) &&
 								std::holds_alternative<rspamd_url *>(prev_tag->extra)) {
-								prev_url = std::get<rspamd_url *>(prev_tag->extra);
+								auto *prev_url = std::get<rspamd_url *>(prev_tag->extra);
 
 								std::string_view disp_part{
 										hc->parsed.data() + href_offset,
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index c3f3830ec..c3b65a06f 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -31,7 +31,7 @@ namespace rspamd::html {
 
 struct html_content {
 	struct rspamd_url *base_url = nullptr;
-	GNode *html_tags;
+	struct html_tag *root_tag = nullptr;
 	gint flags = 0;
 	guint total_tags = 0;
 	struct html_color bgcolor;
@@ -44,7 +44,6 @@ struct html_content {
 
 	/* Preallocate and reserve all internal structures */
 	html_content() {
-		html_tags =  g_node_new(nullptr);
 		tags_seen.resize(N_TAGS, false);
 		blocks.reserve(128);
 		all_tags.reserve(128);
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 645c3433e..d75cfcf6d 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -21,6 +21,7 @@
 #include <utility>
 #include <string_view>
 #include <variant>
+#include <vector>
 #include <contrib/robin-hood/robin_hood.h>
 
 namespace rspamd::html {
@@ -52,7 +53,8 @@ struct html_tag {
 
 	html_tag_extra_t extra;
 	struct html_block *block = nullptr; /* TODO: temporary, must be handled by css */
-	GNode *parent = nullptr;
+	std::vector<struct html_tag *> children;
+	struct html_tag *parent;
 };
 
 }


More information about the Commits mailing list