commit 92f5857: [Rework] Html: Start removing of GNode stuff
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Jun 7 16:35:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-07 14:24:15 +0100
URL: https://github.com/rspamd/rspamd/commit/92f5857eb807306bd7b1536d9901169bf3e5949b
[Rework] Html: Start removing of GNode stuff
---
src/libserver/html/html.cxx | 147 +++++++++++++++++++---------------------
src/libserver/html/html.hxx | 3 +-
src/libserver/html/html_tag.hxx | 4 +-
3 files changed, 73 insertions(+), 81 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 925fe217b..40ef240d5 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -73,44 +73,50 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
INIT_LOG_MODULE(html)
-static gboolean
-html_check_balance(GNode *node, GNode **cur_level)
+static auto
+html_check_balance(struct html_tag *tag,
+ struct html_tag *parent,
+ std::vector<html_tag *> &tags_stack) -> bool
{
- struct html_tag *arg = (struct html_tag *)node->data, *tmp;
- GNode *cur;
-
- if (arg->flags & FL_CLOSING) {
- /* First of all check whether this tag is closing tag for parent node */
- cur = node->parent;
- while (cur && cur->data) {
- tmp = (struct html_tag *)cur->data;
- if (tmp->id == arg->id &&
- (tmp->flags & FL_CLOSED) == 0) {
- tmp->flags |= FL_CLOSED;
- /* Destroy current node as we find corresponding parent node */
- g_node_destroy(node);
- /* Change level */
- *cur_level = cur->parent;
- return TRUE;
- }
- cur = cur->parent;
+
+ if (tag->flags & FL_CLOSING) {
+ /* Find the opening pair if any and check if it is correctly placed */
+ auto found_opening = std::find_if(tags_stack.rbegin(), tags_stack.rend(),
+ [&](const html_tag *t) {
+ return (t->flags & FL_CLOSED) == 0 && t->id == tag->id;
+ });
+
+ if (found_opening != tags_stack.rend()) {
+ (*found_opening)->flags |= FL_CLOSED;
+
+ if (found_opening == tags_stack.rbegin()) {
+ tags_stack.pop_back();
+ /* All good */
+ return true;
+ }
+ else {
+ /* Move to front */
+ std::iter_swap(found_opening, tags_stack.rbegin());
+ tags_stack.pop_back();
+ return true;
+ }
+ }
+ else {
+ /* We have unpaired tag */
+ return false;
}
- }
- else {
- return TRUE;
}
- return FALSE;
+ /* Misuse */
+ return false;
}
-static gboolean
+static auto
html_process_tag(rspamd_mempool_t *pool,
- struct html_content *hc,
- struct html_tag *tag,
- GNode **cur_level,
- gboolean *balanced)
+ struct html_content *hc,
+ struct html_tag *tag,
+ std::vector<html_tag *> &tags_stack) -> bool
{
- GNode *nnode;
struct html_tag *parent;
if (hc->total_tags > rspamd::html::max_tags) {
@@ -120,43 +126,40 @@ html_process_tag(rspamd_mempool_t *pool,
if (tag->id == -1) {
/* Ignore unknown tags */
hc->total_tags++;
- return FALSE;
+ return false;
}
- if (*cur_level == nullptr) {
- *cur_level = hc->html_tags;
+
+ if (tags_stack.empty()) {
+ parent = hc->root_tag;
+ }
+ else {
+ parent = tags_stack.back();
}
- tag->parent = *cur_level;
+ tag->parent = parent;
if (!(tag->flags & (CM_INLINE | CM_EMPTY))) {
/* Block tag */
- if (tag->flags & (FL_CLOSING | FL_CLOSED)) {
- if (!*cur_level) {
+ if ((tag->flags & (FL_CLOSING | FL_CLOSED))) {
+ /* Closed block tag */
+ if (parent == nullptr) {
msg_debug_html ("bad parent node");
- return FALSE;
+ return false;
}
if (hc->total_tags < rspamd::html::max_tags) {
- nnode = g_node_new(tag);
- g_node_append (*cur_level, nnode);
-
- if (!html_check_balance(nnode, cur_level)) {
+ if (!html_check_balance(tag, parent, tags_stack)) {
msg_debug_html (
"mark part as unbalanced as it has not pairable closing tags");
hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- *balanced = FALSE;
- }
- else {
- *balanced = TRUE;
}
hc->total_tags++;
}
}
else {
- parent = (struct html_tag *)(*cur_level)->data;
-
+ /* Opening block tag */
if (parent) {
if ((parent->flags & FL_IGNORE)) {
tag->flags |= FL_IGNORE;
@@ -168,27 +171,24 @@ html_process_tag(rspamd_mempool_t *pool,
if (parent->id == tag->id) {
/* Something like <a>bla<a>foo... */
hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- *balanced = FALSE;
tag->parent = parent->parent;
if (hc->total_tags < rspamd::html::max_tags) {
- nnode = g_node_new(tag);
- g_node_append (parent->parent, nnode);
- *cur_level = nnode;
+ parent->children.push_back(tag);
+ tags_stack.push_back(tag);
hc->total_tags++;
}
- return TRUE;
+ return true;
}
}
}
if (hc->total_tags < rspamd::html::max_tags) {
- nnode = g_node_new(tag);
- g_node_append (*cur_level, nnode);
+ parent->children.push_back(tag);
if ((tag->flags & FL_CLOSED) == 0) {
- *cur_level = nnode;
+ tags_stack.push_back(tag);
}
hc->total_tags++;
@@ -197,31 +197,28 @@ html_process_tag(rspamd_mempool_t *pool,
if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) {
tag->flags |= FL_IGNORE;
- return FALSE;
+ return false;
}
}
}
else {
/* Inline tag */
- parent = (struct html_tag *)(*cur_level)->data;
-
if (parent) {
if (hc->total_tags < rspamd::html::max_tags) {
- nnode = g_node_new(tag);
- g_node_append (*cur_level, nnode);
+ parent->children.push_back(tag);
hc->total_tags++;
}
if ((parent->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE))) {
tag->flags |= FL_IGNORE;
- return FALSE;
+ return false;
}
}
}
- return TRUE;
+ return true;
}
@@ -1647,14 +1644,13 @@ html_process_part_full (rspamd_mempool_t *pool,
{
const gchar *p, *c, *end;
guchar t;
- gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
- balanced;
+ gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE;
guint obrace = 0, ebrace = 0;
- GNode *cur_level = NULL;
struct rspamd_url *url = NULL;
gint len, href_offset = -1;
struct html_tag *cur_tag = NULL, *content_tag = NULL;
std::vector<html_block *> blocks_stack;
+ std::vector<html_tag *> tags_stack;
struct tag_content_parser_state content_parser_env;
enum {
@@ -1984,7 +1980,7 @@ html_process_part_full (rspamd_mempool_t *pool,
else {
if (allow_css) {
- GError *err = NULL;
+ GError *err = nullptr;
hc->css_style = rspamd_css_parse_style(pool, p, end_style, hc->css_style,
&err);
@@ -2015,7 +2011,7 @@ html_process_part_full (rspamd_mempool_t *pool,
if (t == '>') {
state = tag_end;
/* We don't know a lot about sgml tags, ignore them */
- cur_tag = NULL;
+ cur_tag = hc->root_tag;
continue;
}
p ++;
@@ -2044,11 +2040,9 @@ html_process_part_full (rspamd_mempool_t *pool,
case tag_end:
content_parser_env.reset();
- if (cur_tag != NULL) {
- balanced = TRUE;
+ if (cur_tag != nullptr) {
- if (html_process_tag (pool, hc, cur_tag, &cur_level,
- &balanced)) {
+ if (html_process_tag(pool, hc, cur_tag, tags_stack)) {
state = content_write;
need_decode = FALSE;
}
@@ -2153,16 +2147,13 @@ html_process_part_full (rspamd_mempool_t *pool,
}
if (cur_tag->id == Tag_A) {
- if (!balanced && cur_level && cur_level->prev) {
- struct html_tag *prev_tag;
- struct rspamd_url *prev_url;
-
- prev_tag = static_cast<html_tag *>(cur_level->prev->data);
+ if (tags_stack.size() >= 2) {
+ const auto *prev_tag = tags_stack.back()->parent;
- if (prev_tag->id == Tag_A &&
+ if (prev_tag && prev_tag->id == Tag_A &&
!(prev_tag->flags & (FL_CLOSING)) &&
std::holds_alternative<rspamd_url *>(prev_tag->extra)) {
- prev_url = std::get<rspamd_url *>(prev_tag->extra);
+ auto *prev_url = std::get<rspamd_url *>(prev_tag->extra);
std::string_view disp_part{
hc->parsed.data() + href_offset,
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index c3f3830ec..c3b65a06f 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -31,7 +31,7 @@ namespace rspamd::html {
struct html_content {
struct rspamd_url *base_url = nullptr;
- GNode *html_tags;
+ struct html_tag *root_tag = nullptr;
gint flags = 0;
guint total_tags = 0;
struct html_color bgcolor;
@@ -44,7 +44,6 @@ struct html_content {
/* Preallocate and reserve all internal structures */
html_content() {
- html_tags = g_node_new(nullptr);
tags_seen.resize(N_TAGS, false);
blocks.reserve(128);
all_tags.reserve(128);
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 645c3433e..d75cfcf6d 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -21,6 +21,7 @@
#include <utility>
#include <string_view>
#include <variant>
+#include <vector>
#include <contrib/robin-hood/robin_hood.h>
namespace rspamd::html {
@@ -52,7 +53,8 @@ struct html_tag {
html_tag_extra_t extra;
struct html_block *block = nullptr; /* TODO: temporary, must be handled by css */
- GNode *parent = nullptr;
+ std::vector<struct html_tag *> children;
+ struct html_tag *parent;
};
}
More information about the Commits
mailing list