commit ce88665: [Rework] Html: Further rework of the tags content extraction
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Jun 22 16:21:07 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-22 00:21:24 +0100
URL: https://github.com/rspamd/rspamd/commit/ce88665b0cd344fff4b2924448499b5a4ebe4f94
[Rework] Html: Further rework of the tags content extraction
---
src/libserver/html/html.cxx | 123 ++++++++++++++++++------------------
src/libserver/html/html_block.hxx | 6 +-
src/libstat/tokenizers/tokenizers.c | 8 ---
src/libutil/util.h | 1 -
4 files changed, 66 insertions(+), 72 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 925735f41..f82bd0359 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1069,6 +1069,54 @@ html_append_content(struct html_content *hc, std::string_view data) -> auto
return nlen;
}
+static auto
+html_append_tag_content(const gchar *start, gsize len,
+ struct html_content *hc, const struct html_tag *tag) -> void
+{ /*
+ * Appends the text belonging to `tag` to hc->parsed and recurses into
+ * tag->children in order.
+ *
+ * start - base pointer that tag offsets (content_offset, tag_start) are
+ *         added to when slicing text
+ * len   - upper bound used to sanity-check the tag's content range
+ * hc    - output holder; text goes through html_append_content() and
+ *         newlines are appended directly to hc->parsed
+ * tag   - tag to emit; its children are handled by recursive calls
+ */
+ auto cur_offset = tag->content_offset; /* position of the next not-yet-emitted text */
+ auto total_len = tag->content_length; /* content length still left to account for */
+
+ if (cur_offset > len || total_len + cur_offset > len) { /* content range must lie within `len` */
+ RSPAMD_UNREACHABLE; /* an out-of-range tag is treated as impossible here */
+ }
+
+ if (tag->id == Tag_BR || tag->id == Tag_HR) { /* BR/HR emit a single newline and nothing else */
+ hc->parsed.append("\n");
+ return; /* children, if any, are not visited */
+ }
+
+ if (!tag->block) { /* no block attached: nothing is emitted and children are not visited */
+ return; /* XXX: is it always true? */
+ }
+
+ if (tag->block->has_display() && tag->block->display == css::css_display_value::DISPLAY_BLOCK) { /* block-level tag starts on a new line */
+ hc->parsed.append("\n");
+ }
+
+ for (const auto &cld_tag : tag->children) {
+ if (cld_tag->tag_start > cur_offset) { /* there is text between the current position and the child's tag */
+ if (tag->block->is_visible()) { /* the gap text is governed by this tag's visibility */
+ html_append_content(hc, {start + cur_offset,
+ cld_tag->tag_start - cur_offset});
+ }
+ }
+ html_append_tag_content(start, len, hc, cld_tag); /* the child applies its own block checks */
+ auto old_offset = cur_offset;
+ cur_offset = cld_tag->content_offset + cld_tag->content_length; /* continue right after the child's content */
+
+ if (total_len < cur_offset - old_offset) {
+ /*
+ * Child tag spans over parent (e.g. wrong nesting): stop walking the
+ * children and emit no trailing text for this tag.
+ * NOTE(review): if these offsets are unsigned and the child ends before
+ * old_offset, the subtraction would wrap and also land here - confirm types.
+ */
+ total_len = 0;
+ break;
+ }
+ total_len -= cur_offset - old_offset; /* consume the distance just advanced */
+ }
+
+ if (total_len > 0 && tag->block->is_visible()) { /* remaining text after the last child (or all of it if no children) */
+ html_append_content(hc, {start + cur_offset, total_len});
+ }
+}
+
static auto
html_process_input(rspamd_mempool_t *pool,
GByteArray *in,
@@ -1490,17 +1538,8 @@ html_process_input(rspamd_mempool_t *pool,
}
}
- /* Summarize content length from children */
- hc->traverse_block_tags([](const html_tag *tag) -> bool {
-
- for (const auto *cld_tag : tag->children) {
- tag->content_length += cld_tag->content_length;
- }
- return true;
- }, html_content::traverse_type::POST_ORDER);
-
/* Propagate styles */
- hc->traverse_block_tags([&hc, &exceptions,&pool](const html_tag *tag) -> bool {
+ hc->traverse_block_tags([&hc](const html_tag *tag) -> bool {
if (hc->css_style) {
auto *css_block = hc->css_style->check_tag_block(tag);
@@ -1514,62 +1553,18 @@ html_process_input(rspamd_mempool_t *pool,
}
}
if (tag->block) {
- tag->block->compute_visibility();
-
- if (exceptions) {
- if (!tag->block->is_visible()) {
- if (tag->parent == nullptr || (tag->parent->block && tag->parent->block->is_visible())) {
- /* Add exception for an invisible element */
- auto * ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
- ex->pos = tag->content_offset;
- ex->len = tag->content_length;
- ex->type = RSPAMD_EXCEPTION_INVISIBLE;
- ex->ptr = (void *)tag;
-
- *exceptions = g_list_prepend(*exceptions, ex);
- }
+ if (!tag->block->has_display()) {
+ /* If we have no display field, we can check it by tag */
+ if (tag->flags & CM_BLOCK) {
+ tag->block->set_display(css::css_display_value::DISPLAY_BLOCK);
}
- else if (*exceptions && tag->parent) {
- /* Current block is visible, check if parent is invisible */
- auto *ex = (struct rspamd_process_exception*)g_list_first(*exceptions)->data;
-
- /*
- * TODO: we need to handle the following cases:
- * <inv><vis><inv> -< insert one more exception
- * <vis><inv> -< increase content_offset decrease length
- * <inv><vis> -< decrease length
- */
- if (ex && ex->type == RSPAMD_EXCEPTION_INVISIBLE &&
- ex->ptr == (void *)tag->parent) {
- auto *parent = tag->parent;
-
- if (tag->content_offset + tag->content_length ==
- parent->content_offset + parent->content_length) {
- /* <inv><vis> */
- ex->len -= tag->content_length;
- }
- else if (tag->content_offset == parent->content_offset) {
- /* <vis><inv> */
- ex->len -= tag->content_length;
- ex->pos += tag->content_length;
- }
- else if (tag->content_offset > ex->pos) {
- auto *nex = rspamd_mempool_alloc_type (pool,
- struct rspamd_process_exception);
- auto start_len = tag->content_offset - ex->pos;
- auto end_len = ex->len - tag->content_length - tag->content_length;
- nex->pos = tag->content_offset + tag->content_length;
- nex->len = end_len;
- nex->type = RSPAMD_EXCEPTION_INVISIBLE;
- nex->ptr = (void *)parent; /* ! */
- ex->len = start_len;
- *exceptions = g_list_prepend(*exceptions, ex);
- }
-
- }
+ else {
+ tag->block->set_display(css::css_display_value::DISPLAY_INLINE);
}
}
+ tag->block->compute_visibility();
+
for (const auto *cld_tag : tag->children) {
if (cld_tag->block) {
cld_tag->block->propagate_block(*tag->block);
@@ -1582,6 +1577,10 @@ html_process_input(rspamd_mempool_t *pool,
return true;
}, html_content::traverse_type::PRE_ORDER);
+ if (hc->root_tag) {
+ html_append_tag_content(start, end - start, hc, hc->root_tag);
+ }
+
/* Leftover */
switch (state) {
case html_text_content:
diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx
index 51f3dbb9d..f2bbf1d64 100644
--- a/src/libserver/html/html_block.hxx
+++ b/src/libserver/html/html_block.hxx
@@ -219,6 +219,10 @@ struct html_block {
return (mask & transparent_flag) != 0;
}
+ constexpr auto has_display(void) const -> bool { /* true when any display_mask bit is set in `mask` */
+ return (mask & display_mask) != 0;
+ }
+
/**
* Returns a default html block for root HTML element
* @return
@@ -227,7 +231,7 @@ struct html_block {
return html_block{rspamd::css::css_color::black(),
rspamd::css::css_color::white(),
0, 0,
- (fg_color_mask|bg_color_mask|display_mask|font_size_mask),
+ (fg_color_mask|bg_color_mask|font_size_mask),
rspamd::css::css_display_value::DISPLAY_INLINE,
12};
}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 2dd4a6f5a..b4f8ac75c 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -275,14 +275,6 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
g_array_append_val (res, token);
token.flags = 0;
}
- else if (ex->type == RSPAMD_EXCEPTION_INVISIBLE) {
- token.original.begin = "!!INV!!";
- token.original.len = sizeof ("!!INV!!") - 1;
- token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-
- g_array_append_val (res, token);
- token.flags = 0;
- }
}
diff --git a/src/libutil/util.h b/src/libutil/util.h
index 9ee8a09ae..d993fcbdf 100644
--- a/src/libutil/util.h
+++ b/src/libutil/util.h
@@ -25,7 +25,6 @@ enum rspamd_exception_type {
RSPAMD_EXCEPTION_NEWLINE = 0,
RSPAMD_EXCEPTION_URL,
RSPAMD_EXCEPTION_GENERIC,
- RSPAMD_EXCEPTION_INVISIBLE,
};
/**
* Structure to point exception in text from processing
More information about the Commits
mailing list