commit ee8dbc1: [Project] Html: Try another approach to append tags content
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Jun 22 16:21:09 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-22 14:53:17 +0100
URL: https://github.com/rspamd/rspamd/commit/ee8dbc1f5e0e4ac35e73a00b3eaa7e05897ad23d
[Project] Html: Try another approach to append tags content
---
src/libserver/html/html.cxx | 88 +++++++++++++++++++++++----------------------
1 file changed, 46 insertions(+), 42 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 15433cc72..e65e1b028 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -172,9 +172,6 @@ html_process_tag(rspamd_mempool_t *pool,
"mark part as unbalanced as it has not pairable closing tags");
hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
}
- else {
- parent->children.push_back(tag);
- }
}
else {
/* Opening block tag */
@@ -1073,65 +1070,65 @@ html_append_content(struct html_content *hc, std::string_view data) -> auto
static auto
html_append_tag_content(const gchar *start, gsize len,
- struct html_content *hc, const struct html_tag *tag) -> void
+ struct html_content *hc,
+ const html_tag *tag,
+ goffset next_tag_offset) -> goffset
{
- auto cur_offset = tag->content_offset;
- auto total_len = tag->content_length;
-
- if (tag->flags & FL_CLOSING) {
- return;
- }
-
- if (cur_offset > len || total_len + cur_offset > len) {
- RSPAMD_UNREACHABLE;
- }
-
if (tag->id == Tag_BR || tag->id == Tag_HR) {
if (!hc->parsed.empty()) {
hc->parsed.append("\n");
}
- return;
+
+ return tag->content_offset;
}
if (!tag->block) {
- return; /* XXX: is it always true? */
+ return next_tag_offset; /* XXX: is it always true? */
}
auto is_block = tag->block->has_display() &&
- tag->block->display == css::css_display_value::DISPLAY_BLOCK;
+ tag->block->display == css::css_display_value::DISPLAY_BLOCK;
if (is_block) {
if (!hc->parsed.empty()) {
hc->parsed.append("\n");
}
}
- for (const auto &cld_tag : tag->children) {
- if (cld_tag->tag_start > cur_offset) {
- if (tag->block->is_visible()) {
- html_append_content(hc, {start + cur_offset,
- cld_tag->tag_start - cur_offset});
+ if (tag->content_length + tag->content_offset <= next_tag_offset) {
+ if (tag->block->is_visible()) {
+ html_append_content(hc, {start + tag->content_offset,
+ tag->content_length});
+
+ if (is_block) {
+ if (!hc->parsed.empty()) {
+ hc->parsed.append("\n");
+ }
}
}
- html_append_tag_content(start, len, hc, cld_tag);
- auto old_offset = cur_offset;
- cur_offset = cld_tag->content_offset + cld_tag->content_length;
-
- if (total_len < cur_offset - old_offset) {
- /* Child tag spans over parent (e.g. wrong nesting) */
- total_len = 0;
- }
- else {
- total_len -= cur_offset - old_offset;
- }
+ return tag->content_length + tag->content_offset;
}
- if (total_len > 0 && tag->block->is_visible()) {
- html_append_content(hc, {start + cur_offset, total_len});
- }
+ return next_tag_offset;
+}
- if (is_block) {
- hc->parsed.append("\n");
+static auto
+html_append_tags_content(const gchar *start, gsize len,
+ struct html_content *hc) -> void
+{
+ auto cur_offset = 0;
+
+ for (auto i = 0; i < hc->all_tags.size(); i ++) {
+ const auto &tag = hc->all_tags[i];
+ html_tag *next_tag = nullptr;
+ auto next_offset = len;
+
+ if (i + 1 < hc->all_tags.size()) {
+ next_tag = hc->all_tags[i + 1].get();
+ next_offset = next_tag->tag_start;
+ }
+
+ cur_offset = html_append_tag_content(start, len, hc, tag.get(), next_offset);
}
}
@@ -1192,6 +1189,9 @@ html_process_input(rspamd_mempool_t *pool,
else {
/* We have no starting tag, so assume that it's content */
hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
+ hc->all_tags.emplace_back(std::make_unique<html_tag>());
+ cur_tag = hc->all_tags.back().get();
+ cur_tag->id = Tag_HTML;
state = content_before_start;
}
break;
@@ -1599,8 +1599,11 @@ html_process_input(rspamd_mempool_t *pool,
return true;
}, html_content::traverse_type::PRE_ORDER);
- if (hc->root_tag) {
- html_append_tag_content(start, end - start, hc, hc->root_tag);
+ if (!hc->all_tags.empty()) {
+ std::sort(hc->all_tags.begin(), hc->all_tags.end(), [](const auto &pt1, const auto &pt2) -> auto {
+ return pt1->tag_start < pt2->tag_start;
+ });
+ html_append_tags_content(start, end - start, hc);
}
/* Leftover */
@@ -1710,6 +1713,7 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
+ {"foo<br>baz", "foo\nbaz"},
{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
{"test", "test"},
{"test ", "test "},
@@ -1717,7 +1721,7 @@ TEST_CASE("html text extraction")
{"<p>text</p>", "text"},
{"olo<p>text</p>lolo", "olo\ntext\nlolo"},
{"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
- {"foo<br>baz", "foo\nbaz"},
+
{"<div>foo</div><div>bar</div>", "foo\nbar"},
};
More information about the Commits mailing list