commit f13b2d0: [Project] Html: One more attempt to write text content
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Jun 22 16:21:10 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-22 16:10:52 +0100
URL: https://github.com/rspamd/rspamd/commit/f13b2d042d2e42bf2216632f94495aa2244b91d5
[Project] Html: One more attempt to write text content
---
src/libserver/html/html.cxx | 116 +++++++++++++++++++++++++++++++++++---------
1 file changed, 92 insertions(+), 24 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e65e1b028..37d464c1d 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1072,8 +1072,11 @@ static auto
html_append_tag_content(const gchar *start, gsize len,
struct html_content *hc,
const html_tag *tag,
- goffset next_tag_offset) -> goffset
+ std::vector<const html_tag *> &enclosed_tags) -> goffset
{
+ auto is_visible = true, is_block = false;
+ goffset next_tag_offset = tag->content_length + tag->content_offset;
+
if (tag->id == Tag_BR || tag->id == Tag_HR) {
if (!hc->parsed.empty()) {
hc->parsed.append("\n");
@@ -1083,30 +1086,83 @@ html_append_tag_content(const gchar *start, gsize len,
}
if (!tag->block) {
- return next_tag_offset; /* XXX: is it always true? */
+ is_visible = false;
+ }
+ else if (!tag->block->is_visible()) {
+ is_visible = false;
+ }
+ else {
+ is_block = tag->block->has_display() &&
+ tag->block->display == css::css_display_value::DISPLAY_BLOCK;
}
- auto is_block = tag->block->has_display() &&
- tag->block->display == css::css_display_value::DISPLAY_BLOCK;
if (is_block) {
- if (!hc->parsed.empty()) {
+ if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
hc->parsed.append("\n");
}
}
- if (tag->content_length + tag->content_offset <= next_tag_offset) {
- if (tag->block->is_visible()) {
- html_append_content(hc, {start + tag->content_offset,
- tag->content_length});
+ goffset cur_offset = tag->content_offset;
- if (is_block) {
- if (!hc->parsed.empty()) {
- hc->parsed.append("\n");
- }
+ do {
+ auto enclosed_end = 0, enclosed_start = 0;
+ decltype(tag) next_enclosed = nullptr;
+
+ if (!enclosed_tags.empty()) {
+ next_enclosed = enclosed_tags.back();
+ enclosed_start = next_enclosed->tag_start;
+ enclosed_end = next_enclosed->content_length +
+ next_enclosed->content_offset;
+
+ if (enclosed_end > next_tag_offset) {
+ next_tag_offset = enclosed_end;
+ }
+ enclosed_tags.pop_back();
+ }
+ else {
+ enclosed_start = next_tag_offset;
+ }
+
+ goffset initial_part_len = enclosed_start - cur_offset;
+
+ if (is_visible && initial_part_len > 0) {
+ html_append_content(hc, {start + cur_offset,
+ std::size_t(initial_part_len)});
+ }
+
+ /* Deal with the remaining part */
+ std::decay_t<decltype(enclosed_tags)> nested_stack;
+
+ while (!enclosed_tags.empty() && enclosed_end > 0) {
+ const auto *last_tag = enclosed_tags.back();
+
+ if (last_tag->tag_start <= enclosed_end) {
+ nested_stack.push_back(last_tag);
+ enclosed_tags.pop_back();
+ }
+ else {
+ break;
+ }
+ }
+
+ if (!nested_stack.empty() && next_enclosed) {
+ /* Recursively print enclosed tags */
+ std::reverse(std::begin(nested_stack), std::end(nested_stack));
+ cur_offset = html_append_tag_content(start, len, hc, next_enclosed, nested_stack);
+
+ initial_part_len = next_tag_offset - cur_offset;
+ if (is_visible && initial_part_len > 0) {
+ html_append_content(hc, {start + cur_offset,
+ std::size_t(initial_part_len)});
}
}
- return tag->content_length + tag->content_offset;
+ } while (!enclosed_tags.empty());
+
+ if (is_block && is_visible) {
+ if (!hc->parsed.empty()) {
+ hc->parsed.append("\n");
+ }
}
return next_tag_offset;
@@ -1117,18 +1173,30 @@ html_append_tags_content(const gchar *start, gsize len,
struct html_content *hc) -> void
{
auto cur_offset = 0;
+ std::vector<const html_tag *> enclosed_tags_stack;
- for (auto i = 0; i < hc->all_tags.size(); i ++) {
+ for (auto i = 0; i < hc->all_tags.size();) {
const auto &tag = hc->all_tags[i];
html_tag *next_tag = nullptr;
- auto next_offset = len;
+ auto next_offset = tag->content_offset + tag->content_length;
- if (i + 1 < hc->all_tags.size()) {
- next_tag = hc->all_tags[i + 1].get();
- next_offset = next_tag->tag_start;
+ auto j = i + 1;
+ while (j < hc->all_tags.size()) {
+ next_tag = hc->all_tags[j].get();
+
+ if (next_tag->content_offset <= next_offset) {
+ enclosed_tags_stack.push_back(next_tag);
+ j ++;
+ }
+ else {
+ break;
+ }
}
- cur_offset = html_append_tag_content(start, len, hc, tag.get(), next_offset);
+ std::reverse(enclosed_tags_stack.begin(), enclosed_tags_stack.end());
+ cur_offset = html_append_tag_content(start, len, hc, tag.get(),
+ enclosed_tags_stack);
+ i = j;
}
}
@@ -1713,16 +1781,16 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
- {"foo<br>baz", "foo\nbaz"},
+ {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
{"test", "test"},
{"test ", "test "},
{"test foo, bar", "test foo, bar"},
- {"<p>text</p>", "text"},
+ {"<p>text</p>", "text\n"},
{"olo<p>text</p>lolo", "olo\ntext\nlolo"},
- {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
- {"<div>foo</div><div>bar</div>", "foo\nbar"},
+ {"foo<br>baz", "foo\nbaz"},
+ {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
};
rspamd_url_init(NULL);
More information about the Commits
mailing list