commit bb99532: [Project] Html: Insert closing tags as well :(
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Jun 22 16:21:08 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-22 11:10:54 +0100
URL: https://github.com/rspamd/rspamd/commit/bb995323cf4d2b0897c84066cd39d1b8c7f343a5
[Project] Html: Insert closing tags as well :(
---
src/libserver/html/html.cxx | 57 ++++++++++++++++++++++++++++-------------
src/libserver/html/html_tag.hxx | 4 +--
2 files changed, 41 insertions(+), 20 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index f82bd0359..15433cc72 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -79,7 +79,8 @@ static auto
html_check_balance(struct html_tag *tag,
struct html_tag *parent,
std::vector<html_tag *> &tags_stack,
- goffset tag_start_offset) -> bool
+ goffset tag_start_offset,
+ goffset tag_end_offset) -> bool
{
if (tag->flags & FL_CLOSING) {
@@ -131,7 +132,8 @@ html_process_tag(rspamd_mempool_t *pool,
struct html_content *hc,
struct html_tag *tag,
std::vector<html_tag *> &tags_stack,
- goffset tag_start_offset) -> bool
+ goffset tag_start_offset,
+ goffset tag_end_offset) -> bool
{
struct html_tag *parent;
@@ -164,14 +166,14 @@ html_process_tag(rspamd_mempool_t *pool,
return false;
}
- if (hc->total_tags < rspamd::html::max_tags) {
- if (!html_check_balance(tag, parent, tags_stack, tag_start_offset)) {
- msg_debug_html (
- "mark part as unbalanced as it has not pairable closing tags");
- hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
- }
-
- hc->total_tags++;
+ if (!html_check_balance(tag, parent, tags_stack,
+ tag_start_offset, tag_end_offset)) {
+ msg_debug_html (
+ "mark part as unbalanced as it has not pairable closing tags");
+ hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
+ }
+ else {
+ parent->children.push_back(tag);
}
}
else {
@@ -1076,12 +1078,18 @@ html_append_tag_content(const gchar *start, gsize len,
auto cur_offset = tag->content_offset;
auto total_len = tag->content_length;
+ if (tag->flags & FL_CLOSING) {
+ return;
+ }
+
if (cur_offset > len || total_len + cur_offset > len) {
RSPAMD_UNREACHABLE;
}
if (tag->id == Tag_BR || tag->id == Tag_HR) {
- hc->parsed.append("\n");
+ if (!hc->parsed.empty()) {
+ hc->parsed.append("\n");
+ }
return;
}
@@ -1089,8 +1097,12 @@ html_append_tag_content(const gchar *start, gsize len,
return; /* XXX: is it always true? */
}
- if (tag->block->has_display() && tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
- hc->parsed.append("\n");
+ auto is_block = tag->block->has_display() &&
+ tag->block->display == css::css_display_value::DISPLAY_BLOCK;
+ if (is_block) {
+ if (!hc->parsed.empty()) {
+ hc->parsed.append("\n");
+ }
}
for (const auto &cld_tag : tag->children) {
@@ -1102,19 +1114,25 @@ html_append_tag_content(const gchar *start, gsize len,
}
html_append_tag_content(start, len, hc, cld_tag);
auto old_offset = cur_offset;
+
cur_offset = cld_tag->content_offset + cld_tag->content_length;
if (total_len < cur_offset - old_offset) {
/* Child tag spans over parent (e.g. wrong nesting) */
total_len = 0;
- break;
}
- total_len -= cur_offset - old_offset;
+ else {
+ total_len -= cur_offset - old_offset;
+ }
}
if (total_len > 0 && tag->block->is_visible()) {
html_append_content(hc, {start + cur_offset, total_len});
}
+
+ if (is_block) {
+ hc->parsed.append("\n");
+ }
}
static auto
@@ -1412,7 +1430,7 @@ html_process_input(rspamd_mempool_t *pool,
cur_tag->content_offset = p - start + 1;
if (!html_process_tag(pool, hc, cur_tag, tags_stack,
- c - start)) {
+ c - start, p - start)) {
if (cur_tag->id == Tag_STYLE) {
state = content_style;
}
@@ -1540,6 +1558,10 @@ html_process_input(rspamd_mempool_t *pool,
/* Propagate styles */
hc->traverse_block_tags([&hc](const html_tag *tag) -> bool {
+ if (tag->flags & FL_CLOSING) {
+ return true;
+ }
+
if (hc->css_style) {
auto *css_block = hc->css_style->check_tag_block(tag);
@@ -1688,14 +1710,13 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
- {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
{"test", "test"},
{"test ", "test "},
{"test foo, bar", "test foo, bar"},
{"<p>text</p>", "text"},
{"olo<p>text</p>lolo", "olo\ntext\nlolo"},
-
+ {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
{"foo<br>baz", "foo\nbaz"},
{"<div>foo</div><div>bar</div>", "foo\nbar"},
};
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index f6442bdc3..cad5368cf 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -58,10 +58,10 @@ struct html_tag_component {
struct html_tag {
unsigned int tag_start = 0;
- mutable unsigned int content_length = 0; /* Allow content length propagation */
+ unsigned int content_length = 0;
unsigned int content_offset = 0;
std::uint32_t flags = 0;
- std::int16_t id = -1;
+ std::int32_t id = -1;
std::string_view name;
std::vector<html_tag_component> parameters;
More information about the Commits mailing list