commit 87b8d80: [Project] Html: More fixes
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Jun 30 20:28:06 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-30 18:54:05 +0100
URL: https://github.com/rspamd/rspamd/commit/87b8d80c59ac222d8c78c075882407c039e65b07
[Project] Html: More fixes
---
src/libserver/html/html.cxx | 128 +++++++++++++---------------------------
src/libserver/html/html_tag.hxx | 6 +-
2 files changed, 46 insertions(+), 88 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 805b7b7ba..d08cb75b2 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -270,7 +270,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
state = ignore_bad_tag;
- tag->id = -1;
+ tag->id = N_TAGS;
tag->flags |= FL_BROKEN;
}
else if (g_ascii_isalpha (*in)) {
@@ -292,7 +292,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
if (tag_name_len== 0) {
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
- tag->id = -1;
+ tag->id = N_TAGS;
tag->flags |= FL_BROKEN;
state = ignore_bad_tag;
}
@@ -313,7 +313,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
if (tag_def == nullptr) {
hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
- tag->id = -1;
+ tag->id = N_TAGS;
}
else {
tag->id = tag_def->id;
@@ -1000,12 +1000,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
const gchar *start, gsize len,
struct html_content *hc,
const html_tag *tag,
- std::vector<const html_tag *> &enclosed_tags,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set) -> goffset
{
auto is_visible = true, is_block = false;
- goffset next_tag_offset = tag->closing.end,
+ goffset next_tag_offset = tag->closing.start,
initial_dest_offset = hc->parsed.size();
if (tag->id == Tag_BR || tag->id == Tag_HR) {
@@ -1014,7 +1013,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
return tag->content_offset;
}
- if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE))) {
+ if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE|CM_HEAD))) {
is_visible = false;
}
else {
@@ -1038,24 +1037,8 @@ html_append_tag_content(rspamd_mempool_t *pool,
goffset cur_offset = tag->content_offset;
- do {
- auto enclosed_end = 0, enclosed_start = 0;
- decltype(tag) next_enclosed = nullptr;
-
- if (!enclosed_tags.empty()) {
- next_enclosed = enclosed_tags.back();
- enclosed_start = next_enclosed->tag_start;
- enclosed_end = next_enclosed->closing.end;
-
- if (enclosed_end > next_tag_offset) {
- next_tag_offset = enclosed_end;
- }
- enclosed_tags.pop_back();
- }
- else {
- enclosed_start = next_tag_offset;
- }
-
+ for (auto *cld : tag->children) {
+ auto enclosed_start = cld->tag_start;
goffset initial_part_len = enclosed_start - cur_offset;
if (is_visible && initial_part_len > 0) {
@@ -1063,37 +1046,19 @@ html_append_tag_content(rspamd_mempool_t *pool,
std::size_t(initial_part_len)});
}
- /* Deal with the remaining part */
- std::decay_t<decltype(enclosed_tags)> nested_stack;
-
- while (!enclosed_tags.empty() && enclosed_end > 0) {
- const auto *last_tag = enclosed_tags.back();
+ cur_offset = html_append_tag_content(pool, start, len,
+ hc, cld, exceptions, url_set);
- if (last_tag->tag_start <= enclosed_end) {
- nested_stack.push_back(last_tag);
- enclosed_tags.pop_back();
- }
- else {
- break;
- }
- }
+ }
- if (next_enclosed) {
- /* Recursively print enclosed tags */
- std::reverse(std::begin(nested_stack), std::end(nested_stack));
- cur_offset = html_append_tag_content(pool, start, len, hc, next_enclosed,
- nested_stack, exceptions, url_set);
+ if (cur_offset < tag->closing.start) {
+ goffset final_part_len = tag->closing.start - cur_offset;
- if (enclosed_tags.empty()) {
- initial_part_len = next_tag_offset - cur_offset;
- if (is_visible && initial_part_len > 0) {
- html_append_content(hc, {start + cur_offset,
- std::size_t(initial_part_len)});
- }
- }
+ if (is_visible && final_part_len > 0) {
+ html_append_content(hc, {start + cur_offset,
+ std::size_t(final_part_len)});
}
-
- } while (!enclosed_tags.empty());
+ }
if (is_block && is_visible) {
if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
@@ -1138,36 +1103,7 @@ html_append_tags_content(rspamd_mempool_t *pool,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set) -> void
{
- auto cur_offset = 0;
- std::vector<const html_tag *> enclosed_tags_stack;
-
- for (auto i = 0; i < hc->all_tags.size();) {
- const auto &tag = hc->all_tags[i];
- html_tag *next_tag = nullptr;
- auto next_offset = tag->closing.end;
-
- auto j = i + 1;
- while (j < hc->all_tags.size()) {
- next_tag = hc->all_tags[j].get();
-
- if (next_tag->content_offset <= next_offset) {
- enclosed_tags_stack.push_back(next_tag);
- if (next_tag->closing.end > next_offset) {
- /* Tag spans over its parent */
- next_offset = next_tag->closing.end;
- }
- j ++;
- }
- else {
- break;
- }
- }
-
- std::reverse(enclosed_tags_stack.begin(), enclosed_tags_stack.end());
- cur_offset = html_append_tag_content(pool, start, len, hc, tag.get(),
- enclosed_tags_stack, exceptions, url_set);
- i = j;
- }
+ html_append_tag_content(pool, start, len, hc, hc->root_tag, exceptions, url_set);
}
static auto
@@ -1232,7 +1168,27 @@ html_process_input(rspamd_mempool_t *pool,
parent->children.push_back(ntag);
}
else {
- hc->root_tag = ntag;
+ if (hc->root_tag) {
+ ntag->parent = hc->root_tag;
+ hc->root_tag->children.push_back(ntag);
+ }
+ else {
+ if (ntag->id == Tag_HTML) {
+ hc->root_tag = ntag;
+ }
+ else {
+ /* Insert a fake html tag */
+ hc->all_tags.emplace_back(std::make_unique<html_tag>());
+ auto *top_tag = hc->all_tags.back().get();
+ top_tag->tag_start = 0;
+ top_tag->flags = CM_HEAD|FL_VIRTUAL;
+ top_tag->id = Tag_HTML;
+ top_tag->content_offset = 0;
+ top_tag->children.push_back(ntag);
+ ntag->parent = top_tag;
+ hc->root_tag = top_tag;
+ }
+ }
}
return ntag;
@@ -1269,7 +1225,6 @@ html_process_input(rspamd_mempool_t *pool,
break;
case content_before_start:
if (t == '<') {
- html_append_content(hc, {c, std::size_t(p - c)});
state = tag_begin;
}
else {
@@ -1499,7 +1454,6 @@ html_process_input(rspamd_mempool_t *pool,
if (t == '>') {
state = html_text_content;
/* We don't know a lot about sgml tags, ignore them */
- cur_tag = nullptr;
}
p ++;
break;
@@ -1541,7 +1495,7 @@ html_process_input(rspamd_mempool_t *pool,
if (cur_tag != nullptr) {
- if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
+ if (cur_tag->id < N_TAGS) {
if (cur_tag->flags & CM_UNIQUE) {
if (!hc->tags_seen[cur_tag->id]) {
/* Duplicate tag has been found */
@@ -1623,6 +1577,8 @@ html_process_input(rspamd_mempool_t *pool,
cur_tag = html_check_balance(hc, cur_tag,
c - start, p - start);
state = html_text_content;
+ p ++;
+ c = p;
break;
case tags_limit_overflow:
msg_warn_pool("tags limit of %d tags is reached at the position %d;"
@@ -1810,6 +1766,7 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
+ {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
/* XML tags */
{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
" <!DOCTYPE html\n"
@@ -1824,7 +1781,6 @@ TEST_CASE("html text extraction")
{"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
{"foo<br>baz", "foo\nbaz"},
- {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
{"<a href=https://example.com>test</a>", "test"},
{"<img alt=test>", "test"},
{"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 40b2eb955..17c15962a 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -24,6 +24,8 @@
#include <vector>
#include <optional>
+#include "html_tags.h"
+
namespace rspamd::html {
enum class html_component_type : std::uint8_t {
@@ -83,7 +85,7 @@ struct html_tag {
unsigned int tag_start = 0;
unsigned int content_offset = 0;
std::uint32_t flags = 0;
- std::int32_t id = -1;
+ tag_id_t id = N_TAGS;
html_closing_tag closing;
std::vector<html_tag_component> components;
@@ -114,7 +116,7 @@ struct html_tag {
}
auto clear(void) -> void {
- id = -1;
+ id = N_TAGS;
tag_start = content_offset = 0;
extra = std::monostate{};
components.clear();
More information about the Commits mailing list