commit 78d955e: [Minor] Fix the final tag processing + sgml/xml tags fixes
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Jul 1 14:28:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-07-01 15:25:37 +0100
URL: https://github.com/rspamd/rspamd/commit/78d955e5735077953966863c1d8c0ade38c27e73 (HEAD -> master)
[Minor] Fix the final tag processing + sgml/xml tags fixes
---
src/libserver/html/html.cxx | 309 ++++++++++++++++++++++++--------------------
1 file changed, 172 insertions(+), 137 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 6cd5aa246..088202286 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1207,6 +1207,116 @@ html_process_input(rspamd_mempool_t *pool,
return ntag;
};
+ auto process_opening_tag = [&]() {
+ if (cur_tag->id < N_TAGS) {
+ if (cur_tag->flags & CM_UNIQUE) {
+ if (!hc->tags_seen[cur_tag->id]) {
+ /* Duplicate tag has been found */
+ hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
+ }
+ }
+ hc->tags_seen[cur_tag->id] = true;
+ }
+
+ /* Shift to the first unclosed tag */
+ while (parent_tag && (parent_tag->flags & FL_CLOSED)) {
+ parent_tag = parent_tag->parent;
+ }
+
+ if (parent_tag) {
+ cur_tag->parent = parent_tag;
+ parent_tag->children.push_back(cur_tag);
+ }
+ else {
+ if (hc->root_tag) {
+ cur_tag->parent = hc->root_tag;
+ hc->root_tag->children.push_back(cur_tag);
+ parent_tag = hc->root_tag;
+ }
+ else {
+ if (cur_tag->id == Tag_HTML) {
+ hc->root_tag = cur_tag;
+ }
+ else {
+ /* Insert a fake html tag */
+ hc->all_tags.emplace_back(std::make_unique<html_tag>());
+ auto *top_tag = hc->all_tags.back().get();
+ top_tag->tag_start = 0;
+ top_tag->flags = FL_VIRTUAL;
+ if (in_head) {
+ top_tag->flags |= CM_HEAD;
+ }
+ top_tag->id = Tag_HTML;
+ top_tag->content_offset = 0;
+ top_tag->children.push_back(cur_tag);
+ cur_tag->parent = top_tag;
+ hc->root_tag = top_tag;
+ parent_tag = top_tag;
+ }
+ }
+ }
+
+ if (cur_tag->flags & FL_HREF && !in_head) {
+ auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+
+ if (maybe_url) {
+ url = maybe_url.value();
+
+ if (url_set != NULL) {
+ struct rspamd_url *maybe_existing =
+ rspamd_url_set_add_or_return (url_set, maybe_url.value());
+ if (maybe_existing == maybe_url.value()) {
+ html_process_query_url(pool, url, url_set,
+ part_urls);
+ }
+ else {
+ url = maybe_existing;
+ /* Increase count to avoid odd checks failure */
+ url->count ++;
+ }
+ }
+
+ href_offset = hc->parsed.size();
+ }
+ }
+ else if (cur_tag->id == Tag_BASE) {
+ /*
+ * Base is allowed only within head tag but HTML is retarded
+ */
+ if (hc->base_url == NULL) {
+ auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+
+ if (maybe_url) {
+ msg_debug_html ("got valid base tag");
+ hc->base_url = url;
+ cur_tag->extra = url;
+ cur_tag->flags |= FL_HREF;
+ }
+ else {
+ msg_debug_html ("got invalid base tag!");
+ }
+ }
+ }
+
+ if (cur_tag->id == Tag_IMG) {
+ html_process_img_tag(pool, cur_tag, hc, url_set,
+ part_urls);
+ }
+ else if (cur_tag->id == Tag_LINK) {
+ html_process_link_tag(pool, cur_tag, hc, url_set,
+ part_urls);
+ }
+
+ if (!(cur_tag->flags & CM_EMPTY)) {
+ html_process_block_tag(pool, cur_tag, hc);
+ }
+
+ if (cur_tag->flags & FL_CLOSED) {
+ cur_tag->closing.end = cur_tag->content_offset;
+ cur_tag->closing.start = cur_tag->tag_start;
+ }
+ };
+
p = (const char *)in->data;
c = p;
end = p + in->len;
@@ -1219,7 +1329,6 @@ html_process_input(rspamd_mempool_t *pool,
case parse_start:
if (t == '<') {
state = tag_begin;
- in_head = true;
}
else {
/* We have no starting tag, so assume that it's content */
@@ -1349,6 +1458,7 @@ html_process_input(rspamd_mempool_t *pool,
if (t == '>') {
state = tag_end_opening;
cur_tag->content_offset = p - start + 1;
+ continue;
}
else {
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
@@ -1366,6 +1476,7 @@ html_process_input(rspamd_mempool_t *pool,
else if (t == '>' && obrace == ebrace) {
state = tag_end_opening;
cur_tag->content_offset = p - start + 1;
+ continue;
}
p ++;
break;
@@ -1468,10 +1579,11 @@ html_process_input(rspamd_mempool_t *pool,
/* TODO: parse DOCTYPE here */
if (t == '>') {
cur_tag->content_offset = p - start + 1;
- state = html_text_content;
- /* We don't know a lot about sgml tags, ignore them */
+ state = tag_end_opening;
+ }
+ else {
+ p++;
}
- p ++;
break;
case tag_content:
@@ -1510,111 +1622,7 @@ html_process_input(rspamd_mempool_t *pool,
content_parser_env.reset();
if (cur_tag != nullptr) {
-
- if (cur_tag->id < N_TAGS) {
- if (cur_tag->flags & CM_UNIQUE) {
- if (!hc->tags_seen[cur_tag->id]) {
- /* Duplicate tag has been found */
- hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
- }
- }
- hc->tags_seen[cur_tag->id] = true;
- }
-
- /* Shift to the first unclosed tag */
- while (parent_tag && (parent_tag->flags & FL_CLOSED)) {
- parent_tag = parent_tag->parent;
- }
-
- if (parent_tag) {
- cur_tag->parent = parent_tag;
- parent_tag->children.push_back(cur_tag);
- }
- else {
- if (hc->root_tag) {
- cur_tag->parent = hc->root_tag;
- hc->root_tag->children.push_back(cur_tag);
- parent_tag = hc->root_tag;
- }
- else {
- if (cur_tag->id == Tag_HTML) {
- hc->root_tag = cur_tag;
- }
- else {
- /* Insert a fake html tag */
- hc->all_tags.emplace_back(std::make_unique<html_tag>());
- auto *top_tag = hc->all_tags.back().get();
- top_tag->tag_start = 0;
- top_tag->flags = CM_HEAD|FL_VIRTUAL;
- top_tag->id = Tag_HTML;
- top_tag->content_offset = 0;
- top_tag->children.push_back(cur_tag);
- cur_tag->parent = top_tag;
- hc->root_tag = top_tag;
- parent_tag = top_tag;
- }
- }
- }
-
- if (cur_tag->flags & FL_HREF && !in_head) {
- auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
-
- if (maybe_url) {
- url = maybe_url.value();
-
- if (url_set != NULL) {
- struct rspamd_url *maybe_existing =
- rspamd_url_set_add_or_return (url_set, maybe_url.value());
- if (maybe_existing == maybe_url.value()) {
- html_process_query_url(pool, url, url_set,
- part_urls);
- }
- else {
- url = maybe_existing;
- /* Increase count to avoid odd checks failure */
- url->count ++;
- }
- }
-
- href_offset = hc->parsed.size();
- }
- }
- else if (cur_tag->id == Tag_BASE) {
- /*
- * Base is allowed only within head tag but HTML is retarded
- */
- if (hc->base_url == NULL) {
- auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
-
- if (maybe_url) {
- msg_debug_html ("got valid base tag");
- hc->base_url = url;
- cur_tag->extra = url;
- cur_tag->flags |= FL_HREF;
- }
- else {
- msg_debug_html ("got invalid base tag!");
- }
- }
- }
-
- if (cur_tag->id == Tag_IMG) {
- html_process_img_tag(pool, cur_tag, hc, url_set,
- part_urls);
- }
- else if (cur_tag->id == Tag_LINK) {
- html_process_link_tag(pool, cur_tag, hc, url_set,
- part_urls);
- }
-
- if (!(cur_tag->flags & CM_EMPTY)) {
- html_process_block_tag(pool, cur_tag, hc);
- }
-
- if (cur_tag->flags & FL_CLOSED) {
- cur_tag->closing.end = cur_tag->content_offset;
- cur_tag->closing.start = cur_tag->tag_start;
- }
+ process_opening_tag();
}
if (cur_tag && (cur_tag->id == Tag_STYLE)) {
@@ -1646,14 +1654,19 @@ html_process_input(rspamd_mempool_t *pool,
*/
auto *cur_opening_tag = cur_tag->parent;
+ while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
+ cur_opening_tag = cur_opening_tag->parent;
+ }
+
if (!cur_opening_tag) {
cur_opening_tag = hc->root_tag;
}
+
auto &&vtag = std::make_unique<html_tag>();
vtag->id = cur_tag->id;
- vtag->flags = FL_VIRTUAL | FL_CLOSED | FL_IGNORE;
+ vtag->flags = FL_VIRTUAL | FL_CLOSED;
vtag->tag_start = cur_tag->closing.start;
- vtag->content_offset = p - start;
+ vtag->content_offset = p - start + 1;
vtag->closing = cur_tag->closing;
vtag->parent = cur_opening_tag;
cur_opening_tag->children.push_back(vtag.get());
@@ -1719,6 +1732,18 @@ html_process_input(rspamd_mempool_t *pool,
return true;
}, html_content::traverse_type::PRE_ORDER);
+ /* Leftover before content */
+ switch (state) {
+ case tag_end_opening:
+ if (cur_tag != nullptr) {
+ process_opening_tag();
+ }
+ break;
+ default:
+ /* Do nothing */
+ break;
+ }
+
if (!hc->all_tags.empty()) {
std::sort(hc->all_tags.begin(), hc->all_tags.end(), [](const auto &pt1, const auto &pt2) -> auto {
return pt1->tag_start < pt2->tag_start;
@@ -1727,7 +1752,7 @@ html_process_input(rspamd_mempool_t *pool,
exceptions, url_set);
}
- /* Leftover */
+ /* Leftover after content */
switch (state) {
case html_text_content:
case content_before_start:
@@ -1735,6 +1760,11 @@ html_process_input(rspamd_mempool_t *pool,
html_append_content(hc, {c, std::size_t(p - c)});
}
break;
+ case tag_end_opening:
+ if (cur_tag != nullptr) {
+ process_opening_tag();
+ }
+ break;
default:
/* Do nothing */
break;
@@ -1788,8 +1818,13 @@ html_debug_structure(const html_content &hc) -> std::string
std::string pluses(level, '+');
if (!(t->flags & (FL_VIRTUAL|FL_IGNORE))) {
- output += fmt::format("{}{};", pluses,
- html_tags_defs.name_by_id_safe(t->id));
+ if (t->flags & FL_XML) {
+ output += fmt::format("{}xml;", pluses);
+ }
+ else {
+ output += fmt::format("{}{};", pluses,
+ html_tags_defs.name_by_id_safe(t->id));
+ }
level ++;
}
for (const auto *cld : t->children) {
@@ -1824,14 +1859,14 @@ TEST_CASE("html parsing")
{
const std::vector<std::pair<std::string, std::string>> cases{
- {"<html><!DOCTYPE html><body>", "+html;++body;"},
+ {"<html><!DOCTYPE html><body>", "+html;++xml;++body;"},
{"<html><div><div></div></div></html>", "+html;++div;+++div;"},
{"<html><div><div></div></html>", "+html;++div;+++div;"},
{"<html><div><div></div></html></div>", "+html;++div;+++div;"},
{"<p><p><a></p></a></a>", "+p;++p;+++a;"},
{"<div><a href=\"http://example.com\"></div></a>", "+div;++a;"},
{"<html><!DOCTYPE html><body><head><body></body></html></body></html>",
- "+html;++body;+++head;++++body;"}
+ "+html;++xml;++body;+++head;++++body;"}
};
rspamd_url_init(NULL);
@@ -1857,23 +1892,6 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
- /* Complex html with bad tags */
- {"<!DOCTYPE html>\n"
- "<html lang=\"en\">\n"
- " <head>\n"
- " <meta charset=\"utf-8\">\n"
- " <title>title</title>\n"
- " <link rel=\"stylesheet\" href=\"style.css\">\n"
- " <script src=\"script.js\"></script>\n"
- " </head>\n"
- " <body>\n"
- " <!-- page content -->\n"
- " Hello, world! <b>test</b>\n"
- " <p>data<>\n"
- " </P>\n"
- " <b>stuff</p>?\n"
- " </body>\n"
- "</html>", "Hello, world! test\ndata<> \nstuff?"},
/* XML tags */
{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
" <!DOCTYPE html\n"
@@ -1886,8 +1904,8 @@ TEST_CASE("html text extraction")
{"<p>text</p>", "text\n"},
{"olo<p>text</p>lolo", "olo\ntext\nlolo"},
{"<div>foo</div><div>bar</div>", "foo\nbar\n"},
- {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
+ {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
{"foo<br>baz", "foo\nbaz"},
{"<a href=https://example.com>test</a>", "test"},
{"<img alt=test>", "test"},
@@ -1901,10 +1919,27 @@ TEST_CASE("html text extraction")
"</body>", "\n\n\ntest\n"},
{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
"sh<span style=\"FONT-SIZE: 0px\">aring </span></div>", "fish\n"},
- /* FIXME: broken until rework */
+ /* FIXME: broken until rework of css parser */
//{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
// "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
- {"<p><!--comment-->test", "test"},
+ /* Complex html with bad tags */
+ {"<!DOCTYPE html>\n"
+ "<html lang=\"en\">\n"
+ " <head>\n"
+ " <meta charset=\"utf-8\">\n"
+ " <title>title</title>\n"
+ " <link rel=\"stylesheet\" href=\"style.css\">\n"
+ " <script src=\"script.js\"></script>\n"
+ " </head>\n"
+ " <body>\n"
+ " <!-- page content -->\n"
+ " Hello, world! <b>test</b>\n"
+ " <p>data<>\n"
+ " </P>\n"
+ " <b>stuff</p>?\n"
+ " </body>\n"
+ "</html>", "Hello, world! test\ndata<> \nstuff?"},
+ {"<p><!--comment-->test</br></hr><br>", "test\n"},
};
@@ -1939,7 +1974,7 @@ TEST_CASE("html text extraction")
}
-}
+} /* namespace rspamd::html */
void *
rspamd_html_process_part_full(rspamd_mempool_t *pool,
More information about the Commits
mailing list