commit 620d003: [Project] Html: Implement logic for tags pairing
Vsevolod Stakhov
vsevolod at highsecure.ru
Sat Jun 26 13:49:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-26 14:30:19 +0100
URL: https://github.com/rspamd/rspamd/commit/620d003e7ab56e8f4cf1fc4a2569fd20f8f98b3f
[Project] Html: Implement logic for tags pairing
---
src/libserver/html/html.cxx | 135 ++++++++++++++++++++++++++++++++--------
src/libserver/html/html_tag.hxx | 15 ++---
src/libserver/html/html_tags.h | 7 +--
3 files changed, 120 insertions(+), 37 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 894b1ee45..8d312b733 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -76,51 +76,136 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
INIT_LOG_MODULE(html)
static auto
-html_check_balance(struct html_tag *tag,
+html_check_balance(struct html_content *hc,
+ struct html_tag *tag,
struct html_tag *parent,
std::vector<html_tag *> &tags_stack,
goffset tag_start_offset,
goffset tag_end_offset) -> bool
{
- if (tag->flags & FL_CLOSING) {
- /* Find the opening pair if any and check if it is correctly placed */
- auto found_opening = std::find_if(tags_stack.rbegin(), tags_stack.rend(),
- [&](const html_tag *t) {
- return (t->flags & FL_CLOSED) == 0 && t->id == tag->id;
- });
+ auto calculate_content_length = [tag_start_offset](html_tag *t) {
+ auto opening_content_offset = t->content_offset;
- if (found_opening != tags_stack.rend()) {
- auto *opening_tag = (*found_opening);
- opening_tag->flags |= FL_CLOSED;
+ if (opening_content_offset <= tag_start_offset) {
+ t->content_length = tag_start_offset - opening_content_offset;
+ }
+ else {
+ t->content_length = 0;
+ }
+ };
- /* Adjust size */
- auto opening_content_offset = opening_tag->content_offset;
+ auto balance_tag = [&]() -> void {
+ auto it = tags_stack.rbegin();
- if (opening_content_offset <= tag_start_offset) {
- opening_tag->content_length =
- tag_start_offset - opening_content_offset;
- }
- else {
- opening_tag->content_length = 0;
+ for (auto end_it = tags_stack.rend(); it != end_it; ++it) {
+ if ((*it)->id == tag->id && !((*it)->flags & FL_CLOSING)) {
+ break;
}
+ /* Insert a virtual closing tag for all tags that are not closed */
+ auto &&vtag = std::make_unique<html_tag>();
+ vtag->id = (*it)->id;
+ vtag->flags = FL_VIRTUAL|FL_CLOSING;
+ vtag->tag_start = tag->tag_start;
+ vtag->content_offset = tag->content_offset;
+ vtag->content_length = 0;
+ vtag->parent = (*it)->parent;
+ calculate_content_length(*it);
+ (*it)->flags |= FL_CLOSED;
+ hc->all_tags.emplace_back(std::move(vtag));
+ }
+
+ /* Remove tags */
+ tags_stack.erase(it.base(), std::end(tags_stack));
+ };
- if (found_opening == tags_stack.rbegin()) {
+ if (tag->flags & FL_CLOSING) {
+ if (!tags_stack.empty()) {
+ auto *last_tag = tags_stack.back();
+
+ if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) {
+ last_tag->flags |= FL_CLOSED;
+
+ calculate_content_length(last_tag);
tags_stack.pop_back();
/* All good */
return true;
}
else {
- /* Move to front */
- std::iter_swap(found_opening, tags_stack.rbegin());
- tags_stack.pop_back();
+ balance_tag();
+
return false;
}
}
else {
- /* We have unpaired tag */
- return false;
+ /*
+ * We have no opening tags in the stack, so we need to assume that there
+ * is an opening tag at the beginning of the document.
+ * There are two possibilities:
+ *
+ * 1) We have some block tag in hc->all_tags;
+ * 2) We have no tags
+ */
+
+ if (hc->all_tags.empty()) {
+ auto &&vtag = std::make_unique<html_tag>();
+ vtag->id = tag->id;
+ vtag->flags = FL_VIRTUAL|FL_CLOSED;
+ vtag->tag_start = 0;
+ vtag->content_offset = 0;
+ calculate_content_length(vtag.get());
+
+
+ if (!hc->root_tag) {
+ hc->root_tag = vtag.get();
+ }
+ else {
+ vtag->parent = hc->root_tag;
+ }
+ hc->all_tags.emplace_back(std::move(vtag));
+ }
+ else {
+ auto found_closing = std::find_if(hc->all_tags.rbegin(),
+ hc->all_tags.rend(),
+ [&](const auto &t) {
+ constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING;
+ return (t->flags & expect_flags) == (expect_flags) &&
+ t.get() != tag &&
+ t->parent != nullptr;
+ });
+
+ if (found_closing != hc->all_tags.rend()) {
+ auto *closing_tag = (*found_closing).get();
+ auto &&vtag = std::make_unique<html_tag>();
+ vtag->id = tag->id;
+ vtag->flags = FL_VIRTUAL|FL_CLOSED;
+ vtag->tag_start = closing_tag->content_offset - 1;
+ vtag->content_offset = vtag->tag_start + 1;
+ vtag->parent = closing_tag->parent;
+ vtag->content_length = tag->tag_start - vtag->content_offset;
+ hc->all_tags.emplace_back(std::move(vtag));
+ }
+ else {
+ auto &&vtag = std::make_unique<html_tag>();
+ vtag->id = tag->id;
+ vtag->flags = FL_VIRTUAL|FL_CLOSED;
+ vtag->tag_start = 0;
+ vtag->content_offset = 0;
+ calculate_content_length(vtag.get());
+
+
+ if (!hc->root_tag) {
+ hc->root_tag = vtag.get();
+ }
+ else {
+ vtag->parent = hc->root_tag;
+ }
+ hc->all_tags.emplace_back(std::move(vtag));
+ }
+ }
}
+
+ return false;
}
/* Misuse */
@@ -166,7 +251,7 @@ html_process_tag(rspamd_mempool_t *pool,
return false;
}
- if (!html_check_balance(tag, parent, tags_stack,
+ if (!html_check_balance(hc, tag, parent, tags_stack,
tag_start_offset, tag_end_offset)) {
msg_debug_html (
"mark part as unbalanced as it has not pairable closing tags");
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 9091b9060..36110c8c7 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -43,16 +43,17 @@ enum class html_component_type : std::uint8_t {
/* Public tags flags */
/* XML tag */
-#define FL_XML (1 << 23)
+#define FL_XML (1 << 22)
/* Closing tag */
-#define FL_CLOSING (1 << 24)
+#define FL_CLOSING (1 << 23)
/* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED (1 << 25)
-#define FL_BROKEN (1 << 26)
-#define FL_IGNORE (1 << 27)
-#define FL_BLOCK (1 << 28)
-#define FL_HREF (1 << 29)
+#define FL_CLOSED (1 << 24)
+#define FL_BROKEN (1 << 25)
+#define FL_IGNORE (1 << 26)
+#define FL_BLOCK (1 << 27)
+#define FL_HREF (1 << 28)
#define FL_COMMENT (1 << 29)
+#define FL_VIRTUAL (1 << 30)
/**
* Returns component type from a string
diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h
index 8f430204e..3f209c08e 100644
--- a/src/libserver/html/html_tags.h
+++ b/src/libserver/html/html_tags.h
@@ -191,13 +191,10 @@ typedef enum {
#define CM_NO_INDENT (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)
-/* User defined elements. Used to determine how attributes without value
- should be printed. */
-#define CM_NEW (1 << 20)
/* Elements that cannot be omitted. */
-#define CM_OMITST (1 << 21)
+#define CM_OMITST (1 << 20)
/* Unique elements */
-#define CM_UNIQUE (1 << 22)
+#define CM_UNIQUE (1 << 21)
#ifdef __cplusplus
}
More information about the Commits mailing list