commit ce88665: [Rework] Html: Further rework of the tags content extraction

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jun 22 16:21:07 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-22 00:21:24 +0100
URL: https://github.com/rspamd/rspamd/commit/ce88665b0cd344fff4b2924448499b5a4ebe4f94

[Rework] Html: Further rework of the tags content extraction

---
 src/libserver/html/html.cxx         | 123 ++++++++++++++++++------------------
 src/libserver/html/html_block.hxx   |   6 +-
 src/libstat/tokenizers/tokenizers.c |   8 ---
 src/libutil/util.h                  |   1 -
 4 files changed, 66 insertions(+), 72 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 925735f41..f82bd0359 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1069,6 +1069,54 @@ html_append_content(struct html_content *hc, std::string_view data) -> auto
 	return nlen;
 }
 
+static auto
+html_append_tag_content(const gchar *start, gsize len,
+						struct html_content *hc, const struct html_tag *tag) -> void
+{ /* Recursively walks tag and its children, handing the text found between child tags to html_append_content() */
+	auto cur_offset = tag->content_offset; /* position relative to start of the content not yet emitted */
+	auto total_len = tag->content_length; /* content length of this tag that is still unaccounted for */
+
+	if (cur_offset > len || total_len + cur_offset > len) { /* the tag's content range must fit into the input of length len */
+		RSPAMD_UNREACHABLE;
+	}
+
+	if (tag->id == Tag_BR || tag->id == Tag_HR) { /* BR/HR contribute only a newline */
+		hc->parsed.append("\n");
+		return;
+	}
+
+	if (!tag->block) {
+		return; /* XXX: is it always true? (without a block the tag and all of its children are skipped) */
+	}
+
+	if (tag->block->has_display() && tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+		hc->parsed.append("\n"); /* a newline is emitted before the content of a DISPLAY_BLOCK tag */
+	}
+
+	for (const auto &cld_tag : tag->children) {
+		if (cld_tag->tag_start > cur_offset) { /* there is text between the current position and the child's tag */
+			if (tag->block->is_visible()) {
+				html_append_content(hc, {start + cur_offset,
+										 cld_tag->tag_start - cur_offset});
+			}
+		}
+		html_append_tag_content(start, len, hc, cld_tag); /* recurse into the child */
+		auto old_offset = cur_offset;
+		cur_offset = cld_tag->content_offset + cld_tag->content_length; /* continue right after the child's content */
+
+		if (total_len < cur_offset - old_offset) { /* NOTE(review): if the offsets are unsigned and a child ends before old_offset, this difference wraps around - confirm children are ordered by offset */
+			/* Child tag spans over parent (e.g. wrong nesting) */
+			total_len = 0;
+			break;
+		}
+		total_len -= cur_offset - old_offset;
+	}
+
+	if (total_len > 0 && tag->block->is_visible()) { /* whatever is left after the children (everything when there are none) */
+		html_append_content(hc, {start + cur_offset, total_len});
+	}
+}
+
 static auto
 html_process_input(rspamd_mempool_t *pool,
 					GByteArray *in,
@@ -1490,17 +1538,8 @@ html_process_input(rspamd_mempool_t *pool,
 		}
 	}
 
-	/* Summarize content length from children */
-	hc->traverse_block_tags([](const html_tag *tag) -> bool {
-
-		for (const auto *cld_tag : tag->children) {
-			tag->content_length += cld_tag->content_length;
-		}
-		return true;
-	}, html_content::traverse_type::POST_ORDER);
-
 	/* Propagate styles */
-	hc->traverse_block_tags([&hc, &exceptions,&pool](const html_tag *tag) -> bool {
+	hc->traverse_block_tags([&hc](const html_tag *tag) -> bool {
 		if (hc->css_style) {
 			auto *css_block = hc->css_style->check_tag_block(tag);
 
@@ -1514,62 +1553,18 @@ html_process_input(rspamd_mempool_t *pool,
 			}
 		}
 		if (tag->block) {
-			tag->block->compute_visibility();
-
-			if (exceptions) {
-				if (!tag->block->is_visible()) {
-					if (tag->parent == nullptr || (tag->parent->block && tag->parent->block->is_visible())) {
-						/* Add exception for an invisible element */
-						auto * ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
-						ex->pos = tag->content_offset;
-						ex->len = tag->content_length;
-						ex->type = RSPAMD_EXCEPTION_INVISIBLE;
-						ex->ptr = (void *)tag;
-
-						*exceptions = g_list_prepend(*exceptions, ex);
-					}
+			if (!tag->block->has_display()) {
+				/* If we have no display field, we can check it by tag */
+				if (tag->flags & CM_BLOCK) {
+					tag->block->set_display(css::css_display_value::DISPLAY_BLOCK);
 				}
-				else if (*exceptions && tag->parent) {
-					/* Current block is visible, check if parent is invisible */
-					auto *ex = (struct rspamd_process_exception*)g_list_first(*exceptions)->data;
-
-					/*
-					 * TODO: we need to handle the following cases:
-					 * <inv><vis><inv> -< insert one more exception
-					 * <vis><inv> -< increase content_offset decrease length
-					 * <inv><vis> -< decrease length
-					 */
-					if (ex && ex->type == RSPAMD_EXCEPTION_INVISIBLE &&
-						ex->ptr == (void *)tag->parent) {
-						auto *parent = tag->parent;
-
-						if (tag->content_offset + tag->content_length ==
-							parent->content_offset + parent->content_length) {
-							/* <inv><vis> */
-							ex->len -= tag->content_length;
-						}
-						else if (tag->content_offset == parent->content_offset) {
-							/* <vis><inv> */
-							ex->len -= tag->content_length;
-							ex->pos += tag->content_length;
-						}
-						else if (tag->content_offset > ex->pos) {
-							auto *nex = rspamd_mempool_alloc_type (pool,
-									struct rspamd_process_exception);
-							auto start_len = tag->content_offset - ex->pos;
-							auto end_len = ex->len - tag->content_length - tag->content_length;
-							nex->pos = tag->content_offset + tag->content_length;
-							nex->len = end_len;
-							nex->type = RSPAMD_EXCEPTION_INVISIBLE;
-							nex->ptr = (void *)parent; /* ! */
-							ex->len = start_len;
-							*exceptions = g_list_prepend(*exceptions, ex);
-						}
-
-					}
+				else {
+					tag->block->set_display(css::css_display_value::DISPLAY_INLINE);
 				}
 			}
 
+			tag->block->compute_visibility();
+
 			for (const auto *cld_tag : tag->children) {
 				if (cld_tag->block) {
 					cld_tag->block->propagate_block(*tag->block);
@@ -1582,6 +1577,10 @@ html_process_input(rspamd_mempool_t *pool,
 		return true;
 	}, html_content::traverse_type::PRE_ORDER);
 
+	if (hc->root_tag) {
+		html_append_tag_content(start, end - start, hc, hc->root_tag);
+	}
+
 	/* Leftover */
 	switch (state) {
 	case html_text_content:
diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx
index 51f3dbb9d..f2bbf1d64 100644
--- a/src/libserver/html/html_block.hxx
+++ b/src/libserver/html/html_block.hxx
@@ -219,6 +219,10 @@ struct html_block {
 		return (mask & transparent_flag) != 0;
 	}
 
+	constexpr auto has_display(void) const -> bool { /* true when any display_mask bit is set in mask */
+		return (mask & display_mask) != 0;
+	}
+
 	/**
 	 * Returns a default html block for root HTML element
 	 * @return
@@ -227,7 +231,7 @@ struct html_block {
 		return html_block{rspamd::css::css_color::black(),
 						  rspamd::css::css_color::white(),
 						  0, 0,
-						  (fg_color_mask|bg_color_mask|display_mask|font_size_mask),
+						  (fg_color_mask|bg_color_mask|font_size_mask),
 						  rspamd::css::css_display_value::DISPLAY_INLINE,
 						  12};
 	}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 2dd4a6f5a..b4f8ac75c 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -275,14 +275,6 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
 		g_array_append_val (res, token);
 		token.flags = 0;
 	}
-	else if (ex->type == RSPAMD_EXCEPTION_INVISIBLE) {
-		token.original.begin = "!!INV!!";
-		token.original.len = sizeof ("!!INV!!") - 1;
-		token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-
-		g_array_append_val (res, token);
-		token.flags = 0;
-	}
 }
 
 
diff --git a/src/libutil/util.h b/src/libutil/util.h
index 9ee8a09ae..d993fcbdf 100644
--- a/src/libutil/util.h
+++ b/src/libutil/util.h
@@ -25,7 +25,6 @@ enum rspamd_exception_type {
 	RSPAMD_EXCEPTION_NEWLINE = 0,
 	RSPAMD_EXCEPTION_URL,
 	RSPAMD_EXCEPTION_GENERIC,
-	RSPAMD_EXCEPTION_INVISIBLE,
 };
 /**
  * Structure to point exception in text from processing


More information about the Commits mailing list