commit b980196: [Rework] Html: Start html text extraction rework

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jun 22 16:21:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-21 19:59:35 +0100
URL: https://github.com/rspamd/rspamd/commit/b9801960d0ebb388dc2a4e93071f19868fe44bc0

[Rework] Html: Start html text extraction rework

---
 src/libserver/html/html.cxx | 263 ++++++++++++++++----------------------------
 1 file changed, 96 insertions(+), 167 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 073b733a2..00dcebad6 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1037,14 +1037,16 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	}
 }
 
-using tags_vector = std::vector<std::unique_ptr<struct html_tag>>;
-
-static auto
-tags_vector_ptr_dtor(void *ptr)
+static inline auto
+html_append_content(struct html_content *hc, std::string_view data) -> auto
 {
-	auto *ptags = (tags_vector *)ptr;
+	auto cur_offset = hc->parsed.size();
+	hc->parsed.append(data);
+	auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset,
+			hc->parsed.size() - cur_offset, true);
+	hc->parsed.resize(nlen + cur_offset);
 
-	delete ptags;
+	return nlen;
 }
 
 static auto
@@ -1055,9 +1057,9 @@ html_process_input(rspamd_mempool_t *pool,
 					GPtrArray *part_urls,
 					bool allow_css) -> html_content *
 {
-	const gchar *p, *c, *end;
+	const gchar *p, *c, *end, *start;
 	guchar t;
-	gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE;
+	gboolean closing = FALSE;
 	guint obrace = 0, ebrace = 0;
 	struct rspamd_url *url = NULL;
 	gint len, href_offset = -1;
@@ -1067,6 +1069,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 	enum {
 		parse_start = 0,
+		content_before_start,
 		tag_begin,
 		sgml_tag,
 		xml_tag,
@@ -1076,11 +1079,9 @@ html_process_input(rspamd_mempool_t *pool,
 		sgml_content,
 		tag_content,
 		tag_end,
+		html_text_content,
 		xml_tag_end,
-		content_ignore,
-		content_write,
 		content_style,
-		content_ignore_sp
 	} state = parse_start;
 
 	g_assert (in != NULL);
@@ -1092,6 +1093,7 @@ html_process_input(rspamd_mempool_t *pool,
 	p = (const char *)in->data;
 	c = p;
 	end = p + in->len;
+	start = c;
 
 	while (p < end) {
 		t = *p;
@@ -1104,9 +1106,17 @@ html_process_input(rspamd_mempool_t *pool,
 			else {
 				/* We have no starting tag, so assume that it's content */
 				hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
-				state = content_write;
+				state = content_before_start;
+			}
+			break;
+		case content_before_start:
+			if (t == '<') {
+				html_append_content(hc, {c, std::size_t(p - c)});
+				state = tag_begin;
+			}
+			else {
+				p ++;
 			}
-
 			break;
 		case tag_begin:
 			switch (t) {
@@ -1248,7 +1258,7 @@ html_process_input(rspamd_mempool_t *pool,
 			p ++;
 			break;
 
-		case content_ignore:
+		case html_text_content:
 			if (t != '<') {
 				p ++;
 			}
@@ -1257,126 +1267,6 @@ html_process_input(rspamd_mempool_t *pool,
 			}
 			break;
 
-		case content_write:
-
-			if (t != '<') {
-				if (t == '&') {
-					need_decode = TRUE;
-				}
-				else if (g_ascii_isspace (t)) {
-					save_space = TRUE;
-
-					if (p > c) {
-						if (need_decode) {
-							goffset old_offset = hc->parsed.size();
-
-							if (content_tag) {
-								if (content_tag->content_length == 0) {
-									content_tag->content_offset = old_offset;
-								}
-							}
-
-							hc->parsed.append(c, p - c);
-
-							len = decode_html_entitles_inplace(
-									hc->parsed.data() + old_offset,
-									(std::size_t)(p - c));
-							hc->parsed.resize(hc->parsed.size() + len - (p - c));
-
-							if (content_tag) {
-								content_tag->content_length += len;
-							}
-						}
-						else {
-							len = p - c;
-
-							if (content_tag) {
-								if (content_tag->content_length == 0) {
-									content_tag->content_offset = hc->parsed.size();
-								}
-
-								content_tag->content_length += len;
-							}
-
-							hc->parsed.append(c, len);
-						}
-					}
-
-					c = p;
-					state = content_ignore_sp;
-				}
-				else {
-					if (save_space) {
-						/* Append one space if needed */
-						if (!hc->parsed.empty() &&
-							!g_ascii_isspace (hc->parsed.back())) {
-							hc->parsed += " ";
-
-							if (content_tag) {
-								if (content_tag->content_length == 0) {
-									/*
-									 * Special case
-									 * we have a space at the beginning but
-									 * we have no set content_offset
-									 * so we need to do it here
-									 */
-									content_tag->content_offset = hc->parsed.size();
-								}
-								else {
-									content_tag->content_length++;
-								}
-							}
-						}
-						save_space = FALSE;
-					}
-				}
-			}
-			else {
-				if (c != p) {
-
-					if (need_decode) {
-						goffset old_offset = hc->parsed.size();
-
-						if (content_tag) {
-							if (content_tag->content_length == 0) {
-								content_tag->content_offset = hc->parsed.size();
-							}
-						}
-
-						hc->parsed.append(c, p - c);
-						len = decode_html_entitles_inplace(
-								hc->parsed.data() + old_offset,
-								(std::size_t)(p - c));
-						hc->parsed.resize(hc->parsed.size() + len - (p - c));
-
-						if (content_tag) {
-							content_tag->content_length += len;
-						}
-					}
-					else {
-						len = p - c;
-
-						if (content_tag) {
-							if (content_tag->content_length == 0) {
-								content_tag->content_offset = hc->parsed.size();
-							}
-
-							content_tag->content_length += len;
-						}
-
-						hc->parsed.append(c, len);
-					}
-				}
-
-				content_tag = NULL;
-
-				state = tag_begin;
-				continue;
-			}
-
-			p ++;
-			break;
-
 		case content_style: {
 
 			/*
@@ -1387,7 +1277,7 @@ html_process_input(rspamd_mempool_t *pool,
 					"</", 2);
 			if (end_style == -1 || g_ascii_tolower (p[end_style + 2]) != 's') {
 				/* Invalid style */
-				state = content_ignore;
+				state = tag_content;
 			}
 			else {
 
@@ -1411,17 +1301,6 @@ html_process_input(rspamd_mempool_t *pool,
 			}
 			break;
 		}
-
-		case content_ignore_sp:
-			if (!g_ascii_isspace (t)) {
-				c = p;
-				state = content_write;
-				continue;
-			}
-
-			p ++;
-			break;
-
 		case sgml_content:
 			/* TODO: parse DOCTYPE here */
 			if (t == '>') {
@@ -1457,18 +1336,13 @@ html_process_input(rspamd_mempool_t *pool,
 			content_parser_env.reset();
 
 			if (cur_tag != nullptr) {
+				state = html_text_content;
 
-				if (html_process_tag(pool, hc, cur_tag, tags_stack)) {
-					state = content_write;
-					need_decode = FALSE;
-				}
-				else {
+				cur_tag->content_offset = p - start;
+				if (!html_process_tag(pool, hc, cur_tag, tags_stack)) {
 					if (cur_tag->id == Tag_STYLE) {
 						state = content_style;
 					}
-					else {
-						state = content_ignore;
-					}
 				}
 
 				if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
@@ -1507,7 +1381,6 @@ html_process_input(rspamd_mempool_t *pool,
 							}
 						}
 					}
-					save_space = FALSE;
 				}
 
 				if ((cur_tag->id == Tag_P ||
@@ -1533,7 +1406,6 @@ html_process_input(rspamd_mempool_t *pool,
 							}
 						}
 					}
-					save_space = FALSE;
 				}
 
 				/* XXX: uncomment when styles parsing is not so broken */
@@ -1637,11 +1509,8 @@ html_process_input(rspamd_mempool_t *pool,
 					}
 				}
 			}
-			else {
-				state = content_write;
-			}
-
 
+			state = html_text_content;
 			p++;
 			c = p;
 			cur_tag = NULL;
@@ -1740,6 +1609,19 @@ html_process_input(rspamd_mempool_t *pool,
 		return true;
 	}, html_content::traverse_type::PRE_ORDER);
 
+	/* Leftover */
+	switch (state) {
+	case html_text_content:
+	case content_before_start:
+		if (p > c) {
+			html_append_content(hc, {c, std::size_t(p - c)});
+		}
+		break;
+	default:
+		/* Do nothing */
+		break;
+	}
+
 	return hc;
 }
 
@@ -1816,17 +1698,64 @@ TEST_CASE("html parsing")
 			"html", 0);
 
 	for (const auto &c : cases) {
-		GByteArray *tmp = g_byte_array_sized_new(c.first.size());
-		g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
-		auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true);
-		CHECK(hc != nullptr);
-		auto dump = html_debug_structure(*hc);
-		CHECK(c.second == dump);
-		g_byte_array_free(tmp, TRUE);
+		SUBCASE((std::string("extract tags from: ") + c.first).c_str()) {
+			GByteArray *tmp = g_byte_array_sized_new(c.first.size());
+			g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+			auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true);
+			CHECK(hc != nullptr);
+			auto dump = html_debug_structure(*hc);
+			CHECK(c.second == dump);
+			g_byte_array_free(tmp, TRUE);
+		}
 	}
 
 	rspamd_mempool_delete(pool);
 }
+
+TEST_CASE("html text extraction")
+{
+
+	const std::vector<std::pair<std::string, std::string>> cases{
+			{"test", "test"},
+			{"test   ", "test "},
+			{"test   foo,   bar", "test foo, bar"},
+			{"<p>text</p>", "text"},
+			{"olo<p>text</p>lolo", "olo\ntext\nlolo"},
+			{"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
+			{"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
+			{"foo<br>baz", "foo\nbaz"},
+			{"<div>foo</div><div>bar</div>", "foo\nbar"},
+	};
+
+	rspamd_url_init(NULL);
+	auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
+			"html", 0);
+
+	auto replace_newlines = [](std::string &str) {
+		auto start_pos = 0;
+		while((start_pos = str.find("\n", start_pos, 1)) != std::string::npos) {
+			str.replace(start_pos, 1, "\\n", 2);
+			start_pos += 2;
+		}
+	};
+
+	for (const auto &c : cases) {
+		SUBCASE((std::string("extract text from: ") + c.first).c_str()) {
+			GByteArray *tmp = g_byte_array_sized_new(c.first.size());
+			g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+			auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true);
+			CHECK(hc != nullptr);
+			replace_newlines(hc->parsed);
+			auto expected = c.second;
+			replace_newlines(expected);
+			CHECK(hc->parsed == expected);
+			g_byte_array_free(tmp, TRUE);
+		}
+	}
+
+	rspamd_mempool_delete(pool);
+}
+
 }
 
 }


More information about the Commits mailing list