commit 1d3c937: [Rework] Html: Deal with the utf_content part

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jun 2 19:56:08 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-02 20:50:48 +0100
URL: https://github.com/rspamd/rspamd/commit/1d3c9379b9044a59e3db06697f9967ba88137a1d

[Rework] Html: Deal with the utf_content part

---
 src/libmime/message.c          | 38 ++++++++++++++++++--------------------
 src/libmime/message.h          |  2 +-
 src/libmime/mime_expressions.c |  2 +-
 src/libserver/html/html.cxx    | 11 +++++++++++
 src/libserver/html/html.h      |  8 ++++++++
 src/libserver/re_cache.c       |  4 ++--
 src/lua/lua_html.cxx           | 36 +++++++++++++++---------------------
 src/lua/lua_mimepart.c         | 12 ++++++------
 src/lua/lua_parsers.c          | 16 ++++++----------
 src/lua/lua_trie.c             |  6 +++---
 10 files changed, 71 insertions(+), 64 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index 21ab36e27..4bdeb6612 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -522,10 +522,10 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 		part->utf_stripped_content = g_byte_array_new ();
 	}
 	else {
-		part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
+		part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len);
 
-		p = (const gchar *)part->utf_content->data;
-		end = p + part->utf_content->len;
+		p = (const gchar *)part->utf_content.begin;
+		end = p + part->utf_content.len;
 
 		rspamd_strip_newlines_parse (task, p, end, part);
 
@@ -668,10 +668,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
 		g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
 	}
 
-	if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
-			part->utf_content->len <= max_check_size) {
-		if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
-				part->utf_content->len,
+	if (part->utf_content.len >= sizeof (gtube_pattern_reject) &&
+			part->utf_content.len <= max_check_size) {
+		if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin,
+				part->utf_content.len,
 				rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
 
 			switch (ret) {
@@ -698,7 +698,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
 				msg_info_task (
 						"gtube %s pattern has been found in part of length %ud",
 						rspamd_action_to_str (act),
-						part->utf_content->len);
+						part->utf_content.len);
 			}
 		}
 	}
@@ -728,13 +728,16 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
 
 	if (text_part->utf_raw_content != NULL) {
 		/* Just have the same content */
-		text_part->utf_content = text_part->utf_raw_content;
+		text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data;
+		text_part->utf_content.len = text_part->utf_raw_content->len;
 	}
 	else {
 		/*
 		 * We ignore unconverted parts from now as it is dangerous
 		 * to treat them as text parts
 		 */
+		text_part->utf_content.begin = NULL;
+		text_part->utf_content.len = 0;
 
 		return FALSE;
 	}
@@ -760,26 +763,21 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
 		return FALSE;
 	}
 
-	text_part->html = rspamd_mempool_alloc0 (task->task_pool,
-			sizeof (*text_part->html));
+
 	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
-	text_part->utf_content = rspamd_html_process_part_full (
+	text_part->html = rspamd_html_process_part_full (
 			task->task_pool,
-			text_part->html,
 			text_part->utf_raw_content,
 			&text_part->exceptions,
 			MESSAGE_FIELD (task, urls),
 			text_part->mime_part->urls,
 			task->cfg ? task->cfg->enable_css_parser : false);
+	rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
-	if (text_part->utf_content->len == 0) {
+	if (text_part->utf_content.len == 0) {
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
 	}
 
-	rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t) free_byte_array_callback,
-			text_part->utf_content);
-
 	return TRUE;
 }
 
@@ -1546,7 +1544,7 @@ rspamd_message_process (struct rspamd_task *task)
 						sel = p2;
 					}
 					else {
-						if (p1->utf_content->len > p2->utf_content->len) {
+						if (p1->utf_content.len > p2->utf_content.len) {
 							sel = p1;
 						}
 						else {
@@ -1659,4 +1657,4 @@ void rspamd_message_update_digest (struct rspamd_message *msg,
 	memcpy (n, msg->digest, sizeof (msg->digest));
 	n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]);
 	memcpy (msg->digest, n, sizeof (msg->digest));
-}
\ No newline at end of file
+}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 13e40e2ef..8805fbf30 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -138,7 +138,7 @@ struct rspamd_mime_text_part {
 	rspamd_ftok_t parsed; /* decoded from mime encodings */
 
 	/* UTF8 content */
-	GByteArray *utf_content; /* utf8 encoded processed content */
+	rspamd_ftok_t utf_content; /* utf8 encoded processed content */
 	GByteArray *utf_raw_content; /* utf raw content */
 	GByteArray *utf_stripped_content; /* utf content with no newlines */
 	GArray *normalized_hashes; /* Array of guint64 */
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index 99c5d3a19..a528be50c 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1625,7 +1625,7 @@ rspamd_has_fake_html (struct rspamd_task * task, GArray * args, void *unused)
 	gboolean res = FALSE;
 
 	PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, p) {
-		if (IS_TEXT_PART_HTML (p) && (p->html == NULL || p->html->html_tags == NULL)) {
+		if (IS_TEXT_PART_HTML (p) && (p->html == NULL)) {
 			res = TRUE;
 		}
 
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 00f1d331f..47e4e81a0 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -2387,4 +2387,15 @@ rspamd_html_find_embedded_image(void *html_content,
 	}
 
 	return nullptr;
+}
+
+bool
+rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
+{
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+	dest->begin = hc->parsed.data();
+	dest->len = hc->parsed.size();
+
+	return true;
 }
\ No newline at end of file
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 1e71d0c2d..3b6592402 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -154,6 +154,14 @@ const gchar *rspamd_html_tag_name(void *tag, gsize *len);
 struct html_image* rspamd_html_find_embedded_image(void *html_content,
 		const char *cid, gsize cid_len);
 
+/**
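+ * Stores a view of the parsed content in an ftok_t structure (no copy is made)
+ * @param html_content opaque pointer to the parsed html content
+ * @param dest ftok_t whose begin/len are set to the parsed content
+ * @return always true in the current implementation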
+ */
+bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
+
 
 #ifdef  __cplusplus
 }
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 1b591a81c..631981b30 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -1224,8 +1224,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
 							raw = TRUE;
 						}
 
-						in = text_part->utf_content->data;
-						len = text_part->utf_content->len;
+						in = text_part->utf_content.begin;
+						len = text_part->utf_content.len;
 					}
 				}
 
diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx
index 30bfa55d6..4dd59083c 100644
--- a/src/lua/lua_html.cxx
+++ b/src/lua/lua_html.cxx
@@ -16,6 +16,7 @@
 #include "lua_common.h"
 #include "message.h"
 #include "libserver/html/html.h"
+#include "libserver/html/html.hxx"
 #include "libserver/html/html_tag.hxx"
 #include "images.h"
 
@@ -180,12 +181,12 @@ static const struct luaL_reg taglib_m[] = {
 	{NULL, NULL}
 };
 
-static struct html_content *
+static struct rspamd::html::html_content *
 lua_check_html (lua_State * L, gint pos)
 {
 	void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html}");
 	luaL_argcheck (L, ud != NULL, pos, "'html' expected");
-	return ud ? *((struct html_content **)ud) : NULL;
+	return ud ? *((struct rspamd::html::html_content **)ud) : NULL;
 }
 
 struct lua_html_tag {
@@ -205,7 +206,7 @@ static gint
 lua_html_has_tag (lua_State *L)
 {
 	LUA_TRACE_POINT;
-	struct html_content *hc = lua_check_html (L, 1);
+	auto *hc = lua_check_html (L, 1);
 	const gchar *tagname = luaL_checkstring (L, 2);
 	gboolean ret = FALSE;
 
@@ -238,7 +239,7 @@ static gint
 lua_html_has_property (lua_State *L)
 {
 	LUA_TRACE_POINT;
-	struct html_content *hc = lua_check_html (L, 1);
+	auto *hc = lua_check_html (L, 1);
 	const gchar *propname = luaL_checkstring (L, 2);
 	gboolean ret = FALSE;
 
@@ -256,7 +257,7 @@ lua_html_has_property (lua_State *L)
 }
 
 static void
-lua_html_push_image (lua_State *L, struct html_image *img)
+lua_html_push_image (lua_State *L, const struct html_image *img)
 {
 	LUA_TRACE_POINT;
 	struct lua_html_tag *ltag;
@@ -319,22 +320,15 @@ static gint
 lua_html_get_images (lua_State *L)
 {
 	LUA_TRACE_POINT;
-	struct html_content *hc = lua_check_html (L, 1);
-	struct html_image *img;
-
-	guint i;
+	auto *hc = lua_check_html (L, 1);
+	guint i = 1;
 
 	if (hc != NULL) {
-		if (hc->images) {
-			lua_createtable (L, hc->images->len, 0);
+		lua_createtable (L, hc->images.size(), 0);
 
-			PTR_ARRAY_FOREACH (hc->images, i, img) {
-				lua_html_push_image (L, img);
-				lua_rawseti (L, -2, i + 1);
-			}
-		}
-		else {
-			lua_newtable (L);
+		for (const auto *img : hc->images) {
+			lua_html_push_image (L, img);
+			lua_rawseti (L, -2, i++);
 		}
 	}
 	else {
@@ -410,14 +404,14 @@ static gint
 lua_html_get_blocks (lua_State *L)
 {
 	LUA_TRACE_POINT;
-	struct html_content *hc = lua_check_html (L, 1);
+	auto *hc = lua_check_html (L, 1);
 	struct html_block *bl;
 
 	guint i;
 
 	if (hc != NULL) {
-		if (hc->blocks && hc->blocks->len > 0) {
-			lua_createtable (L, hc->blocks->len, 0);
+		if (hc->blocks.size() > 0) {
+			lua_createtable (L, hc->blocks.size(), 0);
 
 			for (i = 0; i < hc->blocks->len; i ++) {
 				bl = static_cast<decltype(bl)>(g_ptr_array_index (hc->blocks, i));
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index fe8bb4246..b6e5b157d 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -694,8 +694,8 @@ lua_textpart_get_content (lua_State * L)
 			lua_pushnil (L);
 			return 1;
 		}
-		start = part->utf_content->data;
-		len = part->utf_content->len;
+		start = part->utf_content.begin;
+		len = part->utf_content.len;
 	}
 	else if (strcmp (type, "content") == 0) {
 		if (IS_TEXT_PART_EMPTY (part)) {
@@ -703,8 +703,8 @@ lua_textpart_get_content (lua_State * L)
 			return 1;
 		}
 
-		start = part->utf_content->data;
-		len = part->utf_content->len;
+		start = part->utf_content.begin;
+		len = part->utf_content.len;
 	}
 	else if (strcmp (type, "content_oneline") == 0) {
 		if (IS_TEXT_PART_EMPTY (part)) {
@@ -809,11 +809,11 @@ lua_textpart_get_length (lua_State * L)
 		return 1;
 	}
 
-	if (IS_TEXT_PART_EMPTY (part) || part->utf_content == NULL) {
+	if (IS_TEXT_PART_EMPTY (part) || part->utf_content.len == 0) {
 		lua_pushinteger (L, 0);
 	}
 	else {
-		lua_pushinteger (L, part->utf_content->len);
+		lua_pushinteger (L, part->utf_content.len);
 	}
 
 	return 1;
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index a0c2f264d..6c75d8039 100644
--- a/src/lua/lua_parsers.c
+++ b/src/lua/lua_parsers.c
@@ -206,9 +206,9 @@ lua_parsers_parse_html (lua_State *L)
 	struct rspamd_lua_text *t;
 	const gchar *start = NULL;
 	gsize len;
-	GByteArray *res, *in;
+	GByteArray *in;
 	rspamd_mempool_t *pool;
-	struct html_content *hc;
+	void *hc;
 
 	if (lua_type (L, 1) == LUA_TUSERDATA) {
 		t = lua_check_text (L, 1);
@@ -224,19 +224,15 @@ lua_parsers_parse_html (lua_State *L)
 
 	if (start != NULL) {
 		pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
-		hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
 		in = g_byte_array_sized_new (len);
 		g_byte_array_append (in, start, len);
 
-		res = rspamd_html_process_part (pool, hc, in);
+		hc = rspamd_html_process_part(pool, in);
 
-		t = lua_newuserdata (L, sizeof (*t));
-		rspamd_lua_setclass (L, "rspamd{text}", -1);
-		t->start = res->data;
-		t->len = res->len;
-		t->flags = RSPAMD_TEXT_FLAG_OWN;
+		rspamd_ftok_t res;
+		rspamd_html_get_parsed_content(hc, &res);
+		lua_new_text(L, res.begin, res.len, TRUE);
 
-		g_byte_array_free (res, FALSE);
 		g_byte_array_free (in, TRUE);
 		rspamd_mempool_delete (pool);
 	}
diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c
index 33e5832a8..3b1e946ec 100644
--- a/src/lua/lua_trie.c
+++ b/src/lua/lua_trie.c
@@ -375,9 +375,9 @@ lua_trie_search_mime (lua_State *L)
 
 	if (trie && task) {
 		PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
-			if (!IS_TEXT_PART_EMPTY (part) && part->utf_content != NULL) {
-				text = part->utf_content->data;
-				len = part->utf_content->len;
+			if (!IS_TEXT_PART_EMPTY (part) && part->utf_content.len > 0) {
+				text = part->utf_content.begin;
+				len = part->utf_content.len;
 
 				if (lua_trie_search_str (L, trie, text, len, cb) != 0) {
 					found = TRUE;


More information about the Commits mailing list