commit 1d3c937: [Rework] Html: Deal with the utf_content part
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Jun 2 19:56:08 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-02 20:50:48 +0100
URL: https://github.com/rspamd/rspamd/commit/1d3c9379b9044a59e3db06697f9967ba88137a1d
[Rework] Html: Deal with the utf_content part
---
src/libmime/message.c | 38 ++++++++++++++++++--------------------
src/libmime/message.h | 2 +-
src/libmime/mime_expressions.c | 2 +-
src/libserver/html/html.cxx | 11 +++++++++++
src/libserver/html/html.h | 8 ++++++++
src/libserver/re_cache.c | 4 ++--
src/lua/lua_html.cxx | 36 +++++++++++++++---------------------
src/lua/lua_mimepart.c | 12 ++++++------
src/lua/lua_parsers.c | 16 ++++++----------
src/lua/lua_trie.c | 6 +++---
10 files changed, 71 insertions(+), 64 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 21ab36e27..4bdeb6612 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -522,10 +522,10 @@ rspamd_normalize_text_part (struct rspamd_task *task,
part->utf_stripped_content = g_byte_array_new ();
}
else {
- part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
+ part->utf_stripped_content = g_byte_array_sized_new (part->utf_content.len);
- p = (const gchar *)part->utf_content->data;
- end = p + part->utf_content->len;
+ p = (const gchar *)part->utf_content.begin;
+ end = p + part->utf_content.len;
rspamd_strip_newlines_parse (task, p, end, part);
@@ -668,10 +668,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
}
- if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
- part->utf_content->len <= max_check_size) {
- if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
- part->utf_content->len,
+ if (part->utf_content.len >= sizeof (gtube_pattern_reject) &&
+ part->utf_content.len <= max_check_size) {
+ if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content.begin,
+ part->utf_content.len,
rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
switch (ret) {
@@ -698,7 +698,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
msg_info_task (
"gtube %s pattern has been found in part of length %ud",
rspamd_action_to_str (act),
- part->utf_content->len);
+ part->utf_content.len);
}
}
}
@@ -728,13 +728,16 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
if (text_part->utf_raw_content != NULL) {
/* Just have the same content */
- text_part->utf_content = text_part->utf_raw_content;
+ text_part->utf_content.begin = (const gchar *)text_part->utf_raw_content->data;
+ text_part->utf_content.len = text_part->utf_raw_content->len;
}
else {
/*
* We ignore unconverted parts from now as it is dangerous
* to treat them as text parts
*/
+ text_part->utf_content.begin = NULL;
+ text_part->utf_content.len = 0;
return FALSE;
}
@@ -760,26 +763,21 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
return FALSE;
}
- text_part->html = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (*text_part->html));
+
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
- text_part->utf_content = rspamd_html_process_part_full (
+ text_part->html = rspamd_html_process_part_full (
task->task_pool,
- text_part->html,
text_part->utf_raw_content,
&text_part->exceptions,
MESSAGE_FIELD (task, urls),
text_part->mime_part->urls,
task->cfg ? task->cfg->enable_css_parser : false);
+ rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
- if (text_part->utf_content->len == 0) {
+ if (text_part->utf_content.len == 0) {
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
}
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- text_part->utf_content);
-
return TRUE;
}
@@ -1546,7 +1544,7 @@ rspamd_message_process (struct rspamd_task *task)
sel = p2;
}
else {
- if (p1->utf_content->len > p2->utf_content->len) {
+ if (p1->utf_content.len > p2->utf_content.len) {
sel = p1;
}
else {
@@ -1659,4 +1657,4 @@ void rspamd_message_update_digest (struct rspamd_message *msg,
memcpy (n, msg->digest, sizeof (msg->digest));
n[0] = t1ha2_atonce128 (&n[1], input, len, n[0]);
memcpy (msg->digest, n, sizeof (msg->digest));
-}
\ No newline at end of file
+}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 13e40e2ef..8805fbf30 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -138,7 +138,7 @@ struct rspamd_mime_text_part {
rspamd_ftok_t parsed; /* decoded from mime encodings */
/* UTF8 content */
- GByteArray *utf_content; /* utf8 encoded processed content */
+ rspamd_ftok_t utf_content; /* utf8 encoded processed content */
GByteArray *utf_raw_content; /* utf raw content */
GByteArray *utf_stripped_content; /* utf content with no newlines */
GArray *normalized_hashes; /* Array of guint64 */
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index 99c5d3a19..a528be50c 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1625,7 +1625,7 @@ rspamd_has_fake_html (struct rspamd_task * task, GArray * args, void *unused)
gboolean res = FALSE;
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, p) {
- if (IS_TEXT_PART_HTML (p) && (p->html == NULL || p->html->html_tags == NULL)) {
+ if (IS_TEXT_PART_HTML (p) && (p->html == NULL)) {
res = TRUE;
}
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 00f1d331f..47e4e81a0 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -2387,4 +2387,15 @@ rspamd_html_find_embedded_image(void *html_content,
}
return nullptr;
+}
+
+bool
+rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
+{
+ auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+ dest->begin = hc->parsed.data();
+ dest->len = hc->parsed.size();
+
+ return true;
}
\ No newline at end of file
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 1e71d0c2d..3b6592402 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -154,6 +154,14 @@ const gchar *rspamd_html_tag_name(void *tag, gsize *len);
struct html_image* rspamd_html_find_embedded_image(void *html_content,
const char *cid, gsize cid_len);
+/**
+ * Stores parsed content in ftok_t structure
+ * @param html_content
+ * @param dest
+ * @return
+ */
+bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
+
#ifdef __cplusplus
}
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 1b591a81c..631981b30 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -1224,8 +1224,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
raw = TRUE;
}
- in = text_part->utf_content->data;
- len = text_part->utf_content->len;
+ in = text_part->utf_content.begin;
+ len = text_part->utf_content.len;
}
}
diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx
index 30bfa55d6..4dd59083c 100644
--- a/src/lua/lua_html.cxx
+++ b/src/lua/lua_html.cxx
@@ -16,6 +16,7 @@
#include "lua_common.h"
#include "message.h"
#include "libserver/html/html.h"
+#include "libserver/html/html.hxx"
#include "libserver/html/html_tag.hxx"
#include "images.h"
@@ -180,12 +181,12 @@ static const struct luaL_reg taglib_m[] = {
{NULL, NULL}
};
-static struct html_content *
+static struct rspamd::html::html_content *
lua_check_html (lua_State * L, gint pos)
{
void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html}");
luaL_argcheck (L, ud != NULL, pos, "'html' expected");
- return ud ? *((struct html_content **)ud) : NULL;
+ return ud ? *((struct rspamd::html::html_content **)ud) : NULL;
}
struct lua_html_tag {
@@ -205,7 +206,7 @@ static gint
lua_html_has_tag (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_content *hc = lua_check_html (L, 1);
+ auto *hc = lua_check_html (L, 1);
const gchar *tagname = luaL_checkstring (L, 2);
gboolean ret = FALSE;
@@ -238,7 +239,7 @@ static gint
lua_html_has_property (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_content *hc = lua_check_html (L, 1);
+ auto *hc = lua_check_html (L, 1);
const gchar *propname = luaL_checkstring (L, 2);
gboolean ret = FALSE;
@@ -256,7 +257,7 @@ lua_html_has_property (lua_State *L)
}
static void
-lua_html_push_image (lua_State *L, struct html_image *img)
+lua_html_push_image (lua_State *L, const struct html_image *img)
{
LUA_TRACE_POINT;
struct lua_html_tag *ltag;
@@ -319,22 +320,15 @@ static gint
lua_html_get_images (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_content *hc = lua_check_html (L, 1);
- struct html_image *img;
-
- guint i;
+ auto *hc = lua_check_html (L, 1);
+ guint i = 1;
if (hc != NULL) {
- if (hc->images) {
- lua_createtable (L, hc->images->len, 0);
+ lua_createtable (L, hc->images.size(), 0);
- PTR_ARRAY_FOREACH (hc->images, i, img) {
- lua_html_push_image (L, img);
- lua_rawseti (L, -2, i + 1);
- }
- }
- else {
- lua_newtable (L);
+ for (const auto *img : hc->images) {
+ lua_html_push_image (L, img);
+ lua_rawseti (L, -2, i++);
}
}
else {
@@ -410,14 +404,14 @@ static gint
lua_html_get_blocks (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_content *hc = lua_check_html (L, 1);
+ auto *hc = lua_check_html (L, 1);
struct html_block *bl;
guint i;
if (hc != NULL) {
- if (hc->blocks && hc->blocks->len > 0) {
- lua_createtable (L, hc->blocks->len, 0);
+ if (hc->blocks.size() > 0) {
+ lua_createtable (L, hc->blocks.size(), 0);
for (i = 0; i < hc->blocks->len; i ++) {
bl = static_cast<decltype(bl)>(g_ptr_array_index (hc->blocks, i));
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index fe8bb4246..b6e5b157d 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -694,8 +694,8 @@ lua_textpart_get_content (lua_State * L)
lua_pushnil (L);
return 1;
}
- start = part->utf_content->data;
- len = part->utf_content->len;
+ start = part->utf_content.begin;
+ len = part->utf_content.len;
}
else if (strcmp (type, "content") == 0) {
if (IS_TEXT_PART_EMPTY (part)) {
@@ -703,8 +703,8 @@ lua_textpart_get_content (lua_State * L)
return 1;
}
- start = part->utf_content->data;
- len = part->utf_content->len;
+ start = part->utf_content.begin;
+ len = part->utf_content.len;
}
else if (strcmp (type, "content_oneline") == 0) {
if (IS_TEXT_PART_EMPTY (part)) {
@@ -809,11 +809,11 @@ lua_textpart_get_length (lua_State * L)
return 1;
}
- if (IS_TEXT_PART_EMPTY (part) || part->utf_content == NULL) {
+ if (IS_TEXT_PART_EMPTY (part) || part->utf_content.len == 0) {
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->utf_content->len);
+ lua_pushinteger (L, part->utf_content.len);
}
return 1;
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index a0c2f264d..6c75d8039 100644
--- a/src/lua/lua_parsers.c
+++ b/src/lua/lua_parsers.c
@@ -206,9 +206,9 @@ lua_parsers_parse_html (lua_State *L)
struct rspamd_lua_text *t;
const gchar *start = NULL;
gsize len;
- GByteArray *res, *in;
+ GByteArray *in;
rspamd_mempool_t *pool;
- struct html_content *hc;
+ void *hc;
if (lua_type (L, 1) == LUA_TUSERDATA) {
t = lua_check_text (L, 1);
@@ -224,19 +224,15 @@ lua_parsers_parse_html (lua_State *L)
if (start != NULL) {
pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
- hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
in = g_byte_array_sized_new (len);
g_byte_array_append (in, start, len);
- res = rspamd_html_process_part (pool, hc, in);
+ hc = rspamd_html_process_part(pool, in);
- t = lua_newuserdata (L, sizeof (*t));
- rspamd_lua_setclass (L, "rspamd{text}", -1);
- t->start = res->data;
- t->len = res->len;
- t->flags = RSPAMD_TEXT_FLAG_OWN;
+ rspamd_ftok_t res;
+ rspamd_html_get_parsed_content(hc, &res);
+ lua_new_text(L, res.begin, res.len, TRUE);
- g_byte_array_free (res, FALSE);
g_byte_array_free (in, TRUE);
rspamd_mempool_delete (pool);
}
diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c
index 33e5832a8..3b1e946ec 100644
--- a/src/lua/lua_trie.c
+++ b/src/lua/lua_trie.c
@@ -375,9 +375,9 @@ lua_trie_search_mime (lua_State *L)
if (trie && task) {
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
- if (!IS_TEXT_PART_EMPTY (part) && part->utf_content != NULL) {
- text = part->utf_content->data;
- len = part->utf_content->len;
+ if (!IS_TEXT_PART_EMPTY (part) && part->utf_content.len > 0) {
+ text = part->utf_content.begin;
+ len = part->utf_content.len;
if (lua_trie_search_str (L, trie, text, len, cb) != 0) {
found = TRUE;
More information about the Commits mailing list