commit f0d4093: [Feature] Process HTML parts before text ones
Vsevolod Stakhov
vsevolod at rspamd.com
Tue Jul 25 15:07:05 UTC 2023
Author: Vsevolod Stakhov
Date: 2023-07-24 13:27:44 +0100
URL: https://github.com/rspamd/rspamd/commit/f0d4093ec07282e2a029637ff36908a028cedf15
[Feature] Process HTML parts before text ones
---
src/libmime/message.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 47 insertions(+), 4 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index d99e62dc2..f3cd482a0 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1303,6 +1303,36 @@ rspamd_message_parse (struct rspamd_task *task)
return TRUE;
}
+
+/*
+ * A helper structure to store text part positions; if this were C++, I could just use std::pair,
+ * but here I have to do it all manually, sigh...
+ */
+struct rspamd_mime_part_text_position {
+ unsigned pos; /* index of the part inside the message parts array */
+ enum rspamd_message_part_is_text_result res; /* text detection result recorded for that part */
+};
+
+/* Comparator passed to g_array_sort: places HTML parts first during analysis */
+static int
+rspamd_mime_text_part_position_compare_func(const void *v1, const void *v2)
+{
+ const struct rspamd_mime_part_text_position *p1 = (const struct rspamd_mime_part_text_position *)v1;
+ const struct rspamd_mime_part_text_position *p2 = (const struct rspamd_mime_part_text_position *)v2;
+
+ if (p1->res == p2->res) { /* same detection result: order by position only */
+ return (int)p2->pos - (int)p1->pos; /* larger pos sorts first; NOTE(review): descending order - confirm intended */
+ }
+ else {
+ if (p1->res == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) { /* p1 is HTML: sort it before p2 */
+ return -1;
+ }
+ else {
+ return 1; /* p1 is not HTML: sort it after p2 */
+ }
+ }
+}
+
void
rspamd_message_process (struct rspamd_task *task)
{
@@ -1344,6 +1374,8 @@ rspamd_message_process (struct rspamd_task *task)
funcs_top = lua_gettop (L);
}
+ GArray *detected_text_parts = g_array_sized_new (FALSE, FALSE, sizeof(struct rspamd_mime_part_text_position), 2);
+
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
if (magic_func_pos != -1 && part->parsed_data.len > 0) {
struct rspamd_mime_part **pmime;
@@ -1441,18 +1473,29 @@ rspamd_message_process (struct rspamd_task *task)
/* Try to detect image before checking for text */
rspamd_images_process_mime_part_maybe (task, part);
- /* Still no content detected, try text heuristic */
if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
- !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
+ !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part);
if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) {
- rspamd_message_process_text_part_maybe (task, part, res);
+ struct rspamd_mime_part_text_position p = {
+ .pos = i,
+ .res = res
+ };
+ g_array_append_val (detected_text_parts, p);
}
-
}
}
+ g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
+ /* One more iteration to process text parts in a more specific order */
+ for (i = 0; i < detected_text_parts->len; i ++) {
+ part = g_ptr_array_index (MESSAGE_FIELD (task, parts),
+ g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
+ rspamd_message_process_text_part_maybe(task, part,
+ g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res);
+ }
+
if (old_top != -1) {
lua_settop (L, old_top);
}
More information about the Commits
mailing list