commit 360bba6: [Minor] Slightly change the text processing logic
Vsevolod Stakhov
vsevolod at rspamd.com
Tue Jul 25 15:07:03 UTC 2023
Author: Vsevolod Stakhov
Date: 2023-07-24 13:09:16 +0100
URL: https://github.com/rspamd/rspamd/commit/360bba643a67f79fa24419d502be1e8fc8119217
[Minor] Slightly change the text processing logic
---
src/libmime/message.c | 62 +++++++++++++++++++++++++++++++++------------------
1 file changed, 40 insertions(+), 22 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index bdc1378d9..d99e62dc2 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -796,20 +796,23 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
return TRUE;
}
-static gboolean
-rspamd_message_process_text_part_maybe (struct rspamd_task *task,
- struct rspamd_mime_part *mime_part)
+enum rspamd_message_part_is_text_result {
+ RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN = 0,
+ RSPAMD_MESSAGE_PART_IS_TEXT_HTML,
+ RSPAMD_MESSAGE_PART_IS_NOT_TEXT
+};
+
+static enum rspamd_message_part_is_text_result
+rspamd_message_part_can_be_parsed_as_text (struct rspamd_task *task,
+ struct rspamd_mime_part *mime_part)
{
- struct rspamd_mime_text_part *text_part;
- rspamd_ftok_t html_tok, xhtml_tok;
- gboolean found_html = FALSE, found_txt = FALSE;
- guint flags = 0;
- enum rspamd_action_type act;
+ enum rspamd_message_part_is_text_result res = RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) ||
(mime_part->detected_type && strcmp (mime_part->detected_type, "text") == 0)) {
- found_txt = TRUE;
+ res = RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN;
+ rspamd_ftok_t html_tok, xhtml_tok;
html_tok.begin = "html";
html_tok.len = 4;
@@ -819,25 +822,35 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0 ||
rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0 ||
(mime_part->detected_ext &&
- strcmp (mime_part->detected_ext, "html") == 0)) {
- found_html = TRUE;
+ strcmp (mime_part->detected_ext, "html") == 0)) {
+ res = RSPAMD_MESSAGE_PART_IS_TEXT_HTML;
}
}
/* Skip attachments */
- if ((found_txt || found_html) &&
- (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
+ if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT &&
+ (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
if (!task->cfg->check_text_attachements) {
debug_task ("skip attachments for checking as text parts");
- return FALSE;
- }
- else {
- flags |= RSPAMD_MIME_TEXT_PART_ATTACHMENT;
+ return RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
}
}
- else if (!(found_txt || found_html)) {
- /* Not a text part */
- return FALSE;
+
+ return res;
+}
+
+static gboolean
+rspamd_message_process_text_part_maybe (struct rspamd_task *task,
+ struct rspamd_mime_part *mime_part,
+ enum rspamd_message_part_is_text_result is_text)
+{
+ struct rspamd_mime_text_part *text_part;
+ guint flags = 0;
+ enum rspamd_action_type act;
+
+ /* Skip attachments */
+ if ((mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
+ flags |= RSPAMD_MIME_TEXT_PART_ATTACHMENT;
}
text_part = rspamd_mempool_alloc0 (task->task_pool,
@@ -850,7 +863,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
text_part->utf_stripped_text = (UText)UTEXT_INITIALIZER;
text_part->flags |= flags;
- if (found_html) {
+ if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
if (!rspamd_message_process_html_text_part (task, text_part)) {
return FALSE;
}
@@ -1431,7 +1444,12 @@ rspamd_message_process (struct rspamd_task *task)
/* Still no content detected, try text heuristic */
if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
!(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
- rspamd_message_process_text_part_maybe (task, part);
+ enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part);
+
+ if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) {
+ rspamd_message_process_text_part_maybe (task, part, res);
+ }
+
}
}
More information about the Commits
mailing list