commit a455c65: [Fix] Further fixes in charset detection

Mon Jan 27 18:28:07 UTC 2020

Author: Vsevolod Stakhov
Date: 2020-01-27 18:21:12 +0000
URL: https://github.com/rspamd/rspamd/commit/a455c6542212948969b6c115344e9d059606407a (HEAD -> master)

[Fix] Further fixes in charset detection

---
 src/libmime/mime_encoding.c | 119 ++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 55 deletions(-)

diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 1f130325e..646b31eae 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -573,17 +573,10 @@ rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
 		g_assert (csd != NULL);
 	}
 
-	/* If text is ascii, then we can treat it as utf8 data */
-	for (i = 0; i < inlen; i++) {
-		if ((((guchar)in[i]) & 0x80) != 0) {
-			goto detect;
-		}
+	if (rspamd_fast_utf8_validate (in, inlen) == 0) {
+		return UTF8_CHARSET;
 	}
 
-	return UTF8_CHARSET;
-
-detect:
-
 	ucsdet_setText (csd, in, inlen, &uc_err);
 	csm = ucsdet_detectAll (csd, &matches, &uc_err);
 
@@ -661,15 +654,11 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 {
 	GError *err = NULL;
 	const gchar *charset = NULL;
-	gboolean checked = FALSE, need_charset_heuristic = TRUE;
+	gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
 	GByteArray *part_content;
 	rspamd_ftok_t charset_tok;
 	struct rspamd_mime_part *part = text_part->mime_part;
 
-	if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
-		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
-	}
-
 	/* Allocate copy storage */
 	part_content = g_byte_array_sized_new (text_part->parsed.len);
 	memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
@@ -680,18 +669,20 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 			(rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
 
 	if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
+		if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
+			/* Valid UTF, likely all good */
+			need_charset_heuristic = FALSE;
+			valid_utf8 = TRUE;
+			checked = TRUE;
+		}
+
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
 	}
-
-	if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
+	else {
+		/* All 7bit characters, assume it valid utf */
 		need_charset_heuristic = FALSE;
-	}
-
-	if (task->cfg && task->cfg->raw_mode) {
-		SET_PART_RAW (text_part);
-		text_part->utf_raw_content = part_content;
-
-		return;
+		valid_utf8 = TRUE;
+		checked = TRUE; /* Already valid utf, no need in further checks */
 	}
 
 	if (part->ct->charset.len == 0) {
@@ -706,7 +697,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 			checked = TRUE;
 			text_part->real_charset = charset;
 		}
-		else {
+		else if (valid_utf8) {
 			SET_PART_UTF (text_part);
 			text_part->utf_raw_content = part_content;
 			text_part->real_charset = UTF8_CHARSET;
@@ -719,17 +710,30 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 				task->task_pool);
 
 		if (charset == NULL) {
-			charset = rspamd_mime_charset_find_by_content (part_content->data,
-					MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
-			msg_info_task ("detected charset: %s", charset);
-			checked = TRUE;
-			text_part->real_charset = charset;
+			/* We don't know the real charset but can try heuristic */
+			if (need_charset_heuristic) {
+				charset = rspamd_mime_charset_find_by_content (part_content->data,
+						MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
+				msg_info_task ("detected charset: %s", charset);
+				checked = TRUE;
+				text_part->real_charset = charset;
+			}
+			else if (valid_utf8) {
+				/* We already know that the input is valid utf, so skip heuristic */
+				text_part->real_charset = UTF8_CHARSET;
+			}
+		}
+		else {
+			/*
+			 * We have detected some charset, but we don't know which one
+			 */
+			valid_utf8 = FALSE;
 		}
 	}
 
-	if (charset == NULL) {
-		msg_info_task ("<%s>: has invalid charset",
-				MESSAGE_FIELD_CHECK (task, message_id));
+	if (text_part->real_charset == NULL) {
+		msg_info_task ("<%s>: has invalid charset; original: %T",
+				MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset);
 		SET_PART_RAW (text_part);
 		text_part->utf_raw_content = part_content;
 
@@ -738,32 +742,37 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 
 	RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
 
-	if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
-			part_content->len, !checked)) {
-		SET_PART_UTF (text_part);
-		text_part->utf_raw_content = part_content;
-		text_part->real_charset = UTF8_CHARSET;
-
-		return;
-	}
-	else {
-		charset = charset_tok.begin;
-
-		if (!rspamd_mime_text_part_utf8_convert (task, text_part,
-				part_content, charset, &err)) {
-			msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
-					MESSAGE_FIELD (task, message_id),
-					charset,
-					err ? err->message : "unknown problem");
-			SET_PART_RAW (text_part);
-			g_error_free (err);
-
+	if (!valid_utf8) {
+		if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
+				part_content->len, !checked)) {
+			SET_PART_UTF (text_part);
 			text_part->utf_raw_content = part_content;
+			text_part->real_charset = UTF8_CHARSET;
+
 			return;
 		}
+		else {
+			charset = charset_tok.begin;
+
+			if (!rspamd_mime_text_part_utf8_convert (task, text_part,
+					part_content, charset, &err)) {
+				msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
+						MESSAGE_FIELD (task, message_id),
+						charset,
+						err ? err->message : "unknown problem");
+				SET_PART_RAW (text_part);
+				g_error_free (err);
+
+				text_part->utf_raw_content = part_content;
+				return;
+			}
 
-		text_part->real_charset = charset;
+			SET_PART_UTF (text_part);
+			text_part->real_charset = charset;
+		}
+	}
+	else {
+		SET_PART_UTF (text_part);
+		text_part->utf_raw_content = part_content;
 	}
-
-	SET_PART_UTF (text_part);
 }