commit bdad476: [Minor] Do not try to detect utf8 using heuristic

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Jan 9 15:28:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-09 15:22:14 +0000
URL: https://github.com/rspamd/rspamd/commit/bdad476ce3f9b889c3d498bf66755882caf5e067 (HEAD -> master)

[Minor] Do not try to detect utf8 using heuristic

---
 src/libmime/mime_encoding.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 0ba0e0edd..1f130325e 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -36,7 +36,7 @@
 #define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)
 
 #define RSPAMD_CHARSET_CACHE_SIZE 32
-#define RSPAMD_CHARSET_MAX_CONTENT 128
+#define RSPAMD_CHARSET_MAX_CONTENT 512
 
 #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
 #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
@@ -625,28 +625,30 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
 		 * corner cases
 		 */
 		if (content_check) {
-			real_charset = rspamd_mime_charset_find_by_content (in,
-					MIN (RSPAMD_CHARSET_MAX_CONTENT, len));
+			if (rspamd_fast_utf8_validate (in, len) != 0) {
+				real_charset = rspamd_mime_charset_find_by_content (in,
+						MIN (RSPAMD_CHARSET_MAX_CONTENT, len));
 
-			if (real_charset) {
+				if (real_charset) {
 
-				if (rspamd_regexp_match (utf_compatible_re,
-						real_charset, strlen (real_charset), TRUE)) {
-					RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
+					if (rspamd_regexp_match (utf_compatible_re,
+							real_charset, strlen (real_charset), TRUE)) {
+						RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
 
-					return TRUE;
-				}
-				else {
-					charset->begin = real_charset;
-					charset->len = strlen (real_charset);
+						return TRUE;
+					}
+					else {
+						charset->begin = real_charset;
+						charset->len = strlen (real_charset);
 
-					return FALSE;
+						return FALSE;
+					}
 				}
+
+				rspamd_mime_charset_utf_enforce (in, len);
 			}
 		}
 
-		rspamd_mime_charset_utf_enforce (in, len);
-
 		return TRUE;
 	}
 


More information about the Commits mailing list