commit ab43e08: [Feature] Try to filter bad unicode types during normalisation

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Feb 25 18:21:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-02-25 18:19:51 +0000
URL: https://github.com/rspamd/rspamd/commit/ab43e080ebc5fea5a2c54bcad9180202b1a38711 (HEAD -> master)

[Feature] Try to filter bad unicode types during normalisation

---
 src/libstat/stat_api.h              |  1 +
 src/libstat/tokenizers/tokenizers.c | 20 +++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 533c42948..f9d1aab5a 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -39,6 +39,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
 #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
 
 typedef struct rspamd_stat_token_s {
 	rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index acbbcf2f0..caa4a48a5 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -610,7 +610,25 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 		U16_NEXT_UNSAFE (src, i, t);
 
 		if (u_isgraph (t)) {
-			*d++ = u_tolower (t);
+			UCharCategory cat;
+
+			cat = u_charType (t);
+#if U_ICU_VERSION_MAJOR_NUM >= 57
+			if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
+				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
+			}
+#endif
+
+			if (cat == U_UPPERCASE_LETTER ||
+					cat == U_LOWERCASE_LETTER ||
+					cat == U_DECIMAL_DIGIT_NUMBER ||
+					cat == U_CONNECTOR_PUNCTUATION ||
+					cat == U_MATH_SYMBOL ||
+					cat == U_CURRENCY_SYMBOL ||
+					cat == U_INITIAL_PUNCTUATION ||
+					cat == U_FINAL_PUNCTUATION) {
+				*d++ = u_tolower (t);
+			}
 		}
 		else {
 			/* Invisible spaces ! */


More information about the Commits mailing list