commit a7f5e5e: [Fix] Fix normalization of non-alphabet based languages

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Aug 27 17:49:10 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-27 18:20:59 +0100
URL: https://github.com/rspamd/rspamd/commit/a7f5e5eb06168374f1ee25b744e4b37f1ad4c8a0

[Fix] Fix normalization of non-alphabet based languages

---
 src/libstat/backends/redis_backend.c | 3 ++-
 src/libstat/tokenizers/tokenizers.c  | 8 ++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c
index 9ac6fb445..9dd3624fb 100644
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -526,7 +526,8 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
 								"HSET %b_tokens %b %b",
 								prefix, (size_t) prefix_len,
 								n0, (size_t) l0,
-								tok->t1->stemmed.begin, tok->t1->stemmed.len);
+								tok->t1->stemmed.begin,
+								tok->t1->stemmed.len);
 					}
 				}
 				else {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index ea3c84c67..000f2033c 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -679,14 +679,10 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 			}
 #endif
 
-			if (cat == U_UPPERCASE_LETTER ||
-					cat == U_LOWERCASE_LETTER ||
-					cat == U_DECIMAL_DIGIT_NUMBER ||
+			if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
 					cat == U_CONNECTOR_PUNCTUATION ||
 					cat == U_MATH_SYMBOL ||
-					cat == U_CURRENCY_SYMBOL ||
-					cat == U_INITIAL_PUNCTUATION ||
-					cat == U_FINAL_PUNCTUATION) {
+					cat == U_CURRENCY_SYMBOL) {
 				*d++ = u_tolower (t);
 			}
 		}


More information about the Commits mailing list