commit 4bba6e3: [CritFix] Langdet: Fix language detection when no stop words are found

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jun 5 13:07:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-06-05 13:59:45 +0100
URL: https://github.com/rspamd/rspamd/commit/4bba6e33a91047f1ea3a0360e6a4480d4b51d26f (HEAD -> master)

[CritFix] Langdet: Fix language detection when no stop words are found

---
 src/libmime/lang_detection.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index aad01ec8a..0312d009b 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1718,13 +1718,30 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 	}
 
 	if (!ret) {
-		if (part->nwords < default_short_text_limit) {
+		if (part->utf_words->len < default_short_text_limit) {
 			r = rs_detect_none;
 			msg_debug_lang_det ("text is too short for trigramms detection: "
 					   "%d words; at least %d words required",
-					(int)part->nwords,
+					(int)part->utf_words->len,
 					(int)default_short_text_limit);
-			rspamd_language_detector_set_language (task, part, "en");
+			switch (cat) {
+			case RSPAMD_LANGUAGE_CYRILLIC:
+				rspamd_language_detector_set_language (task, part, "ru");
+				break;
+			case RSPAMD_LANGUAGE_DEVANAGARI:
+				rspamd_language_detector_set_language (task, part, "hi");
+				break;
+			case RSPAMD_LANGUAGE_ARAB:
+				rspamd_language_detector_set_language (task, part, "ar");
+				break;
+			default:
+			case RSPAMD_LANGUAGE_LATIN:
+				rspamd_language_detector_set_language (task, part, "en");
+				break;
+			}
+			msg_debug_lang_det ("set %s language based on symbols category",
+					part->language);
+
 			candidates = kh_init (rspamd_candidates_hash);
 		}
 		else {


More information about the Commits mailing list