commit c6d568b: [Fix] Lang_det: Try better to distinguish Chinese and Japanese

Wed Jun 5 11:07:03 UTC 2019

Author: Vsevolod Stakhov
Date: 2019-06-05 12:04:15 +0100
URL: https://github.com/rspamd/rspamd/commit/c6d568b4bd53af5d9f7d25d9381e5d8b72153564 (HEAD -> master)

[Fix] Lang_det: Try better to distinguish Chinese and Japanese

---
 src/libmime/lang_detection.c | 60 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 13 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 143dc3f38..873135e03 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1349,13 +1349,16 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
 
 static void
 rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
-										  struct rspamd_mime_text_part *part)
+										  struct rspamd_mime_text_part *part,
+										  guint *pchinese,
+										  guint *pspecial)
 {
 	const gchar *p = part->utf_stripped_content->data, *end;
-	guint i = 0;
+	guint i = 0, cnt = 0;
 	end = p + part->utf_stripped_content->len;
 	gint32 uc, sc;
 	guint nlatin = 0, nchinese = 0, nspecial = 0;
+	const guint cutoff_limit = 32;
 
 	while (p + i < end) {
 		U8_NEXT (p, i, part->utf_stripped_content->len, uc);
@@ -1366,6 +1369,7 @@ rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
 
 		if (u_isalpha (uc)) {
 			sc = ublock_getCode (uc);
+			cnt ++;
 
 			switch (sc) {
 			case UBLOCK_BASIC_LATIN:
@@ -1446,10 +1450,10 @@ rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
 			}
 		}
 
-		if (nspecial > 6 && nspecial > nlatin) {
+		if (nspecial > cutoff_limit && nspecial > nlatin) {
 			break;
 		}
-		else if (nchinese > 6 && nchinese > nlatin) {
+		else if (nchinese > cutoff_limit && nchinese > nlatin) {
 			if (nspecial > 0) {
 				/* Likely japanese */
 				break;
@@ -1459,7 +1463,10 @@ rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
 
 	msg_debug_lang_det ("stop after checking %d characters, "
 						"%d latin, %d special, %d chinese",
-			i, nlatin, nspecial, nchinese);
+			cnt, nlatin, nspecial, nchinese);
+
+	*pchinese = nchinese;
+	*pspecial = nspecial;
 }
 
 static inline void
@@ -1483,22 +1490,48 @@ rspamd_language_detector_set_language (struct rspamd_task *task,
 
 static gboolean
 rspamd_language_detector_try_uniscript (struct rspamd_task *task,
-										struct rspamd_mime_text_part *part)
+										struct rspamd_mime_text_part *part,
+										guint nchinese,
+										guint nspecial)
 {
 	guint i;
 
 	for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) {
 		if (unicode_langs[i].unicode_code & part->unicode_scripts) {
-			msg_debug_lang_det ("set language based on unicode script %s",
-					unicode_langs[i].lang);
-			rspamd_language_detector_set_language (task, part,
-					unicode_langs[i].lang);
 
-			return TRUE;
+			if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) {
+				msg_debug_lang_det ("set language based on unicode script %s",
+						unicode_langs[i].lang);
+				rspamd_language_detector_set_language (task, part,
+						unicode_langs[i].lang);
+
+				return TRUE;
+			}
+			else {
+				/* Japanese <-> Chinese guess */
+
+				/*
+				 * Typically there might be around 0-70% of kanji glyphs
+				 * and the rest are Hiragana/Katakana
+				 *
+				 * If we discover that Kanji is more than 80% then we consider
+				 * it Chinese
+				 */
+				if (nchinese <= 5 || nchinese < nspecial * 5) {
+					msg_debug_lang_det ("set language based on unicode script %s",
+							unicode_langs[i].lang);
+					rspamd_language_detector_set_language (task, part,
+							unicode_langs[i].lang);
+
+					return TRUE;
+				}
+			}
 		}
 	}
 
 	if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
+		msg_debug_lang_det ("guess chinese based on CJK characters: %d chinese, %d special",
+				nchinese, nspecial);
 		rspamd_language_detector_set_language (task, part,
 				"zh-CN");
 
@@ -1670,10 +1703,11 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 
 	start_ticks = rspamd_get_ticks (TRUE);
 
-	rspamd_language_detector_unicode_scripts (task, part);
+	guint nchinese = 0, nspecial = 0;
+	rspamd_language_detector_unicode_scripts (task, part, &nchinese, &nspecial);
 	/* Apply unicode scripts heuristic */
 
-	if (rspamd_language_detector_try_uniscript (task, part)) {
+	if (rspamd_language_detector_try_uniscript (task, part, nchinese, nspecial)) {
 		ret = TRUE;
 	}