commit cd1deb3: [Minor] Add some more heuristics for stop words detection

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Feb 8 13:42:08 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-02-08 13:36:58 +0000
URL: https://github.com/rspamd/rspamd/commit/cd1deb3c19e84b759924c5208d2e1405be3b857a (HEAD -> master)

[Minor] Add some more heuristics for stop words detection

---
 src/libmime/lang_detection.c | 40 +++++++++++++++++++++++++++++++++++++++-
 src/libmime/lang_detection.h |  1 +
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index f27d71a76..a178b1bf8 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -446,6 +446,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 				if (strcmp (fl, "diacritics") == 0) {
 					nelt->flags |= RS_LANGUAGE_DIACRITICS;
 				}
+				else if (strcmp (fl, "ascii") == 0) {
+					nelt->flags |= RS_LANGUAGE_ASCII;
+				}
 				else {
 					msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
 				}
@@ -1668,7 +1671,8 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 	struct rspamd_stop_word_elt *elt;
 	struct rspamd_sw_cbdata cbdata;
 	gboolean ret = FALSE;
-	static const int stop_words_threshold = 4;
+	static const int stop_words_threshold = 4, /* minimum stop words count */
+			strong_confidence_threshold = 10 /* we are sure that this is enough */;
 
 	elt = &d->stop_words[cat];
 	cbdata.res = kh_init (rspamd_sw_hash);
@@ -1683,18 +1687,52 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 		gint cur_matches;
 		double max_rate = G_MINDOUBLE;
 		struct rspamd_language_elt *cur_lang, *sel = NULL;
+		gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
 
+		again:
 		kh_foreach (cbdata.res, cur_lang, cur_matches, {
+			if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
+				/* Restart matches */
+				ignore_ascii = TRUE;
+				sel = NULL;
+				max_rate = G_MINDOUBLE;
+				msg_debug_lang_det ("ignore ascii after finding %d stop words from %s",
+						cur_matches, cur_lang->name);
+				goto again;
+			}
+
+			if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) {
+				/* Restart matches */
+				ignore_latin = TRUE;
+				sel = NULL;
+				max_rate = G_MINDOUBLE;
+				msg_debug_lang_det ("ignore latin after finding stop %d words from %s",
+						cur_matches, cur_lang->name);
+				goto again;
+			}
+
 			if (cur_matches < stop_words_threshold) {
 				continue;
 			}
 
+			if (cur_matches < strong_confidence_threshold) {
+				/* Ignore mixed languages when not enough confidence */
+				if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) {
+					continue;
+				}
+
+				if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) {
+					continue;
+				}
+			}
+
 			double rate = (double)cur_matches / (double)cur_lang->stop_words;
 
 			if (rate > max_rate) {
 				max_rate = rate;
 				sel = cur_lang;
 			}
+
 			msg_debug_lang_det ("found %d stop words from %s: %3f rate",
 					cur_matches, cur_lang->name, rate);
 		});
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index b1382e6ad..6c3234848 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -56,6 +56,7 @@ enum rspamd_language_elt_flags {
 	RS_LANGUAGE_TIER1 = (1 << 3),
 	RS_LANGUAGE_TIER0 = (1 << 4),
 	RS_LANGUAGE_DIACRITICS = (1 << 5),
+	RS_LANGUAGE_ASCII = (1 << 6),
 };
 
 struct rspamd_lang_detector_res {


More information about the Commits mailing list