commit 92e1b61: [Minor] Langdet: Add threshold for stop words

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Aug 2 17:35:06 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-02 18:28:29 +0100
URL: https://github.com/rspamd/rspamd/commit/92e1b614db2bba173c3352455c3454249d357c9d (HEAD -> master)

[Minor] Langdet: Add threshold for stop words

---
 src/libmime/lang_detection.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 74c6f7247..9ccd7bef5 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1650,6 +1650,7 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 	struct rspamd_stop_word_elt *elt;
 	struct rspamd_sw_cbdata cbdata;
 	gboolean ret = FALSE;
+	static const int stop_words_threshold = 4;
 
 	elt = &d->stop_words[cat];
 	cbdata.res = kh_init (rspamd_sw_hash);
@@ -1667,7 +1668,12 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 		struct rspamd_language_elt *cur_lang;
 
 		kh_foreach (cbdata.res, cur_lang, cur_matches, {
+			if (cur_matches < stop_words_threshold) {
+				continue;
+			}
+
 			double rate = (double)cur_matches / (double)cur_lang->stop_words;
+
 			if (rate > max_rate) {
 				max_rate = rate;
 				sel = cur_lang->name;


More information about the Commits mailing list