commit 070120e: [Fix] Ignore non-unique stop words

Vsevolod Stakhov vsevolod at rspamd.com
Sat Apr 29 17:14:08 UTC 2023


Author: Vsevolod Stakhov
Date: 2023-04-29 16:00:52 +0100
URL: https://github.com/rspamd/rspamd/commit/070120ed1370ac7179cf4945195294df6a26b4dc

[Fix] Ignore non-unique stop words

---
 src/libmime/lang_detection.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 211dfe48b..d8e81e075 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1583,7 +1583,10 @@ rspamd_langelt_equal_func (gconstpointer v, gconstpointer v2)
 	return strcmp (elt1->name, elt2->name) == 0;
 }
 
-KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, int, 1,
+/* This hash set stores a word index in the language to avoid duplicate stop words */
+KHASH_INIT (rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal);
+
+KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1,
 		rspamd_langelt_hash_func, rspamd_langelt_equal_func);
 
 struct rspamd_sw_cbdata {
@@ -1652,9 +1655,20 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 	gint nwords = 1;
 
 	if (k != kh_end (cbdata->res)) {
-		nwords = ++ kh_value (cbdata->res, k);
+		khiter_t set_k;
+		int tt;
+
+		set_k = kh_get(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum);
+		nwords = kh_size(kh_value(cbdata->res, k));
 
-		if (kh_value (cbdata->res, k) > max_stop_words) {
+		if (set_k == kh_end(kh_value(cbdata->res, k))) {
+			/* New word */
+			set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
+			msg_debug_lang_det ("found new word %*s from %s language (%d stop words found so far)",
+				(int)(next - prev - 1), prev + 1, r->elt->name, nwords);
+		}
+
+		if (nwords > max_stop_words) {
 			return 1;
 		}
 	}
@@ -1662,11 +1676,12 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 		gint tt;
 
 		k = kh_put (rspamd_sw_hash, cbdata->res, r->elt, &tt);
-		kh_value (cbdata->res, k) = 1;
-	}
+		kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set);
+		kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
 
-	msg_debug_lang_det ("found word %*s from %s language (%d stop words found so far)",
+		msg_debug_lang_det ("found new word %*s from %s language (%d stop words found so far)",
 			(int)(next - prev - 1), prev + 1, r->elt->name, nwords);
+	}
 
 	return 0;
 }
@@ -1693,13 +1708,15 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 			&cbdata, NULL);
 
 	if (kh_size (cbdata.res) > 0) {
-		gint cur_matches;
+		khash_t(rspamd_sw_res_set) *cur_res;
 		double max_rate = G_MINDOUBLE;
 		struct rspamd_language_elt *cur_lang, *sel = NULL;
 		gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
 
 		again:
-		kh_foreach (cbdata.res, cur_lang, cur_matches, {
+		kh_foreach (cbdata.res, cur_lang, cur_res, {
+			int cur_matches = kh_size(cur_res);
+
 			if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
 				/* Restart matches */
 				ignore_ascii = TRUE;
@@ -1746,6 +1763,11 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 					cur_matches, cur_lang->name, rate);
 		});
 
+		/* Cleanup */
+		kh_foreach (cbdata.res, cur_lang, cur_res, {
+			kh_destroy (rspamd_sw_res_set, cur_res);
+		});
+
 		if (max_rate > 0 && sel) {
 			msg_debug_lang_det ("set language based on stop words script %s, %.3f found",
 					sel->name, max_rate);


More information about the Commits mailing list