commit 16093e4: [Minor] Show stop words found

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Aug 2 17:35:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-02 18:17:23 +0100
URL: https://github.com/rspamd/rspamd/commit/16093e49010ddf64295e520ff086e858b7447c5c

[Minor] Show stop words found

---
 src/libmime/lang_detection.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 0f1563d69..276771778 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1556,6 +1556,7 @@ KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, int, 1,
 		rspamd_langelt_hash_func, rspamd_langelt_equal_func);
 
 struct rspamd_sw_cbdata {
+	struct rspamd_task *task;
 	khash_t (rspamd_sw_hash) *res;
 	GArray *ranges;
 };
@@ -1591,6 +1592,7 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 	struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context;
 	khiter_t k;
 	static const gsize max_stop_words = 80;
+	struct rspamd_task *task;
 
 	if (match_start > 0) {
 		prev = text + match_start - 1;
@@ -1609,14 +1611,17 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 	}
 
 	/* We have a word on the boundary, check range */
+	task = cbdata->task;
 	r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data,
 			cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp);
+
 	g_assert (r != NULL);
 
 	k = kh_get (rspamd_sw_hash, cbdata->res, r->elt);
+	gint nwords = 1;
 
 	if (k != kh_end (cbdata->res)) {
-		kh_value (cbdata->res, k) ++;
+		nwords = ++ kh_value (cbdata->res, k);
 
 		if (kh_value (cbdata->res, k) > max_stop_words) {
 			return 1;
@@ -1629,6 +1634,9 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 		kh_value (cbdata->res, k) = 1;
 	}
 
+	msg_debug_lang_det ("found word %*s from %s language (%d stop words found so far)",
+			(int)(next - prev - 1), prev + 1, r->elt->name, nwords);
+
 	return 0;
 }
 
@@ -1645,6 +1653,7 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 	elt = &d->stop_words[cat];
 	cbdata.res = kh_init (rspamd_sw_hash);
 	cbdata.ranges = elt->ranges;
+	cbdata.task = task;
 
 	rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data,
 			part->utf_stripped_content->len, rspamd_language_detector_sw_cb,


More information about the Commits mailing list