commit e6f1e32: [Minor] Chartable: Adjustments to the metatokens handling

Vsevolod Stakhov vsevolod at rspamd.com
Thu Dec 8 21:42:03 UTC 2022


Author: Vsevolod Stakhov
Date: 2022-12-08 21:36:36 +0000
URL: https://github.com/rspamd/rspamd/commit/e6f1e32b07e275379e779e83f62d09c0ed15209f (HEAD -> master)

[Minor] Chartable: Adjustments to the metatokens handling

---
 src/plugins/chartable.cxx | 238 +++++++++++++++++++++++-----------------------
 1 file changed, 120 insertions(+), 118 deletions(-)

diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx
index c5820c606..6e3fd9b10 100644
--- a/src/plugins/chartable.cxx
+++ b/src/plugins/chartable.cxx
@@ -45,18 +45,20 @@
 INIT_LOG_MODULE(chartable)
 
 /* Initialization */
-gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
-gint chartable_module_config (struct rspamd_config *cfg, bool validate);
-gint chartable_module_reconfig (struct rspamd_config *cfg);
+gint chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);
+
+gint chartable_module_config(struct rspamd_config *cfg, bool validate);
+
+gint chartable_module_reconfig(struct rspamd_config *cfg);
 
 module_t chartable_module = {
-		"chartable",
-		chartable_module_init,
-		chartable_module_config,
-		chartable_module_reconfig,
-		nullptr,
-		RSPAMD_MODULE_VER,
-		(guint)-1,
+	"chartable",
+	chartable_module_init,
+	chartable_module_config,
+	chartable_module_reconfig,
+	nullptr,
+	RSPAMD_MODULE_VER,
+	(guint) -1,
 };
 
 struct chartable_ctx {
@@ -68,21 +70,22 @@ struct chartable_ctx {
 };
 
 static inline struct chartable_ctx *
-chartable_get_context (struct rspamd_config *cfg)
+chartable_get_context(struct rspamd_config *cfg)
 {
-	return (struct chartable_ctx *)g_ptr_array_index (cfg->c_modules,
-			chartable_module.ctx_offset);
+	return (struct chartable_ctx *) g_ptr_array_index(cfg->c_modules,
+		chartable_module.ctx_offset);
 }
 
-static void chartable_symbol_callback (struct rspamd_task *task,
-									   struct rspamd_symcache_dynamic_item *item,
-									   void *unused);
-static void chartable_url_symbol_callback (struct rspamd_task *task,
-										   struct rspamd_symcache_dynamic_item *item,
-										   void *unused);
+static void chartable_symbol_callback(struct rspamd_task *task,
+									  struct rspamd_symcache_dynamic_item *item,
+									  void *unused);
+
+static void chartable_url_symbol_callback(struct rspamd_task *task,
+										  struct rspamd_symcache_dynamic_item *item,
+										  void *unused);
 
 gint
-chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
+chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
 {
 	struct chartable_ctx *chartable_module_ctx;
 
@@ -90,40 +93,40 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
 		struct chartable_ctx);
 	chartable_module_ctx->max_word_len = 10;
 
-	*ctx = (struct module_ctx *)chartable_module_ctx;
+	*ctx = (struct module_ctx *) chartable_module_ctx;
 
 	return 0;
 }
 
 
 gint
-chartable_module_config (struct rspamd_config *cfg, bool validate)
+chartable_module_config(struct rspamd_config *cfg, bool _)
 {
 	const ucl_object_t *value;
 	gint res = TRUE;
-	struct chartable_ctx *chartable_module_ctx = chartable_get_context (cfg);
+	struct chartable_ctx *chartable_module_ctx = chartable_get_context(cfg);
 
-	if (!rspamd_config_is_module_enabled (cfg, "chartable")) {
+	if (!rspamd_config_is_module_enabled(cfg, "chartable")) {
 		return TRUE;
 	}
 
 	if ((value =
-		rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != nullptr) {
-		chartable_module_ctx->symbol = ucl_obj_tostring (value);
+			 rspamd_config_get_module_opt(cfg, "chartable", "symbol")) != nullptr) {
+		chartable_module_ctx->symbol = ucl_obj_tostring(value);
 	}
 	else {
 		chartable_module_ctx->symbol = DEFAULT_SYMBOL;
 	}
 	if ((value =
-		rspamd_config_get_module_opt (cfg, "chartable", "url_symbol")) != nullptr) {
-		chartable_module_ctx->url_symbol = ucl_obj_tostring (value);
+			 rspamd_config_get_module_opt(cfg, "chartable", "url_symbol")) != nullptr) {
+		chartable_module_ctx->url_symbol = ucl_obj_tostring(value);
 	}
 	else {
 		chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL;
 	}
 	if ((value =
-		rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != nullptr) {
-		if (!ucl_obj_todouble_safe (value, &chartable_module_ctx->threshold)) {
+			 rspamd_config_get_module_opt(cfg, "chartable", "threshold")) != nullptr) {
+		if (!ucl_obj_todouble_safe(value, &chartable_module_ctx->threshold)) {
 			msg_warn_config ("invalid numeric value");
 			chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
 		}
@@ -132,37 +135,37 @@ chartable_module_config (struct rspamd_config *cfg, bool validate)
 		chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
 	}
 	if ((value =
-			rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != nullptr) {
-		chartable_module_ctx->max_word_len = ucl_object_toint (value);
+			 rspamd_config_get_module_opt(cfg, "chartable", "max_word_len")) != nullptr) {
+		chartable_module_ctx->max_word_len = ucl_object_toint(value);
 	}
 	else {
 		chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
 	}
 
-	rspamd_symcache_add_symbol (cfg->cache,
-			chartable_module_ctx->symbol,
-			0,
-			chartable_symbol_callback,
-			nullptr,
-			SYMBOL_TYPE_NORMAL,
-			-1);
-	rspamd_symcache_add_symbol (cfg->cache,
-			chartable_module_ctx->url_symbol,
-			0,
-			chartable_url_symbol_callback,
-			nullptr,
-			SYMBOL_TYPE_NORMAL,
-			-1);
-
-	msg_info_config ("init internal chartable module");
+	rspamd_symcache_add_symbol(cfg->cache,
+		chartable_module_ctx->symbol,
+		0,
+		chartable_symbol_callback,
+		nullptr,
+		SYMBOL_TYPE_NORMAL,
+		-1);
+	rspamd_symcache_add_symbol(cfg->cache,
+		chartable_module_ctx->url_symbol,
+		0,
+		chartable_url_symbol_callback,
+		nullptr,
+		SYMBOL_TYPE_NORMAL,
+		-1);
+
+	msg_info_config("init internal chartable module");
 
 	return res;
 }
 
 gint
-chartable_module_reconfig (struct rspamd_config *cfg)
+chartable_module_reconfig(struct rspamd_config *cfg)
 {
-	return chartable_module_config (cfg, false);
+	return chartable_module_config(cfg, false);
 }
 
 static const auto latin_confusable = ankerl::unordered_dense::set<int>{
@@ -321,19 +324,18 @@ static const auto latin_confusable = ankerl::unordered_dense::set<int>{
 };
 
 static gboolean
-rspamd_can_alias_latin (gint ch)
+rspamd_can_alias_latin(gint ch)
 {
 	return latin_confusable.contains(ch);
 }
 
 static gdouble
-rspamd_chartable_process_word_utf (struct rspamd_task *task,
-								   rspamd_stat_token_t *w,
-								   gboolean is_url,
-								   guint *ncap,
-								   struct chartable_ctx *chartable_module_ctx,
-								   const gchar *lang,
-								   gboolean ignore_diacritics)
+rspamd_chartable_process_word_utf(struct rspamd_task *task,
+								  rspamd_stat_token_t *w,
+								  gboolean is_url,
+								  guint *ncap,
+								  struct chartable_ctx *chartable_module_ctx,
+								  gboolean ignore_diacritics)
 {
 	const UChar32 *p, *end;
 	gdouble badness = 0.0;
@@ -357,12 +359,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 	while (p < end) {
 		uc = *p++;
 
-		if (((gint32)uc) < 0) {
+		if (((gint32) uc) < 0) {
 			break;
 		}
 
-		sc = ublock_getCode (uc);
-		cat = u_charType (uc);
+		sc = ublock_getCode(uc);
+		cat = u_charType(uc);
 
 		if (!ignore_diacritics) {
 			if (cat == U_NON_SPACING_MARK ||
@@ -375,10 +377,10 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 			}
 		}
 
-		if (u_isalpha (uc)) {
+		if (u_isalpha(uc)) {
 
 			if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS ||
-					sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
+				sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
 				/*
 				 * Assume all latin, IPA, diacritic and space modifiers
 				 * characters as basic latin
@@ -386,16 +388,16 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 				sc = UBLOCK_BASIC_LATIN;
 			}
 
-			if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) {
+			if (sc != UBLOCK_BASIC_LATIN && u_isupper(uc)) {
 				if (ncap) {
-					(*ncap) ++;
+					(*ncap)++;
 				}
 			}
 
 			if (state == got_digit) {
 				/* Penalize digit -> alpha translations */
 				if (!is_url && sc != UBLOCK_BASIC_LATIN &&
-						prev_state != start_process) {
+					prev_state != start_process) {
 					badness += 0.25;
 				}
 			}
@@ -404,15 +406,15 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 				if (same_script_count > 0) {
 					if (sc != UBLOCK_BASIC_LATIN && last_is_latin) {
 
-						if (rspamd_can_alias_latin (uc)) {
-							badness += 1.0 / (gdouble)same_script_count;
+						if (rspamd_can_alias_latin(uc)) {
+							badness += 1.0 / (gdouble) same_script_count;
 						}
 
 						last_is_latin = 0;
 						same_script_count = 1;
 					}
 					else {
-						same_script_count ++;
+						same_script_count++;
 					}
 				}
 				else {
@@ -425,7 +427,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 			state = got_alpha;
 
 		}
-		else if (u_isdigit (uc)) {
+		else if (u_isdigit(uc)) {
 			if (state != got_digit) {
 				prev_state = state;
 			}
@@ -443,7 +445,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 			same_script_count = 0;
 		}
 
-		nsym ++;
+		nsym++;
 	}
 
 	if (nspecial > 0) {
@@ -467,17 +469,17 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
 	}
 
 	msg_debug_chartable ("word %*s, badness: %.2f",
-			(gint)w->normalized.len, w->normalized.begin,
-			badness);
+		(gint) w->normalized.len, w->normalized.begin,
+		badness);
 
 	return badness;
 }
 
 static gdouble
-rspamd_chartable_process_word_ascii (struct rspamd_task *task,
-									 rspamd_stat_token_t *w,
-									 gboolean is_url,
-									 struct chartable_ctx *chartable_module_ctx)
+rspamd_chartable_process_word_ascii(struct rspamd_task *task,
+									rspamd_stat_token_t *w,
+									gboolean is_url,
+									struct chartable_ctx *chartable_module_ctx)
 {
 	gdouble badness = 0.0;
 	enum {
@@ -516,12 +518,12 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
 
 				if (same_script_count > 0) {
 					if (sc != last_sc) {
-						badness += 1.0 / (gdouble)same_script_count;
+						badness += 1.0 / (gdouble) same_script_count;
 						last_sc = sc;
 						same_script_count = 1;
 					}
 					else {
-						same_script_count ++;
+						same_script_count++;
 					}
 				}
 				else {
@@ -544,7 +546,7 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
 			same_script_count = 0;
 		}
 
-		p ++;
+		p++;
 	}
 
 	if (badness > 4.0) {
@@ -552,24 +554,24 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
 	}
 
 	msg_debug_chartable ("word %*s, badness: %.2f",
-			(gint)w->normalized.len, w->normalized.begin,
-			badness);
+		(gint) w->normalized.len, w->normalized.begin,
+		badness);
 
 	return badness;
 }
 
 static gboolean
-rspamd_chartable_process_part (struct rspamd_task *task,
-							   struct rspamd_mime_text_part *part,
-							   struct chartable_ctx *chartable_module_ctx,
-							   gboolean ignore_diacritics)
+rspamd_chartable_process_part(struct rspamd_task *task,
+							  struct rspamd_mime_text_part *part,
+							  struct chartable_ctx *chartable_module_ctx,
+							  gboolean ignore_diacritics)
 {
 	rspamd_stat_token_t *w;
 	guint i, ncap = 0;
 	gdouble cur_score = 0.0;
 
 	if (part == nullptr || part->utf_words == nullptr ||
-			part->utf_words->len == 0 || part->nwords == 0) {
+		part->utf_words->len == 0 || part->nwords == 0) {
 		return FALSE;
 	}
 
@@ -579,12 +581,12 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 		if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
 
 			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
-				cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
-						&ncap, chartable_module_ctx, part->language, ignore_diacritics);
+				cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
+					&ncap, chartable_module_ctx, ignore_diacritics);
 			}
 			else {
-				cur_score += rspamd_chartable_process_word_ascii (task, w,
-						FALSE, chartable_module_ctx);
+				cur_score += rspamd_chartable_process_word_ascii(task, w,
+					FALSE, chartable_module_ctx);
 			}
 		}
 	}
@@ -596,7 +598,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 	 */
 	part->capital_letters += ncap;
 
-	cur_score /= (gdouble)part->nwords;
+	cur_score /= (gdouble) part->nwords;
 
 	if (cur_score > 1.0) {
 		cur_score = 1.0;
@@ -604,7 +606,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 
 	if (cur_score > chartable_module_ctx->threshold) {
 		rspamd_task_insert_result (task, chartable_module_ctx->symbol,
-				cur_score, nullptr);
+			cur_score, nullptr);
 		return TRUE;
 	}
 
@@ -612,37 +614,37 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 }
 
 static void
-chartable_symbol_callback (struct rspamd_task *task,
-		struct rspamd_symcache_dynamic_item *item,
-		void *unused)
+chartable_symbol_callback(struct rspamd_task *task,
+						  struct rspamd_symcache_dynamic_item *item,
+						  void *_)
 {
 	guint i;
 	struct rspamd_mime_text_part *part;
-	struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg);
-	const gchar *language = nullptr;
-	gboolean ignore_diacritics = FALSE, seen_violated_part = FALSE;
+	struct chartable_ctx *chartable_module_ctx = chartable_get_context(task->cfg);
+	gboolean ignore_diacritics = TRUE, seen_violated_part = FALSE;
 
 	/* Check if we have parts with diacritic symbols language */
-	PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
+	PTR_ARRAY_FOREACH (MESSAGE_FIELD(task, text_parts), i, part) {
 		if (part->languages && part->languages->len > 0) {
-			struct rspamd_lang_detector_res *lang =
-					(struct rspamd_lang_detector_res *)g_ptr_array_index (part->languages, 0);
+			auto *lang = (struct rspamd_lang_detector_res *) g_ptr_array_index(part->languages, 0);
 			gint flags;
 
-			flags = rspamd_language_detector_elt_flags (lang->elt);
+			flags = rspamd_language_detector_elt_flags(lang->elt);
 
-			if (flags & RS_LANGUAGE_DIACRITICS) {
+			if ((flags & RS_LANGUAGE_DIACRITICS)) {
 				ignore_diacritics = TRUE;
 			}
+			else if (lang->prob > 0.75) {
+				ignore_diacritics = FALSE;
+			}
 		}
 
-		if (rspamd_chartable_process_part (task, part, chartable_module_ctx,
-				ignore_diacritics)) {
+		if (rspamd_chartable_process_part(task, part, chartable_module_ctx, ignore_diacritics)) {
 			seen_violated_part = TRUE;
 		}
 	}
 
-	if (MESSAGE_FIELD (task, text_parts)->len == 0) {
+	if (MESSAGE_FIELD(task, text_parts)->len == 0) {
 		/* No text parts, assume that we should ignore diacritics checks for metatokens */
 		ignore_diacritics = TRUE;
 	}
@@ -653,12 +655,12 @@ chartable_symbol_callback (struct rspamd_task *task,
 		gsize arlen = task->meta_words->len;
 
 		for (i = 0; i < arlen; i++) {
-			w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
-			cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
-					nullptr, chartable_module_ctx, language, ignore_diacritics);
+			w = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+			cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
+				nullptr, chartable_module_ctx, ignore_diacritics);
 		}
 
-		cur_score /= (gdouble)arlen;
+		cur_score /= (gdouble) (arlen + 1);
 
 		if (cur_score > 1.0) {
 			cur_score = 1.0;
@@ -672,19 +674,19 @@ chartable_symbol_callback (struct rspamd_task *task,
 				}
 			}
 
-			rspamd_task_insert_result (task, chartable_module_ctx->symbol,
-					cur_score, "subject");
+			rspamd_task_insert_result(task, chartable_module_ctx->symbol,
+				cur_score, "subject");
 
 		}
 	}
 
-	rspamd_symcache_finalize_item (task, item);
+	rspamd_symcache_finalize_item(task, item);
 }
 
 static void
-chartable_url_symbol_callback (struct rspamd_task *task,
-		struct rspamd_symcache_dynamic_item *item,
-		void *unused)
+chartable_url_symbol_callback(struct rspamd_task *task,
+							  struct rspamd_symcache_dynamic_item *item,
+							  void *unused)
 {
 	/* XXX: TODO: unbreak module once URLs unicode project is over */
 #if 0
@@ -751,5 +753,5 @@ chartable_url_symbol_callback (struct rspamd_task *task,
 
 	}
 #endif
-	rspamd_symcache_finalize_item (task, item);
+	rspamd_symcache_finalize_item(task, item);
 }


More information about the Commits mailing list