commit e6f1e32: [Minor] Chartable: Adjustments to the metatokens handling
Vsevolod Stakhov
vsevolod at rspamd.com
Thu Dec 8 21:42:03 UTC 2022
Author: Vsevolod Stakhov
Date: 2022-12-08 21:36:36 +0000
URL: https://github.com/rspamd/rspamd/commit/e6f1e32b07e275379e779e83f62d09c0ed15209f (HEAD -> master)
[Minor] Chartable: Adjustments to the metatokens handling
---
src/plugins/chartable.cxx | 238 +++++++++++++++++++++++-----------------------
1 file changed, 120 insertions(+), 118 deletions(-)
diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx
index c5820c606..6e3fd9b10 100644
--- a/src/plugins/chartable.cxx
+++ b/src/plugins/chartable.cxx
@@ -45,18 +45,20 @@
INIT_LOG_MODULE(chartable)
/* Initialization */
-gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
-gint chartable_module_config (struct rspamd_config *cfg, bool validate);
-gint chartable_module_reconfig (struct rspamd_config *cfg);
+gint chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);
+
+gint chartable_module_config(struct rspamd_config *cfg, bool validate);
+
+gint chartable_module_reconfig(struct rspamd_config *cfg);
module_t chartable_module = {
- "chartable",
- chartable_module_init,
- chartable_module_config,
- chartable_module_reconfig,
- nullptr,
- RSPAMD_MODULE_VER,
- (guint)-1,
+ "chartable",
+ chartable_module_init,
+ chartable_module_config,
+ chartable_module_reconfig,
+ nullptr,
+ RSPAMD_MODULE_VER,
+ (guint) -1,
};
struct chartable_ctx {
@@ -68,21 +70,22 @@ struct chartable_ctx {
};
static inline struct chartable_ctx *
-chartable_get_context (struct rspamd_config *cfg)
+chartable_get_context(struct rspamd_config *cfg)
{
- return (struct chartable_ctx *)g_ptr_array_index (cfg->c_modules,
- chartable_module.ctx_offset);
+ return (struct chartable_ctx *) g_ptr_array_index(cfg->c_modules,
+ chartable_module.ctx_offset);
}
-static void chartable_symbol_callback (struct rspamd_task *task,
- struct rspamd_symcache_dynamic_item *item,
- void *unused);
-static void chartable_url_symbol_callback (struct rspamd_task *task,
- struct rspamd_symcache_dynamic_item *item,
- void *unused);
+static void chartable_symbol_callback(struct rspamd_task *task,
+ struct rspamd_symcache_dynamic_item *item,
+ void *unused);
+
+static void chartable_url_symbol_callback(struct rspamd_task *task,
+ struct rspamd_symcache_dynamic_item *item,
+ void *unused);
gint
-chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
+chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
{
struct chartable_ctx *chartable_module_ctx;
@@ -90,40 +93,40 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
struct chartable_ctx);
chartable_module_ctx->max_word_len = 10;
- *ctx = (struct module_ctx *)chartable_module_ctx;
+ *ctx = (struct module_ctx *) chartable_module_ctx;
return 0;
}
gint
-chartable_module_config (struct rspamd_config *cfg, bool validate)
+chartable_module_config(struct rspamd_config *cfg, bool _)
{
const ucl_object_t *value;
gint res = TRUE;
- struct chartable_ctx *chartable_module_ctx = chartable_get_context (cfg);
+ struct chartable_ctx *chartable_module_ctx = chartable_get_context(cfg);
- if (!rspamd_config_is_module_enabled (cfg, "chartable")) {
+ if (!rspamd_config_is_module_enabled(cfg, "chartable")) {
return TRUE;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != nullptr) {
- chartable_module_ctx->symbol = ucl_obj_tostring (value);
+ rspamd_config_get_module_opt(cfg, "chartable", "symbol")) != nullptr) {
+ chartable_module_ctx->symbol = ucl_obj_tostring(value);
}
else {
chartable_module_ctx->symbol = DEFAULT_SYMBOL;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "url_symbol")) != nullptr) {
- chartable_module_ctx->url_symbol = ucl_obj_tostring (value);
+ rspamd_config_get_module_opt(cfg, "chartable", "url_symbol")) != nullptr) {
+ chartable_module_ctx->url_symbol = ucl_obj_tostring(value);
}
else {
chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != nullptr) {
- if (!ucl_obj_todouble_safe (value, &chartable_module_ctx->threshold)) {
+ rspamd_config_get_module_opt(cfg, "chartable", "threshold")) != nullptr) {
+ if (!ucl_obj_todouble_safe(value, &chartable_module_ctx->threshold)) {
msg_warn_config ("invalid numeric value");
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
}
@@ -132,37 +135,37 @@ chartable_module_config (struct rspamd_config *cfg, bool validate)
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != nullptr) {
- chartable_module_ctx->max_word_len = ucl_object_toint (value);
+ rspamd_config_get_module_opt(cfg, "chartable", "max_word_len")) != nullptr) {
+ chartable_module_ctx->max_word_len = ucl_object_toint(value);
}
else {
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
}
- rspamd_symcache_add_symbol (cfg->cache,
- chartable_module_ctx->symbol,
- 0,
- chartable_symbol_callback,
- nullptr,
- SYMBOL_TYPE_NORMAL,
- -1);
- rspamd_symcache_add_symbol (cfg->cache,
- chartable_module_ctx->url_symbol,
- 0,
- chartable_url_symbol_callback,
- nullptr,
- SYMBOL_TYPE_NORMAL,
- -1);
-
- msg_info_config ("init internal chartable module");
+ rspamd_symcache_add_symbol(cfg->cache,
+ chartable_module_ctx->symbol,
+ 0,
+ chartable_symbol_callback,
+ nullptr,
+ SYMBOL_TYPE_NORMAL,
+ -1);
+ rspamd_symcache_add_symbol(cfg->cache,
+ chartable_module_ctx->url_symbol,
+ 0,
+ chartable_url_symbol_callback,
+ nullptr,
+ SYMBOL_TYPE_NORMAL,
+ -1);
+
+ msg_info_config("init internal chartable module");
return res;
}
gint
-chartable_module_reconfig (struct rspamd_config *cfg)
+chartable_module_reconfig(struct rspamd_config *cfg)
{
- return chartable_module_config (cfg, false);
+ return chartable_module_config(cfg, false);
}
static const auto latin_confusable = ankerl::unordered_dense::set<int>{
@@ -321,19 +324,18 @@ static const auto latin_confusable = ankerl::unordered_dense::set<int>{
};
static gboolean
-rspamd_can_alias_latin (gint ch)
+rspamd_can_alias_latin(gint ch)
{
return latin_confusable.contains(ch);
}
static gdouble
-rspamd_chartable_process_word_utf (struct rspamd_task *task,
- rspamd_stat_token_t *w,
- gboolean is_url,
- guint *ncap,
- struct chartable_ctx *chartable_module_ctx,
- const gchar *lang,
- gboolean ignore_diacritics)
+rspamd_chartable_process_word_utf(struct rspamd_task *task,
+ rspamd_stat_token_t *w,
+ gboolean is_url,
+ guint *ncap,
+ struct chartable_ctx *chartable_module_ctx,
+ gboolean ignore_diacritics)
{
const UChar32 *p, *end;
gdouble badness = 0.0;
@@ -357,12 +359,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
while (p < end) {
uc = *p++;
- if (((gint32)uc) < 0) {
+ if (((gint32) uc) < 0) {
break;
}
- sc = ublock_getCode (uc);
- cat = u_charType (uc);
+ sc = ublock_getCode(uc);
+ cat = u_charType(uc);
if (!ignore_diacritics) {
if (cat == U_NON_SPACING_MARK ||
@@ -375,10 +377,10 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
}
}
- if (u_isalpha (uc)) {
+ if (u_isalpha(uc)) {
if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS ||
- sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
+ sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
/*
* Assume all latin, IPA, diacritic and space modifiers
* characters as basic latin
@@ -386,16 +388,16 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
sc = UBLOCK_BASIC_LATIN;
}
- if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) {
+ if (sc != UBLOCK_BASIC_LATIN && u_isupper(uc)) {
if (ncap) {
- (*ncap) ++;
+ (*ncap)++;
}
}
if (state == got_digit) {
/* Penalize digit -> alpha translations */
if (!is_url && sc != UBLOCK_BASIC_LATIN &&
- prev_state != start_process) {
+ prev_state != start_process) {
badness += 0.25;
}
}
@@ -404,15 +406,15 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
if (same_script_count > 0) {
if (sc != UBLOCK_BASIC_LATIN && last_is_latin) {
- if (rspamd_can_alias_latin (uc)) {
- badness += 1.0 / (gdouble)same_script_count;
+ if (rspamd_can_alias_latin(uc)) {
+ badness += 1.0 / (gdouble) same_script_count;
}
last_is_latin = 0;
same_script_count = 1;
}
else {
- same_script_count ++;
+ same_script_count++;
}
}
else {
@@ -425,7 +427,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
state = got_alpha;
}
- else if (u_isdigit (uc)) {
+ else if (u_isdigit(uc)) {
if (state != got_digit) {
prev_state = state;
}
@@ -443,7 +445,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
same_script_count = 0;
}
- nsym ++;
+ nsym++;
}
if (nspecial > 0) {
@@ -467,17 +469,17 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
}
msg_debug_chartable ("word %*s, badness: %.2f",
- (gint)w->normalized.len, w->normalized.begin,
- badness);
+ (gint) w->normalized.len, w->normalized.begin,
+ badness);
return badness;
}
static gdouble
-rspamd_chartable_process_word_ascii (struct rspamd_task *task,
- rspamd_stat_token_t *w,
- gboolean is_url,
- struct chartable_ctx *chartable_module_ctx)
+rspamd_chartable_process_word_ascii(struct rspamd_task *task,
+ rspamd_stat_token_t *w,
+ gboolean is_url,
+ struct chartable_ctx *chartable_module_ctx)
{
gdouble badness = 0.0;
enum {
@@ -516,12 +518,12 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
if (same_script_count > 0) {
if (sc != last_sc) {
- badness += 1.0 / (gdouble)same_script_count;
+ badness += 1.0 / (gdouble) same_script_count;
last_sc = sc;
same_script_count = 1;
}
else {
- same_script_count ++;
+ same_script_count++;
}
}
else {
@@ -544,7 +546,7 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
same_script_count = 0;
}
- p ++;
+ p++;
}
if (badness > 4.0) {
@@ -552,24 +554,24 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
}
msg_debug_chartable ("word %*s, badness: %.2f",
- (gint)w->normalized.len, w->normalized.begin,
- badness);
+ (gint) w->normalized.len, w->normalized.begin,
+ badness);
return badness;
}
static gboolean
-rspamd_chartable_process_part (struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- struct chartable_ctx *chartable_module_ctx,
- gboolean ignore_diacritics)
+rspamd_chartable_process_part(struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ struct chartable_ctx *chartable_module_ctx,
+ gboolean ignore_diacritics)
{
rspamd_stat_token_t *w;
guint i, ncap = 0;
gdouble cur_score = 0.0;
if (part == nullptr || part->utf_words == nullptr ||
- part->utf_words->len == 0 || part->nwords == 0) {
+ part->utf_words->len == 0 || part->nwords == 0) {
return FALSE;
}
@@ -579,12 +581,12 @@ rspamd_chartable_process_part (struct rspamd_task *task,
if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
- cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
- &ncap, chartable_module_ctx, part->language, ignore_diacritics);
+ cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
+ &ncap, chartable_module_ctx, ignore_diacritics);
}
else {
- cur_score += rspamd_chartable_process_word_ascii (task, w,
- FALSE, chartable_module_ctx);
+ cur_score += rspamd_chartable_process_word_ascii(task, w,
+ FALSE, chartable_module_ctx);
}
}
}
@@ -596,7 +598,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
*/
part->capital_letters += ncap;
- cur_score /= (gdouble)part->nwords;
+ cur_score /= (gdouble) part->nwords;
if (cur_score > 1.0) {
cur_score = 1.0;
@@ -604,7 +606,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
if (cur_score > chartable_module_ctx->threshold) {
rspamd_task_insert_result (task, chartable_module_ctx->symbol,
- cur_score, nullptr);
+ cur_score, nullptr);
return TRUE;
}
@@ -612,37 +614,37 @@ rspamd_chartable_process_part (struct rspamd_task *task,
}
static void
-chartable_symbol_callback (struct rspamd_task *task,
- struct rspamd_symcache_dynamic_item *item,
- void *unused)
+chartable_symbol_callback(struct rspamd_task *task,
+ struct rspamd_symcache_dynamic_item *item,
+ void *_)
{
guint i;
struct rspamd_mime_text_part *part;
- struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg);
- const gchar *language = nullptr;
- gboolean ignore_diacritics = FALSE, seen_violated_part = FALSE;
+ struct chartable_ctx *chartable_module_ctx = chartable_get_context(task->cfg);
+ gboolean ignore_diacritics = TRUE, seen_violated_part = FALSE;
/* Check if we have parts with diacritic symbols language */
- PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
+ PTR_ARRAY_FOREACH (MESSAGE_FIELD(task, text_parts), i, part) {
if (part->languages && part->languages->len > 0) {
- struct rspamd_lang_detector_res *lang =
- (struct rspamd_lang_detector_res *)g_ptr_array_index (part->languages, 0);
+ auto *lang = (struct rspamd_lang_detector_res *) g_ptr_array_index(part->languages, 0);
gint flags;
- flags = rspamd_language_detector_elt_flags (lang->elt);
+ flags = rspamd_language_detector_elt_flags(lang->elt);
- if (flags & RS_LANGUAGE_DIACRITICS) {
+ if ((flags & RS_LANGUAGE_DIACRITICS)) {
ignore_diacritics = TRUE;
}
+ else if (lang->prob > 0.75) {
+ ignore_diacritics = FALSE;
+ }
}
- if (rspamd_chartable_process_part (task, part, chartable_module_ctx,
- ignore_diacritics)) {
+ if (rspamd_chartable_process_part(task, part, chartable_module_ctx, ignore_diacritics)) {
seen_violated_part = TRUE;
}
}
- if (MESSAGE_FIELD (task, text_parts)->len == 0) {
+ if (MESSAGE_FIELD(task, text_parts)->len == 0) {
/* No text parts, assume that we should ignore diacritics checks for metatokens */
ignore_diacritics = TRUE;
}
@@ -653,12 +655,12 @@ chartable_symbol_callback (struct rspamd_task *task,
gsize arlen = task->meta_words->len;
for (i = 0; i < arlen; i++) {
- w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
- cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
- nullptr, chartable_module_ctx, language, ignore_diacritics);
+ w = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+ cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
+ nullptr, chartable_module_ctx, ignore_diacritics);
}
- cur_score /= (gdouble)arlen;
+ cur_score /= (gdouble) (arlen + 1);
if (cur_score > 1.0) {
cur_score = 1.0;
@@ -672,19 +674,19 @@ chartable_symbol_callback (struct rspamd_task *task,
}
}
- rspamd_task_insert_result (task, chartable_module_ctx->symbol,
- cur_score, "subject");
+ rspamd_task_insert_result(task, chartable_module_ctx->symbol,
+ cur_score, "subject");
}
}
- rspamd_symcache_finalize_item (task, item);
+ rspamd_symcache_finalize_item(task, item);
}
static void
-chartable_url_symbol_callback (struct rspamd_task *task,
- struct rspamd_symcache_dynamic_item *item,
- void *unused)
+chartable_url_symbol_callback(struct rspamd_task *task,
+ struct rspamd_symcache_dynamic_item *item,
+ void *unused)
{
/* XXX: TODO: unbreak module once URLs unicode project is over */
#if 0
@@ -751,5 +753,5 @@ chartable_url_symbol_callback (struct rspamd_task *task,
}
#endif
- rspamd_symcache_finalize_item (task, item);
+ rspamd_symcache_finalize_item(task, item);
}
More information about the Commits mailing list