commit 4839327: [Minor] Improve diacritics handling in R_MIXED_CHARSET
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Sep 18 15:28:08 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-09-18 16:25:44 +0100
URL: https://github.com/rspamd/rspamd/commit/4839327399dc15a6bba1056027feb8d2def0a26a
[Minor] Improve diacritics handling in R_MIXED_CHARSET
---
src/plugins/chartable.c | 41 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 5 deletions(-)
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 815afd95a..55532bcf6 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -356,14 +356,16 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
rspamd_stat_token_t *w,
gboolean is_url,
guint *ncap,
- struct chartable_ctx *chartable_module_ctx)
+ struct chartable_ctx *chartable_module_ctx,
+ const gchar *lang)
{
const UChar32 *p, *end;
gdouble badness = 0.0;
UChar32 uc;
UBlockCode sc;
+ guint cat;
gint last_is_latin = -1;
- guint same_script_count = 0, nsym = 0;
+ guint same_script_count = 0, nsym = 0, nspecial = 0;
enum {
start_process = 0,
got_alpha,
@@ -383,8 +385,20 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
break;
}
+ sc = ublock_getCode (uc);
+ cat = u_charType (uc);
+
+ if (cat == U_NON_SPACING_MARK ||
+ (sc == UBLOCK_LATIN_1_SUPPLEMENT) ||
+ (sc == UBLOCK_LATIN_EXTENDED_A) ||
+ (sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
+ (sc == UBLOCK_LATIN_EXTENDED_B) ||
+ (sc == UBLOCK_COMBINING_DIACRITICAL_MARKS)) {
+ nspecial ++;
+ }
+
if (u_isalpha (uc)) {
- sc = ublock_getCode (uc);
+
if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS ||
sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) {
/*
@@ -454,6 +468,21 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
nsym ++;
}
+ if (nspecial > 0) {
+ if (lang) {
+ if (strcmp (lang, "en") == 0) {
+ /* Diacritic is always bad for English */
+ badness += nspecial;
+ }
+ else if (nspecial > 1) {
+ badness += (nspecial - 1.0) / 2.0;
+ }
+ }
+ else if (nspecial > 1) {
+ badness += (nspecial - 1.0) / 2.0;
+ }
+ }
+
/* Try to avoid FP for long words */
if (nsym > chartable_module_ctx->max_word_len) {
badness = 0;
@@ -578,7 +607,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
- &ncap, chartable_module_ctx);
+ &ncap, chartable_module_ctx, part->language);
}
else {
cur_score += rspamd_chartable_process_word_ascii (task, w,
@@ -615,9 +644,11 @@ chartable_symbol_callback (struct rspamd_task *task,
guint i;
struct rspamd_mime_text_part *part;
struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg);
+ const gchar *language = NULL;
PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, part) {
rspamd_chartable_process_part (task, part, chartable_module_ctx);
+ language = part->language;
}
if (task->meta_words != NULL) {
@@ -628,7 +659,7 @@ chartable_symbol_callback (struct rspamd_task *task,
for (i = 0; i < arlen; i++) {
w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
- NULL, chartable_module_ctx);
+ NULL, chartable_module_ctx, language);
}
cur_score /= (gdouble)arlen;
More information about the Commits
mailing list