commit 50e1bfb: [Fix] Normalize dynamic scores in ANN correctly
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Mar 16 10:35:07 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-03-16 10:12:46 +0000
URL: https://github.com/rspamd/rspamd/commit/50e1bfba88a19d9d0fe225fc5900beede64b03a2 (HEAD -> master)
[Fix] Normalize dynamic scores in ANN correctly
---
src/lua/lua_task.c | 37 ++++++++++++++++++++++++++++++++++++-
1 file changed, 36 insertions(+), 1 deletion(-)
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 95e4c9fbd..1fbf988a9 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -4771,6 +4771,16 @@ lua_task_process_ann_tokens (lua_State *L)
lua_rawgeti (L, 2, i);
sym = lua_tostring (L, -1);
+ /*
+ * TODO: this loop performs one hash lookup per symbol in the profile.
+ * In the common case the profile is a table of all symbols, so we
+ * end up doing N_symbols hash lookups, which is not optimal.
+ * A better solution is to convert the [sym1, sym2, ... symn] profile
+ * into a set {sym1 = true, sym2 = true, ...} and then check each
+ * resulting symbol against that set.
+ *
+ * That would require only N_results lookups, which is usually MUCH smaller.
+ */
sres = rspamd_task_find_symbol_result (task, sym);
if (sres && !(sres->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) {
@@ -4778,8 +4788,33 @@ lua_task_process_ann_tokens (lua_State *L)
if (!isnan (sres->score) && !isinf (sres->score) &&
(!sres->sym ||
!(rspamd_symcache_item_flags (sres->sym->cache_item) & SYMBOL_TYPE_NOSTAT))) {
- gdouble norm_score = fabs (tanh (sres->score));
+ gdouble norm_score;
+
+ if (!isnan (sres->sym->score)) {
+ if (sres->sym->score == 0) {
+
+ if (sres->score == 0) {
+ /* Binary symbol */
+ norm_score = 1.0;
+ }
+ else {
+ norm_score = fabs (tanh (sres->score));
+ }
+ }
+ else {
+ /* Get dynamic weight */
+ norm_score = fabs (sres->score / sres->sym->score);
+
+ if (norm_score > 1.0) {
+ /* Multiple hits: count them as a single hit */
+ norm_score = 1.0;
+ }
+ }
+ }
+ else {
+ norm_score = fabs (tanh (sres->score));
+ }
lua_pushnumber (L, MAX (min_score , norm_score));
lua_rawseti (L, 3, offset + 1);
More information about the Commits
mailing list