commit 50e1bfb: [Fix] Normalize dynamic scores in ANN correctly

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Mar 16 10:35:07 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-03-16 10:12:46 +0000
URL: https://github.com/rspamd/rspamd/commit/50e1bfba88a19d9d0fe225fc5900beede64b03a2 (HEAD -> master)

[Fix] Normalize dynamic scores in ANN correctly

---
 src/lua/lua_task.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 95e4c9fbd..1fbf988a9 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -4771,6 +4771,16 @@ lua_task_process_ann_tokens (lua_State *L)
 			lua_rawgeti (L, 2, i);
 			sym = lua_tostring (L, -1);
 
+			/*
+			 * TODO: this loop performs one hash lookup per symbol in a profile.
+			 * In the common case the profile is a table of all symbols, so we
+			 * need to do N_symbols hash lookups, which is not optimal.
+			 * The optimal solution is to convert the [sym1, sym2, ... symn] profile
+			 * to a set {sym1 = true, sym2 = true, ...} and then, for each
+			 * resulting symbol, check this table.
+			 *
+			 * That would lead to N_results lookups, which is usually MUCH smaller
+			 */
 			sres = rspamd_task_find_symbol_result (task, sym);
 
 			if (sres && !(sres->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) {
@@ -4778,8 +4788,33 @@ lua_task_process_ann_tokens (lua_State *L)
 				if (!isnan (sres->score) && !isinf (sres->score) &&
 						(!sres->sym ||
 							!(rspamd_symcache_item_flags (sres->sym->cache_item) & SYMBOL_TYPE_NOSTAT))) {
-					gdouble norm_score = fabs (tanh (sres->score));
 
+					gdouble norm_score;
+
+					if (!isnan (sres->sym->score)) {
+						if (sres->sym->score == 0) {
+
+							if (sres->score == 0) {
+								/* Binary symbol */
+								norm_score = 1.0;
+							}
+							else {
+								norm_score = fabs (tanh (sres->score));
+							}
+						}
+						else {
+							/* Get dynamic weight */
+							norm_score = fabs (sres->score / sres->sym->score);
+
+							if (norm_score > 1.0) {
+								/* Multiple hits: treat them as a single hit */
+								norm_score = 1.0;
+							}
+						}
+					}
+					else {
+						norm_score = fabs (tanh (sres->score));
+					}
 
 					lua_pushnumber (L, MAX (min_score , norm_score));
 					lua_rawseti (L, 3, offset + 1);


More information about the Commits mailing list