commit 8e4bbaa: [Feature] Calculate tokens occurrences distribution
moisseev
moiseev at mezonplus.ru
Sun Jul 14 12:35:04 UTC 2019
Author: moisseev
Date: 2019-07-13 21:55:54 +0300
URL: https://github.com/rspamd/rspamd/commit/8e4bbaa70d9ac38a6f5b87758e8a23cc4b94cbc9 (refs/pull/2977/head)
[Feature] Calculate tokens occurrences distribution
---
src/plugins/lua/bayes_expiry.lua | 55 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 53 insertions(+), 2 deletions(-)
diff --git a/src/plugins/lua/bayes_expiry.lua b/src/plugins/lua/bayes_expiry.lua
index e2bf6aa8f..e5eb471d4 100644
--- a/src/plugins/lua/bayes_expiry.lua
+++ b/src/plugins/lua/bayes_expiry.lua
@@ -192,7 +192,7 @@ end
-- [1] = symbol pattern
-- [2] = expire value
-- [3] = cursor
--- returns {cursor for the next step, step number, step statistic counters, cycle statistic counters}
+-- returns {cursor for the next step, step number, step statistic counters, cycle statistic counters, tokens occurrences distribution}
local expiry_script = [[
local unpack_function = table.unpack or unpack
@@ -246,6 +246,13 @@ local expiry_script = [[
local keys = ret[2]
local tokens = {}
+ -- Tokens occurrences distribution counters
+ local occurr = {
+ ham = {},
+ spam = {},
+ total = {}
+ }
+
-- Expiry step statistics counters
local nelts, extended, discriminated, sum, sum_squares, common, significant,
infrequent, infrequent_ttls_set, insignificant, insignificant_ttls_set =
@@ -265,6 +272,11 @@ local expiry_script = [[
sum = sum + total
sum_squares = sum_squares + total * total
nelts = nelts + 1
+
+ for k,v in pairs({['ham']=ham, ['spam']=spam, ['total']=total}) do
+ if tonumber(v) > 19 then v = 20 end
+ occurr[k][v] = occurr[k][v] and occurr[k][v] + 1 or 1
+ end
end
local mean, stddev = 0, 0
@@ -342,12 +354,45 @@ local expiry_script = [[
redis.call('SET', step_key, tostring(step))
redis.call('DEL', lock_key)
+ local occ_distr = {}
+ for _,cl in pairs({'ham', 'spam', 'total'}) do
+ local occurr_key = pattern_sha1 .. '_occurrence_' .. cl
+
+ if cursor ~= 0 then
+ local n
+ for i,v in ipairs(redis.call('HGETALL', occurr_key)) do
+ if i % 2 == 1 then
+ n = tonumber(v)
+ else
+ occurr[cl][n] = occurr[cl][n] and occurr[cl][n] + v or v
+ end
+ end
+
+ local str = ''
+ if occurr[cl][0] ~= nil then
+ str = '0:' .. occurr[cl][0] .. ','
+ end
+ for k,v in ipairs(occurr[cl]) do
+ if k == 20 then k = '>19' end
+ str = str .. k .. ':' .. v .. ','
+ end
+ table.insert(occ_distr, str)
+ else
+ redis.call('DEL', occurr_key)
+ end
+
+ if next(occurr[cl]) ~= nil then
+ redis.call('HMSET', occurr_key, unpack_function(hash2list(occurr[cl])))
+ end
+ end
+
return {
next_cursor, step,
{nelts, extended, discriminated, mean, stddev, common, significant, infrequent,
infrequent_ttls_set, insignificant, insignificant_ttls_set},
{c.nelts, c.extended, c.discriminated, c.sum, c.sum_squares, c.common,
- c.significant, c.infrequent, c.infrequent_ttls_set, c.insignificant, c.insignificant_ttls_set}
+ c.significant, c.infrequent, c.infrequent_ttls_set, c.insignificant, c.insignificant_ttls_set},
+ occ_distr
}
]]
@@ -360,6 +405,7 @@ local function expire_step(cls, ev_base, worker)
local step = args[2]
local data = args[3]
local c_data = args[4]
+ local occ_distr = args[5]
local function log_stat(cycle)
local infrequent_action = (cls.expiry < 0) and 'made persistent' or 'ttls set'
@@ -393,6 +439,11 @@ local function expire_step(cls, ev_base, worker)
'%s insignificant (%s %s), %s common (%s discriminated), ' ..
'%s infrequent (%s %s), %s mean, %s std',
lutil.unpack(d))
+ if cycle then
+ for i,cl in ipairs({'in ham', 'in spam', 'total'}) do
+ logger.infox(rspamd_config, 'tokens occurrences, %s: {%s}', cl, occ_distr[i])
+ end
+ end
end
log_stat(false)
if cur == 0 then
More information about the Commits
mailing list