commit 0d8245f: [Rework] Clickhouse: Improve performance

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Nov 30 14:35:09 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-11-30 14:27:39 +0000
URL: https://github.com/rspamd/rspamd/commit/0d8245f8339014dec90c004ce5bd09c764f182cd (HEAD -> master)

[Rework] Clickhouse: Improve performance

---
 lualib/rspamadm/clickhouse.lua | 125 ++++++++++++++++++++++-------------------
 1 file changed, 66 insertions(+), 59 deletions(-)

diff --git a/lualib/rspamadm/clickhouse.lua b/lualib/rspamadm/clickhouse.lua
index d1bbbef1a..4388b8ce0 100644
--- a/lualib/rspamadm/clickhouse.lua
+++ b/lualib/rspamadm/clickhouse.lua
@@ -86,43 +86,10 @@ local function load_config(config_file)
   end
 end
 
-local function get_excluded_symbols(res)
+local function get_excluded_symbols(known_symbols, correlations, seen_total)
   -- Walk results once to collect all symbols & count ocurrences
-  local known_symbols, remove = {}, {}
-  local symbols_count, seen_total = 1, 0
-  for _, r in ipairs(res) do
-    local is_spam = true
-    if r['Action'] == 'no action' or r['Action'] == 'greylist' then
-      is_spam = false
-    end
-    seen_total = seen_total + 1
-    for _, sym in ipairs(r['Symbols.Names']) do
-      local t = known_symbols[sym]
-      if not t then
-        local spam_count, ham_count = 0, 0
-        if is_spam then
-          spam_count = spam_count + 1
-        else
-          ham_count = ham_count + 1
-        end
-        known_symbols[sym] = {
-          id = symbols_count,
-          seen = 1,
-          seen_ham = ham_count,
-          seen_spam = spam_count,
-        }
-        symbols_count = symbols_count + 1
-      else
-        known_symbols[sym].seen = known_symbols[sym].seen + 1
-        if is_spam then
-          known_symbols[sym].seen_spam = known_symbols[sym].seen_spam + 1
-        else
-          known_symbols[sym].seen_ham = known_symbols[sym].seen_ham + 1
-        end
-      end
-    end
-  end
 
+  local remove = {}
   local known_symbols_list = {}
   local composites = rspamd_config:get_all_opt('composites')
   for k, v in pairs(known_symbols) do
@@ -147,27 +114,6 @@ local function get_excluded_symbols(res)
     }
   end
 
-  -- Walk results again & count correlations
-  local correlations = {}
-  for _, r in ipairs(res) do
-    for _, sym in ipairs(r['Symbols.Names']) do
-      for _, inner_sym_name in ipairs(r['Symbols.Names']) do
-        if inner_sym_name ~= sym then
-          local known_sym = known_symbols[sym]
-          local inner_sym = known_symbols[inner_sym_name]
-          if known_sym and inner_sym then
-            if not correlations[known_sym.id] then
-              correlations[known_sym.id] = {}
-            end
-            local n = correlations[known_sym.id][inner_sym.id] or 0
-            n = n + 1
-            correlations[known_sym.id][inner_sym.id] = n
-          end
-        end
-      end
-    end
-  end
-
   -- Walk correlation matrix and check total counts
   for sym_id, row in pairs(correlations) do
     for inner_sym_id, count in pairs(row) do
@@ -180,7 +126,7 @@ local function get_excluded_symbols(res)
     end
   end
 
-  return remove, known_symbols
+  return remove
 end
 
 local function handle_neural_profile(args)
@@ -190,13 +136,74 @@ local function handle_neural_profile(args)
   local query = string.format(
       "SELECT Action, Symbols.Names FROM rspamd %s", args.where or '')
   local upstream = args.upstream:get_upstream_round_robin()
-  local err, res = lua_clickhouse.select_sync(upstream, args, http_params, query)
+  local known_symbols = {}
+  local symbols_count, seen_total = 1, 0
+  local correlations = {}
+
+  local function process_row(r)
+    local is_spam = true
+    if r['Action'] == 'no action' or r['Action'] == 'greylist' then
+      is_spam = false
+    end
+    seen_total = seen_total + 1
+
+    local nsym = #r['Symbols.Names']
+
+    for i = 1,nsym do
+      local sym = r['Symbols.Names'][i]
+      local t = known_symbols[sym]
+      if not t then
+        local spam_count, ham_count = 0, 0
+        if is_spam then
+          spam_count = spam_count + 1
+        else
+          ham_count = ham_count + 1
+        end
+        known_symbols[sym] = {
+          id = symbols_count,
+          seen = 1,
+          seen_ham = ham_count,
+          seen_spam = spam_count,
+        }
+        symbols_count = symbols_count + 1
+      else
+        known_symbols[sym].seen = known_symbols[sym].seen + 1
+        if is_spam then
+          known_symbols[sym].seen_spam = known_symbols[sym].seen_spam + 1
+        else
+          known_symbols[sym].seen_ham = known_symbols[sym].seen_ham + 1
+        end
+      end
+    end
+
+    -- Fill correlations
+    for i = 1,nsym do
+      for j = 1,nsym do
+        if i ~= j then
+          local sym = r['Symbols.Names'][i]
+          local inner_sym_name = r['Symbols.Names'][j]
+          local known_sym = known_symbols[sym]
+          local inner_sym = known_symbols[inner_sym_name]
+          if known_sym and inner_sym then
+            if not correlations[known_sym.id] then
+              correlations[known_sym.id] = {}
+            end
+            local n = correlations[known_sym.id][inner_sym.id] or 0
+            n = n + 1
+            correlations[known_sym.id][inner_sym.id] = n
+          end
+        end
+      end
+    end
+  end
+
+  local err, _ = lua_clickhouse.select_sync(upstream, args, http_params, query, process_row)
   if err ~= nil then
     io.stderr:write(string.format('Error querying Clickhouse: %s\n', err))
     os.exit(1)
   end
 
-  local remove, known_symbols = get_excluded_symbols(res)
+  local remove = get_excluded_symbols(known_symbols, correlations, seen_total)
   if not args.json then
     for k in pairs(known_symbols) do
       if not remove[k] then


More information about the Commits mailing list