commit d2ceb9d: [Rework] Clickhouse: Store url flags

Vsevolod Stakhov vsevolod at
Mon Feb 22 13:49:06 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-02-22 13:13:19 +0000

[Rework] Clickhouse: Store url flags

 src/plugins/lua/clickhouse.lua | 50 ++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/plugins/lua/clickhouse.lua b/src/plugins/lua/clickhouse.lua
index 3ee4eeb73..fcf9177a4 100644
--- a/src/plugins/lua/clickhouse.lua
+++ b/src/plugins/lua/clickhouse.lua
@@ -33,7 +33,7 @@ local nrows = 0
 local used_memory = 0
 local last_collection = 0
 local final_call = false -- If the final collection has been started
-local schema_version = 8 -- Current schema version
+local schema_version = 9 -- Current schema version
 local settings = {
   limits = { -- Collection limits
@@ -133,6 +133,7 @@ CREATE TABLE IF NOT EXISTS rspamd
     `Attachments.Digest` Array(FixedString(16)) COMMENT 'First 16 characters of hash returned by mime_part:get_digest()',
     `Urls.Tld` Array(String) COMMENT 'Effective second level domain part of the URL host',
     `Urls.Url` Array(String) COMMENT 'Full URL if `full_urls` module option enabled, host part of URL otherwise',
+    `Urls.Flags` Array(UInt32) COMMENT 'Corresponding url flags',
     Emails Array(String) COMMENT 'List of emails extracted from the message',
     ASN UInt32 COMMENT 'BGP AS number for SMTP client IP (returned by asn.rspamd.com or asn6.rspamd.com)',
     Country FixedString(2) COMMENT 'Country for SMTP client IP (returned by asn.rspamd.com or asn6.rspamd.com)',
@@ -246,6 +247,14 @@ local migrations = {
     -- New version
     [[INSERT INTO rspamd_version (Version) Values (8)]],
+  [8] = {
+    -- Add new columns
+    [[ALTER TABLE rspamd
+      ADD COLUMN IF NOT EXISTS `Urls.Flags` Array(UInt32) AFTER `Urls.Url`
+    ]],
+    -- New version
+    [[INSERT INTO rspamd_version (Version) Values (9)]],
+  },
 local predefined_actions = {
@@ -314,6 +323,7 @@ local function clickhouse_urls_row(res)
   local fields = {
+    'Urls.Flags',
   for _,v in ipairs(fields) do table.insert(res, v) end
@@ -633,7 +643,11 @@ local function clickhouse_collect(task)
   local nurls = 0
-  local task_urls = task:get_urls(false) or {}
+  local task_urls = task:get_urls({
+   need_content = true,
+   need_images = true,
+   need_emails = false,
+  }) or {}
   nurls = #task_urls
@@ -748,34 +762,36 @@ local function clickhouse_collect(task)
     table.insert(row, {})
-  local flatten_urls = function(f, ...)
-    return fun.totable(fun.map(function(k,v) return f(k,v) end, ...))
-  end
   -- Urls step
   local urls_urls = {}
+  local urls_tlds = {}
+  local urls_flags = {}
-  for _,u in ipairs(task_urls) do
+  for i,u in ipairs(task_urls) do
     if settings['full_urls'] then
-      urls_urls[u:get_text()] = u
+      urls_urls[i] = u:get_text()
-      urls_urls[u:get_host()] = u
+      urls_urls[i] = u:get_host()
+    urls_tlds[i] = u:get_tld() or u:get_host()
+    urls_flags[i] = u:get_flags_num()
   -- Get tlds
-  table.insert(row, flatten_urls(function(_, u)
-    return u:get_tld() or u:get_host()
-  end, urls_urls))
+  table.insert(row, urls_tlds)
   -- Get hosts/full urls
-  table.insert(row, flatten_urls(function(k, _) return k end, urls_urls))
+  table.insert(row, urls_urls)
+  -- Numeric flags
+  table.insert(row, urls_flags)
   -- Emails step
   if task:has_urls(true) then
-    table.insert(row, flatten_urls(function(k, _) return k end,
-        fun.map(function(u)
-          return string.format('%s@%s', u:get_user(), u:get_host()),true
-        end, task:get_emails())))
+    local emails = task:get_emails() or {}
+    local emails_formatted = {}
+    for i,u in ipairs(emails) do
+      emails_formatted[i] = string.format('%s@%s', u:get_user(), u:get_host())
+    end
+    table.insert(row, emails_formatted)
     table.insert(row, {})

More information about the Commits mailing list