commit 63f47bc: [Minor] Add tool to convert trivial SA rules to multimap

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Nov 1 15:49:06 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-11-01 15:46:35 +0000
URL: https://github.com/rspamd/rspamd/commit/63f47bc68b8025ead4549fc04fa9c554b7db4eab (HEAD -> master)

[Minor] Add tool to convert trivial SA rules to multimap

---
 utils/sa_trivial_convert.lua | 460 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 460 insertions(+)

diff --git a/utils/sa_trivial_convert.lua b/utils/sa_trivial_convert.lua
new file mode 100644
index 000000000..8cf0b9137
--- /dev/null
+++ b/utils/sa_trivial_convert.lua
@@ -0,0 +1,460 @@
+local fun = require "fun"
+local rspamd_logger = require "rspamd_logger"
+local util = require "rspamd_util"
+local lua_util = require "lua_util"
+local rspamd_regexp = require "rspamd_regexp"
+local ucl = require "ucl"
+
+local complicated = {}
+local rules = {}
+local scores = {}
+
+local function words_to_re(words, start) -- joins the words left after dropping the first `start` ones
+  return table.concat(fun.totable(fun.drop_n(start, words)), " "); -- single space as separator
+end
+
+local function split(str, delim) -- returns an array with every match of the Lua pattern `delim` in `str`
+  local result = {}
+
+  if not delim then
+    delim = '[^%s]+' -- default pattern: runs of non-whitespace characters
+  end
+
+  for token in string.gmatch(str, delim) do
+    table.insert(result, token)
+  end
+
+  return result
+end
+
+-- Parses the header part of an SA `header` rule (e.g. 'From:addr' or 'A|B')
+-- and stores the result in cur_rule['header'] (list of per-header parameter
+-- tables) and cur_rule['ordinary'] (true only for plain header names with no
+-- modifiers and none of the special names ALL, MESSAGEID or ToCc).
+-- @param hline header specification: names separated by '|', each optionally
+--   followed by ':modifier' parts (addr, name, raw, case)
+-- @param cur_rule rule table being built; cur_rule['symbol'] is used in warnings
+local function handle_header_def(hline, cur_rule)
+  --Now check for modifiers inside header's name
+  local hdrs = split(hline, '[^|]+')
+  local hdr_params = {}
+  -- Check if an re is an ordinary re
+  local ordinary = true
+
+  for _,h in ipairs(hdrs) do
+    if h == 'ALL' or h == 'ALL:raw' then
+      ordinary = false
+    else
+      local args = split(h, '[^:]+')
+      -- A fresh table per header: a single shared table would make every entry
+      -- of hdr_params alias the last header and inherit earlier modifiers
+      local cur_param = {
+        strong = false,
+        raw = false,
+        header = args[1],
+      }
+
+      if args[2] then
+        -- We have some ops that are required for the header, so it's not ordinary
+        ordinary = false
+      end
+
+      -- Apply the ':modifier' parts that follow the header name
+      fun.each(function(func)
+          if func == 'addr' then
+            cur_param['function'] = function(str)
+              local addr_parsed = util.parse_addr(str)
+              local ret = {}
+              if addr_parsed then
+                for _,elt in ipairs(addr_parsed) do
+                  if elt['addr'] then
+                    table.insert(ret, elt['addr'])
+                  end
+                end
+              end
+
+              return ret
+            end
+          elseif func == 'name' then
+            cur_param['function'] = function(str)
+              local addr_parsed = util.parse_addr(str)
+              local ret = {}
+              if addr_parsed then
+                for _,elt in ipairs(addr_parsed) do
+                  if elt['name'] then
+                    table.insert(ret, elt['name'])
+                  end
+                end
+              end
+
+              return ret
+            end
+          elseif func == 'raw' then
+            cur_param['raw'] = true
+          elseif func == 'case' then
+            cur_param['strong'] = true
+          else
+            rspamd_logger.warnx(rspamd_config, 'Function %1 is not supported in %2',
+              func, cur_rule['symbol'])
+          end
+        end, fun.tail(args))
+
+      -- MESSAGEID and ToCc are not treated as plain header names here, so a
+      -- rule using them is never ordinary and gets no parameter entry
+      if cur_param['header'] == 'MESSAGEID' then
+        -- Special case for spamassassin
+        ordinary = false
+      elseif cur_param['header'] == 'ToCc' then
+        ordinary = false
+      else
+        table.insert(hdr_params, cur_param)
+      end
+    end
+  end
+
+  -- A rule that produced no header entry at all cannot be converted
+  cur_rule['ordinary'] = ordinary and #hdr_params > 0
+  cur_rule['header'] = hdr_params
+end
+
+-- Reads SA rules from the open file `f`: trivial regexp rules go to `rules`,
+-- scores to `scores`, every other line is kept verbatim in `complicated`.
+local function process_sa_conf(f)
+  local N = 'sa_trivial_convert' -- debugm module name (N was an undefined global)
+  local cur_rule = {}
+  local valid_rule = false
+
+  -- Files the pending rule under rules[type] (header rules are additionally
+  -- grouped by header name) and always resets the parser state afterwards,
+  -- so a rejected rule cannot leak into the next definition.
+  local function insert_cur_rule()
+    local rule = cur_rule
+    cur_rule = {}
+    valid_rule = false
+
+    local hdr = rule.type == 'header' and rule.header and rule.header[1]
+    if not rule.symbol or not rule.type or
+        (rule.type == 'header' and not (hdr and hdr.header)) then
+      rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', rule)
+      return
+    end
+
+    local target = rules[rule.type] or {}
+    rules[rule.type] = target
+
+    if rule.type == 'header' then
+      target[hdr.header] = target[hdr.header] or {}
+      target = target[hdr.header]
+    end
+
+    target[rule.symbol] = rule
+  end
+
+  local function parse_score(words)
+    if #words == 3 then
+      -- score rule <x>
+      lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[3])
+      return tonumber(words[3])
+    elseif #words == 6 then
+      -- score rule <x1> <x2> <x3> <x4>
+      -- we assume here that bayes and network are enabled and select <x4>
+      lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[6])
+      return tonumber(words[6])
+    else
+      rspamd_logger.errx(rspamd_config, 'invalid score for %1', words[2])
+    end
+
+    return 0
+  end
+
+  local skip_to_endif = false
+  local if_nested = 0
+  for l in f:lines() do
+    (function ()
+    l = lua_util.rspamd_str_trim(l)
+    -- Replace bla=~/re/ with bla =~ /re/ (#2372)
+    l = l:gsub('([^%s])%s*([=!]~)%s*([^%s])', '%1 %2 %3')
+
+    if string.len(l) == 0 or string.sub(l, 1, 1) == '#' then
+      return
+    end
+
+    -- Unbalanced if/endif
+    if if_nested < 0 then if_nested = 0 end
+    if skip_to_endif then
+      if string.match(l, '^endif') then
+        if_nested = if_nested - 1
+
+        if if_nested == 0 then
+          skip_to_endif = false
+        end
+      elseif string.match(l, '^if') then
+        if_nested = if_nested + 1
+      elseif string.match(l, '^else') then
+        -- Else counterpart for if
+        skip_to_endif = false
+      end
+      table.insert(complicated, l)
+      return
+    else
+      if string.match(l, '^if') then
+        -- ifplugin, if !plugin(...) and any other if: the condition cannot be
+        -- evaluated here, so the whole conditional block is kept verbatim
+        skip_to_endif = true
+        if_nested = if_nested + 1
+        table.insert(complicated, l)
+        return -- already stored; falling through would store the line twice
+      elseif string.match(l, '^else') then
+        -- Else counterpart for if
+        skip_to_endif = true
+        table.insert(complicated, l)
+        return
+      elseif string.match(l, '^endif') then
+        if_nested = if_nested - 1
+        table.insert(complicated, l)
+        return
+      end
+    end
+
+    -- Skip comments
+    local words = fun.totable(fun.take_while(
+      function(w) return string.sub(w, 1, 1) ~= '#' end,
+      fun.filter(function(w)
+          return w ~= "" end,
+      fun.iter(split(l)))))
+
+    if words[1] == "header" then
+      -- header SYMBOL Header =~ /regexp/
+      if valid_rule then
+        insert_cur_rule()
+      end
+      if words[4] and (words[4] == '=~' or words[4] == '!~') then
+        cur_rule['type'] = 'header'
+        cur_rule['symbol'] = words[2]
+
+        if words[4] == '!~' then
+          table.insert(complicated, l)
+          return
+        end
+
+        cur_rule['re_expr'] = words_to_re(words, 4)
+        local unset_comp = string.find(cur_rule['re_expr'], '%s+%[if%-unset:')
+        if unset_comp then
+          table.insert(complicated, l)
+          return
+        end
+
+        cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+
+        if not cur_rule['re'] then
+          rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+            cur_rule['re_expr'], cur_rule['symbol'])
+          table.insert(complicated, l)
+          return
+        else
+          handle_header_def(words[3], cur_rule)
+          if not cur_rule['ordinary'] then
+            table.insert(complicated, l)
+            return
+          end
+        end
+
+        valid_rule = true
+      else
+        table.insert(complicated, l)
+        return
+      end
+    elseif words[1] == "body" then
+      -- body SYMBOL /regexp/
+      if valid_rule then
+        insert_cur_rule()
+      end
+
+      cur_rule['symbol'] = words[2]
+      if words[3] and (string.sub(words[3], 1, 1) == '/'
+          or string.sub(words[3], 1, 1) == 'm') then
+        cur_rule['type'] = 'sabody'
+        cur_rule['re_expr'] = words_to_re(words, 2)
+        cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+        if cur_rule['re'] then
+          valid_rule = true
+        else
+          table.insert(complicated, l) -- regexp did not compile: keep the rule for SA
+        end
+      else
+        -- might be function
+        table.insert(complicated, l)
+        return
+      end
+    elseif words[1] == "rawbody" then
+      -- rawbody SYMBOL /regexp/
+      if valid_rule then
+        insert_cur_rule()
+      end
+
+      cur_rule['symbol'] = words[2]
+      if words[3] and (string.sub(words[3], 1, 1) == '/'
+          or string.sub(words[3], 1, 1) == 'm') then
+        cur_rule['type'] = 'sarawbody'
+        cur_rule['re_expr'] = words_to_re(words, 2)
+        cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+        if cur_rule['re'] then
+          valid_rule = true
+        else
+          table.insert(complicated, l) -- regexp did not compile: keep the rule for SA
+        end
+      else
+        table.insert(complicated, l)
+        return
+      end
+    elseif words[1] == "full" then
+      -- full SYMBOL /regexp/
+      if valid_rule then
+        insert_cur_rule()
+      end
+
+      cur_rule['symbol'] = words[2]
+
+      if words[3] and (string.sub(words[3], 1, 1) == '/'
+          or string.sub(words[3], 1, 1) == 'm') then
+        cur_rule['type'] = 'message'
+        cur_rule['re_expr'] = words_to_re(words, 2)
+        cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+        cur_rule['raw'] = true
+        if cur_rule['re'] then
+          valid_rule = true
+        else
+          table.insert(complicated, l) -- regexp did not compile: keep the rule for SA
+        end
+      else
+        table.insert(complicated, l)
+        return
+      end
+    elseif words[1] == "uri" then
+      -- uri SYMBOL /regexp/
+      if valid_rule then
+        insert_cur_rule()
+      end
+      cur_rule['type'] = 'uri'
+      cur_rule['symbol'] = words[2]
+      cur_rule['re_expr'] = words_to_re(words, 2)
+      cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+      if cur_rule['re'] and cur_rule['symbol'] then
+        valid_rule = true
+      else
+        table.insert(complicated, l)
+        return
+      end
+    elseif words[1] == "meta" then
+      -- meta SYMBOL expression
+      if valid_rule then
+        insert_cur_rule()
+      end
+      table.insert(complicated, l)
+      return
+    elseif words[1] == "describe" and valid_rule then
+      cur_rule['description'] = words_to_re(words, 2)
+    elseif words[1] == "score" then
+      scores[words[2]] = parse_score(words)
+    else
+      table.insert(complicated, l)
+      return
+    end
+    end)()
+  end
+  if valid_rule then
+    insert_cur_rule()
+  end
+end
+
+for _,matched in ipairs(arg) do -- every command line argument is an SA rules file
+  local f, err = io.open(matched, "r")
+  if f then
+    rspamd_logger.messagex(rspamd_config, 'loading SA rules from %s', matched)
+    process_sa_conf(f)
+    f:close() -- release the handle before moving on to the next file
+  else
+    rspamd_logger.errx(rspamd_config, "cannot open %1: %2", matched, err)
+  end
+end
+local multimap_conf = {}
+
+-- Writes one regexp map file for a group of converted rules and registers the
+-- matching multimap rule in `multimap_conf`.
+-- @param what rule type: 'sabody', 'sarawbody', 'full'/'message', 'uri', 'header'
+-- @param syms table of symbol name -> rule (only rule.re is read here)
+-- @param hdr header name, used only when what == 'header'
+local function handle_rule(what, syms, hdr)
+  -- `header` must be local: as a global it leaked into later non-header rules
+  local mtype, filter, fname, header
+  local sym = what:upper()
+  if what == 'sabody' then
+    mtype = 'content'
+    fname = 'body_re.map'
+    filter = 'oneline'
+  elseif what == 'sarawbody' then
+    fname = 'raw_body_re.map'
+    mtype = 'content'
+    filter = 'rawtext'
+  elseif what == 'full' or what == 'message' then
+    -- process_sa_conf stores `full` rules under the 'message' type
+    fname = 'full_re.map'
+    mtype = 'content'
+    filter = 'full'
+  elseif what == 'uri' then
+    fname = 'uri_re.map'
+    mtype = 'url'
+    filter = 'full'
+  elseif what == 'header' then
+    fname = ('hdr_' .. hdr .. '_re.map'):lower()
+    mtype = 'header'
+    header = hdr
+    sym = sym .. '_' .. hdr:upper()
+  else
+    rspamd_logger.errx('unknown type: %s', what)
+    return
+  end
+
+  local re_file, err = io.open(fname, 'w')
+  if not re_file then
+    rspamd_logger.errx('cannot open %s for writing: %s', fname, err)
+    return
+  end
+
+  local conf = {
+    type = mtype, filter = filter, symbol = 'SA_MAP_AUTO_' .. sym,
+    regexp = true, map = fname, header = header, symbols = {}
+  }
+  for k,r in pairs(syms) do
+    -- Rules without a recorded score are written with a score of 0
+    re_file:write(string.format('/%s/ %s:%f\n', tostring(r.re), k, scores[k] or 0.0))
+    table.insert(conf.symbols, k)
+  end
+  re_file:close()
+
+  multimap_conf[sym:lower()] = conf
+  rspamd_logger.messagex('stored %s regexp in %s', sym:lower(), fname)
+end
+
+for k,v in pairs(rules) do -- k is the rule type, v the rules collected for it
+  if k == 'header' then
+    for h,r in pairs(v) do -- header rules are grouped by header name: one map per header
+      handle_rule(k, r, h)
+    end
+  else
+    handle_rule(k, v) -- other types map symbol -> rule directly
+  end
+end
+
+local out = ucl.to_format(multimap_conf, 'ucl') -- serialize the collected multimap rules
+local mmap_conf = assert(io.open('auto_multimap.conf', 'w')) -- fail with the OS error text
+mmap_conf:write(out)
+mmap_conf:close()
+rspamd_logger.messagex('stored multimap conf in %s', 'auto_multimap.conf')
+
+local sa_remain = assert(io.open('auto_sa.conf', 'w'))
+fun.each(function(l)
+  sa_remain:write(l, '\n') -- lines were stored without a newline; restore it so rules stay separate
+end, fun.filter(function(l) return not string.match(l, '^%s+$') end, complicated))
+sa_remain:close()
+rspamd_logger.messagex('stored sa remains conf in %s', 'auto_sa.conf')


More information about the Commits mailing list