commit 58a5e0c: [Rework] Reorganise selectors implementation

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Aug 19 08:49:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-19 09:45:28 +0100
URL: https://github.com/rspamd/rspamd/commit/58a5e0c7039af50b0607c5f4aee8648d1d560cdc (HEAD -> master)

[Rework] Reorganise selectors implementation

---
 lualib/lua_selectors.lua                           | 1212 --------------------
 lualib/lua_selectors/extractors.lua                |  374 ++++++
 lualib/lua_selectors/init.lua                      |  496 ++++++++
 .../{lua_ffi/common.lua => lua_selectors/maps.lua} |   30 +-
 lualib/lua_selectors/transforms.lua                |  395 +++++++
 test/lua/unit/selectors.combined.lua               |   14 +-
 test/lua/unit/selectors.negative.lua               |    4 +-
 7 files changed, 1276 insertions(+), 1249 deletions(-)

diff --git a/lualib/lua_selectors.lua b/lualib/lua_selectors.lua
deleted file mode 100644
index c123ae637..000000000
--- a/lualib/lua_selectors.lua
+++ /dev/null
@@ -1,1212 +0,0 @@
---[[
-Copyright (c) 2018, Vsevolod Stakhov <vsevolod at highsecure.ru>
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-]]--
-
--- This module contains 'selectors' implementation: code to extract data
--- from Rspamd tasks and compose those together
---
--- Read more at https://rspamd.com/doc/configuration/selectors.html
-
---[[[
--- @module lua_selectors
--- This module contains 'selectors' implementation: code to extract data
--- from Rspamd tasks and compose those together.
--- Typical selector looks like this: header(User).lower.substring(1, 2):ip
---]]
-
-local exports = {
-  maps = {} -- Defined for selectors maps, must be indexed by name
-}
-
-local logger = require 'rspamd_logger'
-local fun = require 'fun'
-local lua_util = require "lua_util"
-local ts = require("tableshape").types
-local M = "selectors"
-local E = {}
-
-local extractors = {
-  -- Identity extractor: yields its own first argument (or an empty string)
-  ['id'] = {
-    ['get_value'] = function(_, args)
-      -- A missing argument collapses to the empty string
-      return args[1] or '', 'string'
-    end,
-    ['description'] = [[Return value from function's argument or an empty string,
-For example, `id('Something')` returns a string 'Something']],
-    ['args_schema'] = {ts.string:is_optional()}
-  },
-  -- List counterpart of `id`: yields its arguments as a list of strings
-  ['list'] = {
-    ['get_value'] = function(_, args)
-      if not args[1] then
-        return {}, 'string_list'
-      end
-
-      -- Every argument is passed through tostring
-      return fun.map(tostring, args), 'string_list'
-    end,
-    ['description'] = [[Return a list from function's arguments or an empty list,
-For example, `list('foo', 'bar')` returns a list {'foo', 'bar'}]],
-  },
-  -- Get source IP address (only when the task has a valid one)
-  ['ip'] = {
-    ['get_value'] = function(task)
-      local addr = task:get_ip()
-
-      if not addr or not addr:is_valid() then
-        return nil
-      end
-
-      return addr, 'userdata'
-    end,
-    ['description'] = [[Get source IP address]],
-  },
-  -- Get the first sender entry (MIME or SMTP, any type by default)
-  ['from'] = {
-    ['get_value'] = function(task, args)
-      local senders = task:get_from(args[1] or 0)
-      -- `E` stands in for a missing list or a missing first element
-      local first = (senders or E)[1] or E
-
-      if not first.addr then
-        return nil
-      end
-
-      return senders[1], 'table'
-    end,
-    ['description'] = [[Get MIME or SMTP from (e.g. `from('smtp')` or `from('mime')`,
-uses any type by default)]],
-  },
-  -- Get the list of recipients (MIME or SMTP, any type by default)
-  ['rcpts'] = {
-    ['get_value'] = function(task, args)
-      local recipients = task:get_recipients(args[1] or 0)
-      -- Only the first entry is probed for an address; `E` covers a missing
-      -- list or a missing first element
-      local first = (recipients or E)[1] or E
-
-      if not first.addr then
-        return nil
-      end
-
-      return recipients, 'table_list'
-    end,
-    ['description'] = [[Get MIME or SMTP rcpts (e.g. `rcpts('smtp')` or `rcpts('mime')`,
-uses any type by default)]],
-  },
-  -- Get country: reads the `country` mempool variable
-  -- (ASN module must be executed first)
-  ['country'] = {
-    ['get_value'] = function(task)
-      local value = task:get_mempool():get_variable('country')
-
-      if value then
-        return value, 'string'
-      end
-
-      return nil
-    end,
-    ['description'] = [[Get country (ASN module must be executed first)]],
-  },
-  -- Get AS number: reads the `asn` mempool variable
-  -- (ASN module must be executed first)
-  ['asn'] = {
-    ['type'] = 'string',
-    ['get_value'] = function(task)
-      local value = task:get_mempool():get_variable('asn')
-
-      if value then
-        return value, 'string'
-      end
-
-      return nil
-    end,
-    ['description'] = [[Get AS number (ASN module must be executed first)]],
-  },
-  -- Get authenticated user name, if the task has one
-  ['user'] = {
-    ['get_value'] = function(task)
-      local name = task:get_user()
-
-      if name then
-        return name, 'string'
-      end
-
-      return nil
-    end,
-    ['description'] = 'Get authenticated user name',
-  },
-  -- Get principal recipient as reported by the task
-  ['to'] = {
-    ['get_value'] = function(task)
-      local rcpt = task:get_principal_recipient()
-
-      return rcpt, 'string'
-    end,
-    ['description'] = 'Get principal recipient',
-  },
-  -- Get content digest as reported by the task
-  ['digest'] = {
-    ['get_value'] = function(task)
-      local value = task:get_digest()
-
-      return value, 'string'
-    end,
-    ['description'] = 'Get content digest',
-  },
-  -- Get list of all attachments digests. Only parts that have a filename are
-  -- treated as attachments. Without arguments the digest reported by the part
-  -- itself is used; otherwise the raw parsed content is hashed with the
-  -- requested hash type and encoded as requested.
-  ['attachments'] = {
-    ['get_value'] = function(task, args)
-      local parts = task:get_parts() or E
-      local digests = {}
-
-      -- Test the arguments themselves rather than `#args`: the length operator
-      -- is unspecified when only the second (hash type) argument is set
-      if args[1] or args[2] then
-        local rspamd_cryptobox = require "rspamd_cryptobox_hash"
-        local encoding = args[1] or 'hex'
-        local ht = args[2] or 'blake2'
-
-        for _,p in ipairs(parts) do
-          if p:get_filename() then
-            local h = rspamd_cryptobox.create_specific(ht, p:get_content('raw_parsed'))
-            -- Scoped per part, so an unrecognised encoding can never re-insert
-            -- the digest computed for the previous attachment
-            local s
-
-            if encoding == 'hex' then
-              s = h:hex()
-            elseif encoding == 'base32' then
-              s = h:base32()
-            elseif encoding == 'base64' then
-              s = h:base64()
-            end
-
-            if s then
-              table.insert(digests, s)
-            end
-          end
-        end
-      else
-        for _,p in ipairs(parts) do
-          if p:get_filename() then
-            table.insert(digests, p:get_digest())
-          end
-        end
-      end
-
-      if #digests > 0 then
-        return digests,'string_list'
-      end
-
-      return nil
-    end,
-    ['description'] = [[Get list of all attachments digests.
-The first optional argument is encoding (`hex`, `base32`, `base64`),
-the second optional argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`, `md5`)]],
-
-    ['args_schema'] = {ts.one_of{'hex', 'base32', 'base64'}:is_optional(),
-                       ts.one_of{'blake2', 'sha256', 'sha1', 'sha512', 'md5'}:is_optional()}
-
-  },
-  -- Get file names of all parts that have one
-  ['files'] = {
-    ['get_value'] = function(task)
-      local names = {}
-
-      for _, part in ipairs(task:get_parts() or E) do
-        local name = part:get_filename()
-        if name then
-          names[#names + 1] = name
-        end
-      end
-
-      -- An empty result is reported as nil rather than as an empty list
-      if #names == 0 then
-        return nil
-      end
-
-      return names, 'string_list'
-    end,
-    ['description'] = 'Get all attachments files',
-  },
-  -- Get languages of all text parts that report one
-  ['languages'] = {
-    ['get_value'] = function(task)
-      local found = {}
-
-      for _, part in ipairs(task:get_text_parts() or E) do
-        local lang = part:get_language()
-        if lang then
-          found[#found + 1] = lang
-        end
-      end
-
-      -- An empty result is reported as nil rather than as an empty list
-      if #found == 0 then
-        return nil
-      end
-
-      return found, 'string_list'
-    end,
-    ['description'] = 'Get languages for text parts',
-  },
-  -- Get helo value as reported by the task
-  ['helo'] = {
-    ['get_value'] = function(task)
-      local helo = task:get_helo()
-
-      return helo, 'string'
-    end,
-    ['description'] = 'Get helo value',
-  },
-  -- Get header with the name that is expected as an argument. The optional
-  -- flags argument may request full header tables (`full`) and/or a case
-  -- sensitive name match (`strong`)
-  ['header'] = {
-    ['get_value'] = function(task, args)
-      local name, flags = args[1], args[2]
-
-      -- No flags: plain lookup with the task's default matching
-      if not flags then
-        return task:get_header(name), 'string'
-      end
-
-      local strong = flags:match('strong') ~= nil
-
-      if flags:match('full') then
-        return task:get_header_full(name, strong), 'table_list'
-      end
-
-      return task:get_header(name, strong), 'string'
-    end,
-    ['description'] = [[Get header with the name that is expected as an argument.
-The optional second argument accepts list of flags:
-  - `full`: returns all headers with this name with all data (like task:get_header_full())
-  - `strong`: use case sensitive match when matching header's name]],
-    ['args_schema'] = {ts.string,
-                       (ts.pattern("strong") + ts.pattern("full")):is_optional()}
-  },
-  -- Get list of received headers: either the tables themselves or, when a
-  -- field name is given, that field picked from every table
-  ['received'] = {
-    ['get_value'] = function(task, args)
-      local hdrs = task:get_received_headers()
-      local field = args[1]
-
-      if not (field and hdrs) then
-        return hdrs, 'table_list'
-      end
-
-      local function pick(rcvd)
-        return rcvd[field]
-      end
-
-      return fun.map(pick, hdrs), 'string_list'
-    end,
-    ['description'] = [[Get list of received headers.
-If no arguments specified, returns list of tables. Otherwise, selects a specific element,
-e.g. `by_hostname`]],
-  },
-  -- Get all urls: either the url objects themselves or, when a method name is
-  -- given, the result of calling that method on every url
-  ['urls'] = {
-    ['get_value'] = function(task, args)
-      local found = task:get_urls()
-      local method = args[1]
-
-      if not (method and found) then
-        return found, 'userdata_list'
-      end
-
-      local function call_method(url)
-        return url[method](url)
-      end
-
-      return fun.map(call_method, found), 'string_list'
-    end,
-    ['description'] = [[Get list of all urls.
-If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
-e.g. `get_tld`]],
-  },
-  -- Get all emails: either the objects themselves or, when a method name is
-  -- given, the result of calling that method on every one of them
-  ['emails'] = {
-    ['get_value'] = function(task, args)
-      local found = task:get_emails()
-      local method = args[1]
-
-      if not (method and found) then
-        return found, 'userdata_list'
-      end
-
-      local function call_method(email)
-        return email[method](email)
-      end
-
-      return fun.map(call_method, found), 'string_list'
-    end,
-    ['description'] = [[Get list of all emails.
-If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
-e.g. `get_user`]],
-  },
-  -- Get specific pool var. The first argument must be variable name,
-  -- the second argument is optional and defines the type (string by default).
-  -- The requested type is also reported as the type of the extracted value
-  ['pool_var'] = {
-    ['get_value'] = function(task, args)
-      local name = args[1]
-      -- Named `var_type` so that the builtin `type` function is not shadowed
-      local var_type = args[2] or 'string'
-      local pool = task:get_mempool()
-
-      return pool:get_variable(name, var_type), var_type
-    end,
-    ['description'] = [[Get specific pool var. The first argument must be variable name,
-the second argument is optional and defines the type (string by default)]],
-    ['args_schema'] = {ts.string, ts.string:is_optional()}
-  },
-  -- Get specific HTTP request header, converted to a string.
-  -- The first argument must be header name.
-  ['request_header'] = {
-    ['get_value'] = function(task, args)
-      local value = task:get_request_header(args[1])
-
-      if not value then
-        return nil
-      end
-
-      return tostring(value), 'string'
-    end,
-    ['description'] = [[Get specific HTTP request header.
-The first argument must be header name.]],
-    ['args_schema'] = {ts.string}
-  },
-  -- Get task date, optionally formatted. The date is requested in GMT and
-  -- returned as a string: either formatted with os.date (second argument) or
-  -- converted with tostring
-  ['time'] = {
-    ['get_value'] = function(task, args)
-      -- The selector falls back to the `message` date when no type is given
-      local what = args[1] or 'message'
-      local dt = task:get_date{format = what, gmt = true}
-
-      if dt then
-        if args[2] then
-          -- Should be in format !xxx, as dt is in GMT
-          return os.date(args[2], dt),'string'
-        end
-
-        return tostring(dt),'string'
-      end
-
-      return nil
-    end,
-    ['description'] = [[Get task timestamp. The first argument is type:
-  - `connect`: connection timestamp
-  - `message`: timestamp as defined by `Date` header (default)
-
-  The second argument is optional time format, see [os.date](http://pgl.yoyo.org/luai/i/os.date) description]],
-    ['args_schema'] = {ts.one_of{'connect', 'message'}:is_optional(),
-                       ts.string:is_optional()}
-  },
-  -- Get words from all text parts of a message, flattened into one list
-  ['words'] = {
-    ['get_value'] = function(task, args)
-      local how = args[1] or 'stem'
-      local text_parts = task:get_text_parts()
-
-      if not text_parts then
-        return nil
-      end
-
-      -- `full` yields tables, every other mode yields plain strings
-      local rtype = (how == 'full') and 'table_list' or 'string_list'
-
-      local function part_words(part)
-        return part:get_words(how)
-      end
-
-      return lua_util.flatten(fun.map(part_words, text_parts)), rtype
-    end,
-    ['description'] = [[Get words from text parts
-  - `stem`: stemmed words (default)
-  - `raw`: raw words
-  - `norm`: normalised words (lowercased)
-  - `full`: list of tables
-  ]],
-    ['args_schema'] = { ts.one_of { 'stem', 'raw', 'norm', 'full' }:is_optional()},
-  },
-}
-
--- Strips the `_list` suffix from a type name, e.g. `string_list` -> `string`.
--- Yields nil when the name does not end with `_list`.
-local function pure_type(ltype)
-  local element_type = ltype:match('^(.*)_list$')
-
-  return element_type
-end
-
-local transform_function = {
-  -- Returns the lowercased string
-  ['lower'] = {
-    ['types'] = {
-      ['string'] = true,
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, _)
-      return inp:lower(),'string'
-    end,
-    ['description'] = 'Returns the lowercased string',
-  },
-  -- Returns the first element
-  ['first'] = {
-    ['types'] = {
-      ['list'] = true,
-    },
-    ['process'] = function(inp, t)
-      return fun.head(inp),pure_type(t)
-    end,
-    ['description'] = 'Returns the first element',
-  },
-  -- Returns the last element
-  ['last'] = {
-    ['types'] = {
-      ['list'] = true,
-    },
-    ['process'] = function(inp, t)
-      return fun.nth(#inp, inp),pure_type(t)
-    end,
-    ['description'] = 'Returns the last element',
-  },
-  -- Returns the nth element
-  ['nth'] = {
-    ['types'] = {
-      ['list'] = true,
-    },
-    ['process'] = function(inp, t, args)
-      return fun.nth(args[1] or 1, inp),pure_type(t)
-    end,
-    ['description'] = 'Returns the nth element',
-    ['args_schema'] = {ts.number + ts.string / tonumber}
-  },
-  ['take_n'] = {
-    ['types'] = {
-      ['list'] = true,
-    },
-    ['process'] = function(inp, t, args)
-      return fun.take_n(args[1] or 1, inp),t
-    end,
-    ['description'] = 'Returns the n first elements',
-    ['args_schema'] = {ts.number + ts.string / tonumber}
-  },
-  ['drop_n'] = {
-    ['types'] = {
-      ['list'] = true,
-    },
-    ['process'] = function(inp, t, args)
-      return fun.drop_n(args[1] or 1, inp),t
-    end,
-    ['description'] = 'Returns list without the first n elements',
-    ['args_schema'] = {ts.number + ts.string / tonumber}
-  },
-  -- Joins strings into a single string using separator in the argument
-  ['join'] = {
-    ['types'] = {
-      ['string_list'] = true
-    },
-    ['process'] = function(inp, _, args)
-      return table.concat(fun.totable(inp), args[1] or ''), 'string'
-    end,
-    ['description'] = 'Joins strings into a single string using separator in the argument',
-    ['args_schema'] = {ts.string:is_optional()}
-  },
-  -- Sort strings
-  ['sort'] = {
-    ['types'] = {
-      ['list'] = true
-    },
-    ['process'] = function(inp, t, _)
-      table.sort(inp)
-      return inp, t
-    end,
-    ['description'] = 'Sort strings lexicographically',
-  },
-  -- Return unique elements based on hashing (can work without sorting)
-  ['uniq'] = {
-    ['types'] = {
-      ['list'] = true
-    },
-    ['process'] = function(inp, t, _)
-      local tmp = {}
-      fun.each(function(val)
-         tmp[val] = true
-      end, inp)
-
-      return fun.map(function(k, _) return k end, tmp), t
-    end,
-    ['description'] = 'Returns a list of unique elements (using a hash table)',
-  },
-  -- Create a digest from string or a list of strings
-  ['digest'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'hash',
-    ['process'] = function(inp, _, args)
-      local hash = require 'rspamd_cryptobox_hash'
-      local encoding = args[1] or 'hex'
-      local ht = args[2] or 'blake2'
-      local h = hash:create_specific(ht):update(inp)
-      local s
-
-      if encoding == 'hex' then
-        s = h:hex()
-      elseif encoding == 'base32' then
-        s = h:base32()
-      elseif encoding == 'base64' then
-        s = h:base64()
-      end
-
-      return s,'string'
-    end,
-    ['description'] = [[Create a digest from a string.
-The first argument is encoding (`hex`, `base32`, `base64`),
-the second argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`, `md5`)]],
-    ['args_schema'] = {ts.one_of{'hex', 'base32', 'base64'}:is_optional(),
-                       ts.one_of{'blake2', 'sha256', 'sha1', 'sha512', 'md5'}:is_optional()}
-  },
-  -- Extracts substring
-  ['substring'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, _, args)
-      local start_pos = args[1] or 1
-      local end_pos = args[2] or -1
-
-      return inp:sub(start_pos, end_pos), 'string'
-    end,
-    ['description'] = 'Extracts substring; the first argument is start, the second is the last (like in Lua)',
-    ['args_schema'] = {(ts.number + ts.string / tonumber):is_optional(),
-                       (ts.number + ts.string / tonumber):is_optional()}
-  },
-  -- Prepends a string or a strings list
-  ['prepend'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, _, args)
-      local prepend = table.concat(args, '')
-
-      return prepend .. inp, 'string'
-    end,
-    ['description'] = 'Prepends a string or a strings list',
-  },
-  -- Appends a string or a strings list
-  ['append'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, _, args)
-      local append = table.concat(args, '')
-
-      return inp .. append, 'string'
-    end,
-    ['description'] = 'Appends a string or a strings list',
-  },
-  -- Regexp matching
-  ['regexp'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, _, args)
-      local rspamd_regexp = require "rspamd_regexp"
-
-      local re = rspamd_regexp.create_cached(args[1])
-
-      if not re then
-        logger.errx('invalid regexp: %s', args[1])
-        return nil
-      end
-
-      local res = re:search(inp, false, true)
-
-      if res then
-        if #res == 1 then
-          return res[1],'string'
-        end
-
-        return res,'string_list'
-      end
-
-      return nil
-    end,
-    ['description'] = 'Regexp matching',
-    ['args_schema'] = {ts.string}
-  },
-  -- Returns a value if it exists in some map (or acts like a `filter` function)
-  ['filter_map'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, t, args)
-      local map = exports.maps[args[1]]
-
-      if not map then
-        logger.errx('invalid map name: %s', args[1])
-        return nil
-      end
-
-      local res = map:get_key(inp)
-
-      if res then
-        return inp,t
-      end
-
-      return nil
-    end,
-    ['description'] = 'Returns a value if it exists in some map (or acts like a `filter` function)',
-    ['args_schema'] = {ts.string}
-  },
-  -- Returns a value from some map corresponding to some key (or acts like a `map` function)
-  ['apply_map'] = {
-    ['types'] = {
-      ['string'] = true
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, t, args)
-      local map = exports.maps[args[1]]
-
-      if not map then
-        logger.errx('invalid map name: %s', args[1])
-        return nil
-      end
-
-      local res = map:get_key(inp)
-
-      if res then
-        return res,t
-      end
-
-      return nil
-    end,
-    ['description'] = 'Returns a value from some map corresponding to some key (or acts like a `map` function)',
-    ['args_schema'] = {ts.string}
-  },
-  -- Drops input value and return values from function's arguments or an empty string
-  ['id'] = {
-    ['types'] = {
-      ['string'] = true,
-      ['list'] = true,
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(_, _, args)
-      if args[1] and args[2] then
-        return fun.map(tostring, args),'string_list'
-      elseif args[1] then
-        return args[1],'string'
-      end
-
-      return '','string'
-    end,
-    ['description'] = 'Drops input value and return values from function\'s arguments or an empty string',
-    ['args_schema'] = (ts.string + ts.array_of(ts.string)):is_optional()
-  },
-  ['equal'] = {
-    ['types'] = {
-      ['string'] = true,
-    },
-    ['map_type'] = 'string',
-    ['process'] = function(inp, _, args)
-      if inp == args[1] then
-        return inp,'string'
-      end
-
-      return nil
-    end,
*** OUTPUT TRUNCATED, 1924 LINES SKIPPED ***


More information about the Commits mailing list