commit 5f09ef5: [Rework] Url_redirector: Rewrite plugin

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Aug 22 16:21:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-22 15:07:34 +0100
URL: https://github.com/rspamd/rspamd/commit/5f09ef5b8230106adbfbee5c86f830adbe185243

[Rework] Url_redirector: Rewrite plugin

---
 src/plugins/lua/url_redirector.lua | 161 +++++++++++++++++++++++++++----------
 1 file changed, 119 insertions(+), 42 deletions(-)

diff --git a/src/plugins/lua/url_redirector.lua b/src/plugins/lua/url_redirector.lua
index 6d6183d53..8de358bff 100644
--- a/src/plugins/lua/url_redirector.lua
+++ b/src/plugins/lua/url_redirector.lua
@@ -21,7 +21,10 @@ end
 local rspamd_logger = require "rspamd_logger"
 local rspamd_http = require "rspamd_http"
 local hash = require "rspamd_cryptobox_hash"
+local rspamd_url = require "rspamd_url"
 local lua_util = require "lua_util"
+local lua_redis = require "lua_redis"
+local N = "url_redirector"
 
 -- Some popular UA
 local default_ua = {
@@ -36,7 +39,7 @@ local default_ua = {
 }
 
 local redis_params
-local N = 'url_redirector'
+
 local settings = {
   expire = 86400, -- 1 day by default
   timeout = 10, -- 10 seconds by default
@@ -44,22 +47,49 @@ local settings = {
   --proxy = "http://example.com:3128", -- Send request through proxy
   key_prefix = 'rdr:', -- default hash name
   check_ssl = false, -- check ssl certificates
+  max_urls = 5, -- how many urls to check
   max_size = 10 * 1024, -- maximum body to process
   user_agent = default_ua,
+  redirector_symbol = nil, -- insert symbol if redirected url has been found
   redirectors_only = true, -- follow merely redirectors
   top_urls_key = 'rdr:top_urls', -- key for top urls
   top_urls_count = 200, -- how many top urls to save
+  redirector_hosts_map = nil -- check only those redirectors
 }
 
+local function adjust_url(task, orig_url, redir_url)
+  if type(redir_url) == 'string' then
+    redir_url = rspamd_url.create(task:get_mempool(), redir_url)
+  end
+
+  if redir_url then
+    orig_url:set_redirected(redir_url)
+    if settings.redirector_symbol then
+      task:insert_result(settings.redirector_symbol, 1.0,
+          string.format('%s->%s', orig_url:get_host(), redir_url:get_host()))
+    end
+  else
+    rspamd_logger.infox(task, 'bad url %s as redirection for %s', redir_url, orig_url)
+  end
+end
+
 local function cache_url(task, orig_url, url, key, param)
-  local function redis_trim_cb(err, data)
+  -- String representation
+  local str_orig_url = tostring(orig_url)
+  local str_url = tostring(url)
+
+  if str_url ~= str_orig_url then
+    -- Set redirected url
+    adjust_url(task, orig_url, url)
+  end
+
+  local function redis_trim_cb(err, _)
     if err then
       rspamd_logger.errx(task, 'got error while getting top urls count: %s', err)
     else
       rspamd_logger.infox(task, 'trimmed url set to %s elements',
         settings.top_urls_count)
     end
-    rspamd_plugins.surbl.continue_process(url, param)
   end
 
   -- Cleanup logic
@@ -69,7 +99,7 @@ local function cache_url(task, orig_url, url, key, param)
     else
       if data then
         if tonumber(data) > settings.top_urls_count * 2 then
-          local ret = rspamd_redis_make_request(task,
+          local ret = lua_redis.redis_make_request(task,
             redis_params, -- connect params
             key, -- hash key
             true, -- is write
@@ -80,7 +110,6 @@ local function cache_url(task, orig_url, url, key, param)
           )
           if not ret then
             rspamd_logger.errx(task, 'cannot trim top urls set')
-            rspamd_plugins.surbl.continue_process(url, param)
           else
             rspamd_logger.infox(task, 'need to trim urls set from %s to %s elements',
               data,
@@ -90,8 +119,6 @@ local function cache_url(task, orig_url, url, key, param)
         end
       end
     end
-
-    rspamd_plugins.surbl.continue_process(url, param)
   end
 
   local function redis_set_cb(err, _)
@@ -108,12 +135,11 @@ local function cache_url(task, orig_url, url, key, param)
       )
       if not ret then
         rspamd_logger.errx(task, 'cannot make redis request to cache results')
-        rspamd_plugins.surbl.continue_process(url, param)
       end
     end
   end
 
-  local ret,conn,_ = rspamd_redis_make_request(task,
+  local ret,conn,_ = lua_redis.redis_make_request(task,
     redis_params, -- connect params
     key, -- hash key
     true, -- is write
@@ -129,22 +155,25 @@ local function cache_url(task, orig_url, url, key, param)
   end
 end
 
-local function resolve_cached(task, orig_url, url, key, param, ntries)
+-- Resolve maybe cached url
+-- Orig url is the original url object
+-- url should be a new url object...
+local function resolve_cached(task, orig_url, url, key, ntries)
   local function resolve_url()
     if ntries > settings.nested_limit then
       -- We cannot resolve more, stop
-      rspamd_logger.infox(task, 'cannot get more requests to resolve %s, stop on %s after %s attempts',
+      rspamd_logger.debugm(N, task, 'cannot get more requests to resolve %s, stop on %s after %s attempts',
         orig_url, url, ntries)
-      cache_url(task, orig_url, url, key, param)
+      cache_url(task, orig_url, url, key)
 
       return
     end
 
-    local function http_callback(err, code, body, headers)
+    local function http_callback(err, code, _, headers)
       if err then
         rspamd_logger.infox(task, 'found redirect error from %s to %s, err message: %s',
           orig_url, url, err)
-        cache_url(task, orig_url, url, key, param)
+        cache_url(task, orig_url, url, key)
       else
         if code == 200 then
           if orig_url == url then
@@ -155,32 +184,37 @@ local function resolve_cached(task, orig_url, url, key, param, ntries)
               orig_url, url)
           end
 
-          cache_url(task, orig_url, url, key, param)
+          cache_url(task, orig_url, url, key)
 
         elseif code == 301 or code == 302 then
           local loc = headers['location']
-          rspamd_logger.infox(task, 'found redirect from %s to %s, err code %s',
-            orig_url, loc, code)
+          local redir_url
           if loc then
+            redir_url = rspamd_url.create(task:get_mempool(), loc)
+          end
+          rspamd_logger.debugm(N, task, 'found redirect from %s to %s, err code %s',
+            orig_url, loc, code)
+
+          if redir_url then
             if settings.redirectors_only then
-              if rspamd_plugins.surbl.is_redirector(task, loc) then
-                resolve_cached(task, orig_url, loc, key, param, ntries + 1)
+              if settings.redirector_hosts_map:get_key(redir_url:get_host()) then
+                resolve_cached(task, orig_url, redir_url, key, ntries + 1)
               else
                 lua_util.debugm(N, task,
                   "stop resolving redirects as %s is not a redirector", loc)
-                cache_url(task, orig_url, loc, key, param)
+                cache_url(task, orig_url, redir_url, key)
               end
             else
-              resolve_cached(task, orig_url, loc, key, param, ntries + 1)
+              resolve_cached(task, orig_url, redir_url, key, ntries + 1)
             end
           else
-            rspamd_logger.infox(task, "no location, headers: %s", headers)
-            cache_url(task, orig_url, url, key, param)
+            rspamd_logger.debugm(N, task, "no location, headers: %s", headers)
+            cache_url(task, orig_url, url, key)
           end
         else
-          rspamd_logger.infox(task, 'found redirect error from %s to %s, err code: %s',
+          rspamd_logger.debugm(N, task, 'found redirect error from %s to %s, err code: %s',
             orig_url, url, code)
-          cache_url(task, orig_url, url, key, param)
+          cache_url(task, orig_url, url, key)
         end
       end
     end
@@ -192,11 +226,13 @@ local function resolve_cached(task, orig_url, url, key, param, ntries)
       ua = settings.user_agent[math.random(#settings.user_agent)]
     end
 
+    lua_util.debugm(N, task, 'select user agent %s', ua)
+
     rspamd_http.request{
       headers = {
         ['User-Agent'] = ua,
       },
-      url = url,
+      url = tostring(url),
       task = task,
       method = 'head',
       max_size = settings.max_size,
@@ -211,9 +247,11 @@ local function resolve_cached(task, orig_url, url, key, param, ntries)
       if type(data) == 'string' then
         if data ~= 'processing' then
           -- Got cached result
-          rspamd_logger.infox(task, 'found cached redirect from %s to %s',
+          rspamd_logger.debugm(N, task, 'found cached redirect from %s to %s',
             url, data)
-          rspamd_plugins.surbl.continue_process(data, param)
+          if data ~= tostring(orig_url) then
+            adjust_url(task, orig_url, data)
+          end
           return
         end
       end
@@ -222,13 +260,13 @@ local function resolve_cached(task, orig_url, url, key, param, ntries)
       if nerr then
         rspamd_logger.errx(task, 'got error while setting redirect keys: %s', nerr)
       elseif ndata == 'OK' then
-        orig_url = url
         resolve_url()
       end
     end
 
-    if orig_url == url then
-      local ret = rspamd_redis_make_request(task,
+    if ntries == 1 then
+      -- Reserve key in Redis that we are processing this redirection
+      local ret = lua_redis.redis_make_request(task,
         redis_params, -- connect params
         key, -- hash key
         true, -- is write
@@ -240,11 +278,12 @@ local function resolve_cached(task, orig_url, url, key, param, ntries)
         rspamd_logger.errx(task, 'Couldn\'t schedule SET')
       end
     else
+      -- Just continue resolving
       resolve_url()
     end
 
   end
-  local ret = rspamd_redis_make_request(task,
+  local ret = lua_redis.redis_make_request(task,
     redis_params, -- connect params
     key, -- hash key
     false, -- is write
@@ -257,28 +296,66 @@ local function resolve_cached(task, orig_url, url, key, param, ntries)
   end
 end
 
-local function url_redirector_handler(task, url, param)
+local function url_redirector_process_url(task, url)
   local url_str = url:get_raw()
   -- 32 base32 characters are roughly 20 bytes of data or 160 bits
   local key = settings.key_prefix .. hash.create(url_str):base32():sub(1, 32)
-  resolve_cached(task, url_str, url_str, key, param, 1)
+  resolve_cached(task, url, url, key, 1)
+end
+
+local function url_redirector_handler(task)
+  local sp_urls = lua_util.extract_specific_urls({
+    task = task,
+    limit = settings.max_urls,
+    filter = function(url)
+      local host = url:get_host()
+      if settings.redirector_hosts_map:get_key(host) then
+        lua_util.debugm(N, task, 'check url %s', tostring(url))
+        return true
+      end
+    end,
+    no_cache = true,
+  })
+
+  if sp_urls then
+    for _,u in ipairs(sp_urls) do
+      url_redirector_process_url(task, u)
+    end
+  end
 end
 
 local opts =  rspamd_config:get_all_opt('url_redirector')
 if opts then
-  for k,v in pairs(opts) do
-    settings[k] = v
-  end
-  redis_params = rspamd_parse_redis_server('url_redirector')
+  settings = lua_util.override_defaults(settings, opts)
+  redis_params = lua_redis.parse_redis_server('url_redirector', settings)
+
   if not redis_params then
     rspamd_logger.infox(rspamd_config, 'no servers are specified, disabling module')
     lua_util.disable_module(N, "redis")
   else
-    if rspamd_plugins.surbl then
-      rspamd_plugins.surbl.register_redirect(rspamd_config, url_redirector_handler)
+
+    if not settings.redirector_hosts_map then
+      rspamd_logger.infox(rspamd_config, 'no redirector_hosts_map option is specified, disabling module')
+      lua_util.disable_module(N, "config")
     else
-      rspamd_logger.infox(rspamd_config, 'surbl module is not enabled, disabling module')
-      lua_util.disable_module(N, "fail")
+      local lua_maps = require "lua_maps"
+      settings.redirector_hosts_map = lua_maps.map_add_from_ucl(settings.redirector_hosts_map,
+          'set', 'Redirectors definitions')
+
+      local id = rspamd_config:register_symbol{
+        name = 'URL_REDIRECTOR_CHECK',
+        type = 'callback,prefilter',
+        callback = url_redirector_handler,
+      }
+
+      if settings.redirector_symbol then
+        rspamd_config:register_symbol{
+          name = settings.redirector_symbol,
+          type = 'virtual',
+          parent = id,
+          score = 0,
+        }
+      end
     end
   end
 end


More information about the Commits mailing list