commit ddbbe07: [Fix] More fixes to extract_specific_urls

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Aug 19 15:56:18 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-19 16:49:19 +0100
URL: https://github.com/rspamd/rspamd/commit/ddbbe07faf9e5f01f3b9c984a01551e82633af42

[Fix] More fixes to extract_specific_urls

---
 lualib/lua_selectors/extractors.lua |  1 +
 lualib/lua_util.lua                 | 55 +++++++++++++++++++++++++------------
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/lualib/lua_selectors/extractors.lua b/lualib/lua_selectors/extractors.lua
index 993169708..1d06794a1 100644
--- a/lualib/lua_selectors/extractors.lua
+++ b/lualib/lua_selectors/extractors.lua
@@ -283,6 +283,7 @@ e.g. `get_tld`]],
     ['get_value'] = function(task, args)
       local params = args[1] or {}
       params.task = task
+      params.no_cache = true
       local urls = lua_util.extract_specific_urls(params)
       return urls,'userdata_list'
     end,
diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua
index 9bc42fd62..cde09ad6a 100644
--- a/lualib/lua_util.lua
+++ b/lualib/lua_util.lua
@@ -668,6 +668,18 @@ exports.filter_specific_urls = function (urls, params)
   local ntlds, neslds = 0, 0
 
   local res = {}
+  local nres = 0
+
+  local function insert_url(str, u)
+    if not res[str] then
+      res[str] = u
+      nres = nres + 1
+
+      return true
+    end
+
+    return false
+  end
 
   local function process_single_url(u)
     local esld = u:get_tld()
@@ -682,13 +694,15 @@ exports.filter_specific_urls = function (urls, params)
       end
     end
 
+    local str_hash = tostring(u)
+
     if esld then
       if not eslds[esld] then
-        eslds[esld] = {u}
+        eslds[esld] = {{str_hash, u}}
         neslds = neslds + 1
       else
         if #eslds[esld] < params.esld_limit then
-          table.insert(eslds[esld], u)
+          table.insert(eslds[esld], {str_hash, u})
         end
       end
 
@@ -696,21 +710,21 @@ exports.filter_specific_urls = function (urls, params)
       local tld = table.concat(fun.totable(fun.tail(parts)), '.')
 
       if not tlds[tld] then
-        tlds[tld] = {u}
+        tlds[tld] = {{str_hash, u}}
         ntlds = ntlds + 1
       else
-        table.insert(tlds[tld], u)
+        table.insert(tlds[tld], {str_hash, u})
       end
 
-      -- Extract priority urls that are proven to be malicious
-      if not u:is_html_displayed() then
+      -- Special cases
+      if not u:get_protocol() == 'mailto' and not u:is_html_displayed() then
         if u:is_obscured() then
-          table.insert(res, u)
+          insert_url(str_hash, u)
         else
           if u:get_user() then
-            table.insert(res, u)
+            insert_url(str_hash, u)
           elseif u:is_subject() or u:is_phished() then
-            table.insert(res, u)
+            insert_url(str_hash, u)
           end
         end
       end
@@ -722,7 +736,7 @@ exports.filter_specific_urls = function (urls, params)
   end
 
   local limit = params.limit
-  limit = limit - #res
+  limit = limit - nres
   if limit <= 0 then limit = 1 end
 
   if neslds <= limit then
@@ -732,7 +746,8 @@ exports.filter_specific_urls = function (urls, params)
 
       for _,lurls in pairs(eslds) do
         if #lurls > 0 then
-          table.insert(res, table.remove(lurls))
+          local last = table.remove(lurls)
+          insert_url(last[1], last[2])
           limit = limit - 1
           item_found = true
         end
@@ -740,8 +755,9 @@ exports.filter_specific_urls = function (urls, params)
 
     until limit <= 0 or not item_found
 
+    res = exports.values(res)
     if params.task and not params.no_cache then
-      params.task:cache_set(cache_key, urls)
+      params.task:cache_set(cache_key, res)
     end
     return res
   end
@@ -750,14 +766,16 @@ exports.filter_specific_urls = function (urls, params)
     while limit > 0 do
       for _,lurls in pairs(tlds) do
         if #lurls > 0 then
-          table.insert(res, table.remove(lurls))
+          local last = table.remove(lurls)
+          insert_url(last[1], last[2])
           limit = limit - 1
         end
       end
     end
 
+    res = exports.values(res)
     if params.task and not params.no_cache then
-      params.task:cache_set(cache_key, urls)
+      params.task:cache_set(cache_key, res)
     end
     return res
   end
@@ -774,11 +792,13 @@ exports.filter_specific_urls = function (urls, params)
     local tld1 = tlds[tlds_keys[i]]
     local tld2 = tlds[tlds_keys[ntlds - i]]
     if #tld1 > 0 then
-      table.insert(res, table.remove(tld1))
+      local last = table.remove(tld1)
+      insert_url(last[1], last[2])
       limit = limit - 1
     end
     if #tld2 > 0 then
-      table.insert(res, table.remove(tld2))
+      local last = table.remove(tld2)
+      insert_url(last[1], last[2])
       limit = limit - 1
     end
 
@@ -787,8 +807,9 @@ exports.filter_specific_urls = function (urls, params)
     end
   end
 
+  res = exports.values(res)
   if params.task and not params.no_cache then
-    params.task:cache_set(cache_key, urls)
+    params.task:cache_set(cache_key, res)
   end
 
   return res


More information about the Commits mailing list