commit 2bccf65: [Rework] Lua_util: Another rework for extract_specific_urls

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Aug 20 09:21:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-20 10:13:52 +0100
URL: https://github.com/rspamd/rspamd/commit/2bccf655b4095d88ad334d00c3f4bf51135b8f39

[Rework] Lua_util: Another rework for extract_specific_urls

---
 lualib/lua_util.lua                              | 134 ++++++++++++-----------
 test/lua/unit/lua_util.extract_specific_urls.lua |   6 +-
 2 files changed, 75 insertions(+), 65 deletions(-)

diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua
index cde09ad6a..4dddb979f 100644
--- a/lualib/lua_util.lua
+++ b/lualib/lua_util.lua
@@ -682,6 +682,7 @@ exports.filter_specific_urls = function (urls, params)
   end
 
   local function process_single_url(u)
+    local priority = 1 -- Normal priority
     local esld = u:get_tld()
 
     if params.ignore_redirected and u:is_redirected() then
@@ -697,36 +698,40 @@ exports.filter_specific_urls = function (urls, params)
     local str_hash = tostring(u)
 
     if esld then
+      -- Special cases
+      if (u:get_protocol() ~= 'mailto') and (not u:is_html_displayed()) then
+        if u:is_obscured() then
+          priority = 2
+        else
+          if u:get_user() then
+            priority = 2
+          elseif u:is_subject() or u:is_phished() then
+            priority = 2
+          end
+        end
+      elseif u:is_html_displayed() then
+        priority = 0
+      end
+
       if not eslds[esld] then
-        eslds[esld] = {{str_hash, u}}
+        eslds[esld] = {{str_hash, u, priority}}
         neslds = neslds + 1
       else
         if #eslds[esld] < params.esld_limit then
-          table.insert(eslds[esld], {str_hash, u})
+          table.insert(eslds[esld], {str_hash, u, priority})
         end
       end
 
+
+      -- eSLD - 1 part => tld
       local parts = rspamd_str_split(esld, '.')
       local tld = table.concat(fun.totable(fun.tail(parts)), '.')
 
       if not tlds[tld] then
-        tlds[tld] = {{str_hash, u}}
+        tlds[tld] = {{str_hash, u, priority}}
         ntlds = ntlds + 1
       else
-        table.insert(tlds[tld], {str_hash, u})
-      end
-
-      -- Special cases
-      if not u:get_protocol() == 'mailto' and not u:is_html_displayed() then
-        if u:is_obscured() then
-          insert_url(str_hash, u)
-        else
-          if u:get_user() then
-            insert_url(str_hash, u)
-          elseif u:is_subject() or u:is_phished() then
-            insert_url(str_hash, u)
-          end
-        end
+        table.insert(tlds[tld], {str_hash, u, priority})
       end
     end
   end
@@ -737,24 +742,9 @@ exports.filter_specific_urls = function (urls, params)
 
   local limit = params.limit
   limit = limit - nres
-  if limit <= 0 then limit = 1 end
-
-  if neslds <= limit then
-    -- We can get urls based on their eslds
-    repeat
-      local item_found = false
-
-      for _,lurls in pairs(eslds) do
-        if #lurls > 0 then
-          local last = table.remove(lurls)
-          insert_url(last[1], last[2])
-          limit = limit - 1
-          item_found = true
-        end
-      end
-
-    until limit <= 0 or not item_found
+  if limit < 0 then limit = 0 end
 
+  if limit == 0 then
     res = exports.values(res)
     if params.task and not params.no_cache then
       params.task:cache_set(cache_key, res)
@@ -762,16 +752,49 @@ exports.filter_specific_urls = function (urls, params)
     return res
   end
 
-  if ntlds <= limit then
-    while limit > 0 do
-      for _,lurls in pairs(tlds) do
+  -- Sort eSLDs and tlds
+  local function sort_stuff(tbl)
+    -- Sort according to max priority
+    table.sort(tbl, function(e1, e2)
+      -- Sort by priority so max priority is at the end
+      table.sort(e1, function(tr1, tr2)
+        return tr1[3] < tr2[3]
+      end)
+      table.sort(e2, function(tr1, tr2)
+        return tr1[3] < tr2[3]
+      end)
+
+      if e1[#e1][3] ~= e2[#e2][3] then
+        -- Sort by priority so max priority is at the beginning
+        return e1[#e1][3] > e2[#e2][3]
+      else
+        -- Prefer less urls to more urls per esld
+        return #e1 < #e2
+      end
+
+    end)
+
+    return tbl
+  end
+
+  eslds = sort_stuff(exports.values(eslds))
+  neslds = #eslds
+
+  if neslds <= limit then
+    -- Number of eslds < limit
+    repeat
+      local item_found = false
+
+      for _,lurls in ipairs(eslds) do
         if #lurls > 0 then
           local last = table.remove(lurls)
           insert_url(last[1], last[2])
           limit = limit - 1
+          item_found = true
         end
       end
-    end
+
+    until limit <= 0 or not item_found
 
     res = exports.values(res)
     if params.task and not params.no_cache then
@@ -780,30 +803,18 @@ exports.filter_specific_urls = function (urls, params)
     return res
   end
 
-  -- We need to sort tlds table first
-  local tlds_keys = {}
-  for k,_ in pairs(tlds) do table.insert(tlds_keys, k) end
-  table.sort(tlds_keys, function (t1, t2)
-    return #tlds[t1] < #tlds[t2]
-  end)
-
-  ntlds = #tlds_keys
-  for i=1,ntlds / 2 do
-    local tld1 = tlds[tlds_keys[i]]
-    local tld2 = tlds[tlds_keys[ntlds - i]]
-    if #tld1 > 0 then
-      local last = table.remove(tld1)
-      insert_url(last[1], last[2])
-      limit = limit - 1
-    end
-    if #tld2 > 0 then
-      local last = table.remove(tld2)
-      insert_url(last[1], last[2])
-      limit = limit - 1
-    end
+  tlds = sort_stuff(exports.values(tlds))
+  ntlds = #tlds
 
-    if limit <= 0 then
-      break
+  -- Number of tlds < limit
+  while limit > 0 do
+    for _,lurls in ipairs(tlds) do
+      if #lurls > 0 then
+        local last = table.remove(lurls)
+        insert_url(last[1], last[2])
+        limit = limit - 1
+      end
+      if limit == 0 then break end
     end
   end
 
@@ -811,7 +822,6 @@ exports.filter_specific_urls = function (urls, params)
   if params.task and not params.no_cache then
     params.task:cache_set(cache_key, res)
   end
-
   return res
 end
 
diff --git a/test/lua/unit/lua_util.extract_specific_urls.lua b/test/lua/unit/lua_util.extract_specific_urls.lua
index 93816745e..c84a7ca8d 100644
--- a/test/lua/unit/lua_util.extract_specific_urls.lua
+++ b/test/lua/unit/lua_util.extract_specific_urls.lua
@@ -192,8 +192,8 @@ context("Lua util - extract_specific_urls", function()
 
     local actual = util.extract_specific_urls({
       task = task,
-      limit = 2,
-      esld_limit = 2,
+      limit = 1,
+      esld_limit = 1,
     })
 
     local actual_result = prepare_actual_result(actual)
@@ -202,7 +202,7 @@ context("Lua util - extract_specific_urls", function()
       local s = logger.slog("case[%1] %2 =?= %3", i, expect, actual_result)
       print(s) --]]
 
-    assert_equal("domain.com", actual_result[1], "checking that first url is the one with highest suspiciousness level")
+    assert_rspamd_table_eq({actual = actual_result, expect = {"domain.com"}})
 
   end)
 end)


More information about the Commits mailing list