commit ddbbe07: [Fix] More fixes to extract_specific_urls
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Aug 19 15:56:18 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-08-19 16:49:19 +0100
URL: https://github.com/rspamd/rspamd/commit/ddbbe07faf9e5f01f3b9c984a01551e82633af42
[Fix] More fixes to extract_specific_urls
---
lualib/lua_selectors/extractors.lua | 1 +
lualib/lua_util.lua | 55 +++++++++++++++++++++++++------------
2 files changed, 39 insertions(+), 17 deletions(-)
diff --git a/lualib/lua_selectors/extractors.lua b/lualib/lua_selectors/extractors.lua
index 993169708..1d06794a1 100644
--- a/lualib/lua_selectors/extractors.lua
+++ b/lualib/lua_selectors/extractors.lua
@@ -283,6 +283,7 @@ e.g. `get_tld`]],
['get_value'] = function(task, args)
local params = args[1] or {}
params.task = task
+ params.no_cache = true
local urls = lua_util.extract_specific_urls(params)
return urls,'userdata_list'
end,
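
Note on the extractor change: setting params.no_cache makes the selector bypass the per-task result cache inside lua_util.extract_specific_urls. The selector is invoked with caller-supplied parameters, so a result cached for one parameter set must not be replayed for a different one. A minimal sketch of the cache gate this flag short-circuits (the task object and cache key are illustrative stand-ins, not rspamd's real ones):

  -- sketch: how a no_cache flag gates a task-level cache
  local function extract(params)
    local cache_key = 'specific_urls' -- assumption: the real key is more specific
    if params.task and not params.no_cache then
      local cached = params.task:cache_get(cache_key)
      if cached then return cached end
    end
    local res = { 'http://example.com/' } -- placeholder for the real filtering
    if params.task and not params.no_cache then
      params.task:cache_set(cache_key, res)
    end
    return res
  end

  -- illustrative stub task with a table-backed cache
  local task = {
    cache = {},
    cache_get = function(self, k) return self.cache[k] end,
    cache_set = function(self, k, v) self.cache[k] = v end,
  }
  extract{ task = task }                  -- computes and caches
  extract{ task = task, no_cache = true } -- recomputes, touches no cache
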
diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua
index 9bc42fd62..cde09ad6a 100644
--- a/lualib/lua_util.lua
+++ b/lualib/lua_util.lua
@@ -668,6 +668,18 @@ exports.filter_specific_urls = function (urls, params)
local ntlds, neslds = 0, 0
local res = {}
+ local nres = 0
+
+ local function insert_url(str, u)
+ if not res[str] then
+ res[str] = u
+ nres = nres + 1
+
+ return true
+ end
+
+ return false
+ end
local function process_single_url(u)
local esld = u:get_tld()
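
The insert_url helper above turns res from a plain array into a set keyed by each URL's string form, so the same URL can no longer enter the result twice, while nres tracks the true count. A standalone sketch of the pattern (plain strings stand in for rspamd url objects):

  local res, nres = {}, 0

  local function insert_url(str, u)
    if not res[str] then
      res[str] = u
      nres = nres + 1
      return true
    end
    return false
  end

  print(insert_url('http://example.com/', 'u1')) --> true, inserted
  print(insert_url('http://example.com/', 'u2')) --> false, duplicate key ignored
  print(nres)                                    --> 1
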
@@ -682,13 +694,15 @@ exports.filter_specific_urls = function (urls, params)
end
end
+ local str_hash = tostring(u)
+
if esld then
if not eslds[esld] then
- eslds[esld] = {u}
+ eslds[esld] = {{str_hash, u}}
neslds = neslds + 1
else
if #eslds[esld] < params.esld_limit then
- table.insert(eslds[esld], u)
+ table.insert(eslds[esld], {str_hash, u})
end
end
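
Each eSLD bucket now stores {str_hash, u} pairs rather than bare url objects: the string key is computed once with tostring(u) and travels with the object, so the drain loops further down can feed insert_url without recomputing it. Sketch:

  local eslds = {}
  local u = 'http://a.example.com/' -- stand-in for a url object
  local str_hash = tostring(u)
  eslds['example.com'] = { { str_hash, u } }

  local last = table.remove(eslds['example.com'])
  print(last[1]) --> the precomputed key, ready to pass to insert_url
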
@@ -696,21 +710,21 @@ exports.filter_specific_urls = function (urls, params)
local tld = table.concat(fun.totable(fun.tail(parts)), '.')
if not tlds[tld] then
- tlds[tld] = {u}
+ tlds[tld] = {{str_hash, u}}
ntlds = ntlds + 1
else
- table.insert(tlds[tld], u)
+ table.insert(tlds[tld], {str_hash, u})
end
- -- Extract priority urls that are proven to be malicious
- if not u:is_html_displayed() then
+ -- Special cases
+ if u:get_protocol() ~= 'mailto' and not u:is_html_displayed() then
if u:is_obscured() then
- table.insert(res, u)
+ insert_url(str_hash, u)
else
if u:get_user() then
- table.insert(res, u)
+ insert_url(str_hash, u)
elseif u:is_subject() or u:is_phished() then
- table.insert(res, u)
+ insert_url(str_hash, u)
end
end
end
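
The mailto condition uses ~= deliberately: in Lua, not binds tighter than ==, so the superficially similar spelling not u:get_protocol() == 'mailto' would evaluate as (not u:get_protocol()) == 'mailto', compare a boolean against a string, and always yield false, silently disabling the whole special-case branch. A quick demonstration:

  local function get_protocol() return 'http' end

  print(not get_protocol() == 'mailto') --> false: parses as (not get_protocol()) == 'mailto'
  print(get_protocol() ~= 'mailto')     --> true: the intended test
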
@@ -722,7 +736,7 @@ exports.filter_specific_urls = function (urls, params)
end
local limit = params.limit
- limit = limit - #res
+ limit = limit - nres
if limit <= 0 then limit = 1 end
if neslds <= limit then
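
Switching from #res to nres above is required, not cosmetic: once res is keyed by URL strings, Lua's length operator no longer counts its entries, since # only measures the array part of a table. Illustration:

  local res = {}
  res['http://example.com/'] = true
  print(#res) --> 0, despite the table holding one entry
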
@@ -732,7 +746,8 @@ exports.filter_specific_urls = function (urls, params)
for _,lurls in pairs(eslds) do
if #lurls > 0 then
- table.insert(res, table.remove(lurls))
+ local last = table.remove(lurls)
+ insert_url(last[1], last[2])
limit = limit - 1
item_found = true
end
@@ -740,8 +755,9 @@ exports.filter_specific_urls = function (urls, params)
until limit <= 0 or not item_found
+ res = exports.values(res)
if params.task and not params.no_cache then
- params.task:cache_set(cache_key, urls)
+ params.task:cache_set(cache_key, res)
end
return res
end
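
Two things happen at each return site from here on: exports.values flattens the string-keyed res back into the array callers expect, and cache_set now stores that filtered result instead of the raw urls input, so a later cache hit returns the filtered set rather than every URL extracted from the task. The values helper is assumed here to have the usual collect-the-values semantics:

  -- assumed semantics of the values() helper used above
  local function values(tbl)
    local out = {}
    for _, v in pairs(tbl) do
      out[#out + 1] = v
    end
    return out
  end

  local res = { ['http://a.example/'] = 'u1', ['http://b.example/'] = 'u2' }
  print(#values(res)) --> 2
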
@@ -750,14 +766,16 @@ exports.filter_specific_urls = function (urls, params)
while limit > 0 do
for _,lurls in pairs(tlds) do
if #lurls > 0 then
- table.insert(res, table.remove(lurls))
+ local last = table.remove(lurls)
+ insert_url(last[1], last[2])
limit = limit - 1
end
end
end
+ res = exports.values(res)
if params.task and not params.no_cache then
- params.task:cache_set(cache_key, urls)
+ params.task:cache_set(cache_key, res)
end
return res
end
@@ -774,11 +792,13 @@ exports.filter_specific_urls = function (urls, params)
local tld1 = tlds[tlds_keys[i]]
local tld2 = tlds[tlds_keys[ntlds - i]]
if #tld1 > 0 then
- table.insert(res, table.remove(tld1))
+ local last = table.remove(tld1)
+ insert_url(last[1], last[2])
limit = limit - 1
end
if #tld2 > 0 then
- table.insert(res, table.remove(tld2))
+ local last = table.remove(tld2)
+ insert_url(last[1], last[2])
limit = limit - 1
end
@@ -787,8 +807,9 @@ exports.filter_specific_urls = function (urls, params)
end
end
+ res = exports.values(res)
if params.task and not params.no_cache then
- params.task:cache_set(cache_key, urls)
+ params.task:cache_set(cache_key, res)
end
return res
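
Seen from the caller's side, the entry point is unchanged; results are simply deduplicated and the cache now holds what the function actually returns. A hedged usage sketch (task, limit, esld_limit and no_cache are the params handled in this diff; the callback context and symbol wiring are illustrative):

  local lua_util = require "lua_util"

  -- inside a symbol callback, where task is the rspamd task object
  local urls = lua_util.extract_specific_urls{
    task = task,
    limit = 5,        -- overall cap on returned URLs
    esld_limit = 2,   -- per-eSLD cap, as enforced above
    no_cache = true,  -- skip the per-task cache, as the selector now does
  }

  for _, u in ipairs(urls) do
    print(u:get_host())
  end
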