commit 007a4ab: [Feature] Split pdf processing object and output object to allow GC
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Aug 18 10:49:06 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-08-18 11:45:59 +0100
URL: https://github.com/rspamd/rspamd/commit/007a4abff494ecfb9f80de1f6ac9505644968a0e (HEAD -> master)
[Feature] Split pdf processing object and output object to allow GC
---
lualib/lua_content/pdf.lua | 77 ++++++++++++++++++++++++++--------------------
1 file changed, 43 insertions(+), 34 deletions(-)
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index fd3317bac..53b980f1a 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -1181,12 +1181,15 @@ local function process_pdf(input, mpart, task)
if matches then
local start_ts = rspamd_util.get_ticks()
- local pdf_output = {
+ -- Temp object used to share data between pdf extraction methods
+ local pdf_object = {
tag = 'pdf',
extract_text = extract_text_data,
start_timestamp = start_ts,
end_timestamp = start_ts + config.pdf_process_timeout,
}
+ -- Output object that excludes all internal stuff
+ local pdf_output = lua_util.shallowcopy(pdf_object)
local grouped_processors = {}
for npat,matched_positions in pairs(matches) do
local index = pdf_indexes[npat]
@@ -1211,43 +1214,43 @@ local function process_pdf(input, mpart, task)
lua_util.debugm(N, task, "pdf: process group %s with %s matches",
name, #processor.offsets)
table.sort(processor.offsets, function(e1, e2) return e1[1] < e2[1] end)
- processor.processor_func(input, task, processor.offsets, pdf_output)
+ processor.processor_func(input, task, processor.offsets, pdf_object, pdf_output)
end
pdf_output.flags = {}
- if pdf_output.start_objects and pdf_output.end_objects then
- if #pdf_output.start_objects > config.max_pdf_objects then
- pdf_output.many_objects = #pdf_output.start_objects
+ if pdf_object.start_objects and pdf_object.end_objects then
+ if #pdf_object.start_objects > config.max_pdf_objects then
+ pdf_output.many_objects = #pdf_object.start_objects
-- Trim
end
-- Postprocess objects
- postprocess_pdf_objects(task, input, pdf_output)
+ postprocess_pdf_objects(task, input, pdf_object)
if config.text_extraction then
- search_text(task, pdf_output)
+ search_text(task, pdf_object, pdf_output)
end
if config.url_extraction then
- search_urls(task, pdf_output, mpart)
+ search_urls(task, pdf_object, mpart, pdf_output)
end
- if config.js_fuzzy and pdf_output.scripts then
+ if config.js_fuzzy and pdf_object.scripts then
pdf_output.fuzzy_hashes = {}
if config.openaction_fuzzy_only then
-- OpenAction only
- if pdf_output.openaction and pdf_output.openaction.bin_hash then
- if config.min_js_fuzzy and #pdf_output.openaction.data >= config.min_js_fuzzy then
+ if pdf_object.openaction and pdf_object.openaction.bin_hash then
+ if config.min_js_fuzzy and #pdf_object.openaction.data >= config.min_js_fuzzy then
lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
- pdf_output.openaction.hash)
- table.insert(pdf_output.fuzzy_hashes, pdf_output.openaction.bin_hash)
+ pdf_object.openaction.hash)
+ table.insert(pdf_output.fuzzy_hashes, pdf_object.openaction.bin_hash)
else
lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
- pdf_output.openaction.hash, #pdf_output.openaction.data)
+ pdf_object.openaction.hash, #pdf_object.openaction.data)
end
end
else
-- All hashes
- for h,sc in pairs(pdf_output.scripts) do
+ for h,sc in pairs(pdf_object.scripts) do
if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then
lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
sc.hash)
@@ -1264,19 +1267,24 @@ local function process_pdf(input, mpart, task)
pdf_output.flags.no_objects = true
end
+ -- Propagate from object to output
+ if pdf_object.encrypted then
+ pdf_output.encrypted = true
+ end
+
return pdf_output
end
end
-- Processes the PDF trailer
-processors.trailer = function(input, task, positions, output)
+processors.trailer = function(input, task, positions, pdf_object, pdf_output)
local last_pos = positions[#positions]
lua_util.debugm(N, task, 'pdf: process trailer at position %s (%s total length)',
last_pos, #input)
if last_pos[1] > config.max_pdf_trailer then
- output.long_trailer = #input - last_pos[1]
+ pdf_output.long_trailer = #input - last_pos[1]
return
end
@@ -1286,20 +1294,21 @@ processors.trailer = function(input, task, positions, output)
if line:find('/Encrypt ') then
lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
line)
- output.encrypted = true
+ pdf_output.encrypted = true
+ pdf_object.encrypted = true
break
end
lines_checked = lines_checked + 1
if lines_checked > config.max_pdf_trailer_lines then
lua_util.debugm(N, task, "pdf: trailer has too many lines, stop checking")
- output.long_trailer = #input - last_pos[1]
+ pdf_output.long_trailer = #input - last_pos[1]
break
end
end
end
-processors.suspicious = function(input, task, positions, output)
+processors.suspicious = function(input, task, positions, pdf_object, pdf_output)
local suspicious_factor = 0.0
local nexec = 0
local nencoded = 0
@@ -1343,33 +1352,33 @@ processors.suspicious = function(input, task, positions, output)
suspicious_factor = 1.0
end
- output.suspicious = suspicious_factor
+ pdf_output.suspicious = suspicious_factor
end
-local function generic_table_inserter(positions, output, output_key)
- if not output[output_key] then
- output[output_key] = {}
+local function generic_table_inserter(positions, pdf_object, output_key)
+ if not pdf_object[output_key] then
+ pdf_object[output_key] = {}
end
- local shift = #output[output_key]
+ local shift = #pdf_object[output_key]
for i,pos in ipairs(positions) do
- output[output_key][i + shift] = pos[1]
+ pdf_object[output_key][i + shift] = pos[1]
end
end
-processors.start_object = function(_, task, positions, output)
- generic_table_inserter(positions, output, 'start_objects')
+processors.start_object = function(_, task, positions, pdf_object)
+ generic_table_inserter(positions, pdf_object, 'start_objects')
end
-processors.end_object = function(_, task, positions, output)
- generic_table_inserter(positions, output, 'end_objects')
+processors.end_object = function(_, task, positions, pdf_object)
+ generic_table_inserter(positions, pdf_object, 'end_objects')
end
-processors.start_stream = function(_, task, positions, output)
- generic_table_inserter(positions, output, 'start_streams')
+processors.start_stream = function(_, task, positions, pdf_object)
+ generic_table_inserter(positions, pdf_object, 'start_streams')
end
-processors.end_stream = function(_, task, positions, output)
- generic_table_inserter(positions, output, 'end_streams')
+processors.end_stream = function(_, task, positions, pdf_object)
+ generic_table_inserter(positions, pdf_object, 'end_streams')
end
exports.process = process_pdf
More information about the Commits
mailing list