commit 8ae435c: [Project] Lua_content: Implement JS extraction from PDF
Vsevolod Stakhov
vsevolod at highsecure.ru
Sat Jan 18 21:28:17 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-01-18 21:20:50 +0000
URL: https://github.com/rspamd/rspamd/commit/8ae435c92608b0adde09910c2e741c1dbf87a834 (HEAD -> master)
[Project] Lua_content: Implement JS extraction from PDF
---
lualib/lua_content/pdf.lua | 76 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 73 insertions(+), 3 deletions(-)
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 9b6ee2622..85f939869 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -89,6 +89,21 @@ local pdf_text_patterns = {
}
}
+-- Operators delimiting the `bfchar` and `bfrange` mapping sections of a
+-- /ToUnicode CMap; compiled into `pdf_cmap_trie` by compile_tries()
+local pdf_cmap_patterns = {
+  start = {
+    patterns = {
+      [[\d\s+beginbfchar\s]],
+      [[\d\s+beginbfrange\s]]
+    }
+  },
+  stop = {
+    patterns = {
+      [[\sendbfrange\b]],
+      -- the closing operator is `endbfchar`, mirroring `beginbfchar`
+      [[\sendbfchar\b]]
+    }
+  }
+}
+
-- index[n] ->
-- t[1] - pattern,
-- t[2] - key in patterns table,
@@ -96,9 +111,11 @@ local pdf_text_patterns = {
-- t[4] - local pattern index
local pdf_indexes = {}
local pdf_text_indexes = {}
+local pdf_cmap_indexes = {}
local pdf_trie
local pdf_text_trie
+local pdf_cmap_trie
local exports = {}
@@ -150,6 +167,9 @@ local function compile_tries()
if not pdf_text_trie then
pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
end
+ if not pdf_cmap_trie then
+ pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
+ end
end
-- Returns a table with generic grammar elements for PDF
@@ -313,6 +333,7 @@ local function gen_text_grammar()
}
end
+
-- Call immediately on require
compile_tries()
config_module()
@@ -392,7 +413,7 @@ end
-- Conditionally extract stream data from object and attach it as obj.uncompressed
local function maybe_extract_object_stream(obj, pdf, task)
local dict = obj.dict
- if dict.Filter and dict.Length then
+ if dict.Length then
local len = math.min(obj.stream.len,
tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
local real_stream = obj.stream.data:span(1, len)
@@ -453,6 +474,7 @@ local function parse_object_grammar(obj, task, pdf)
end
-- Extracts font data and process /ToUnicode mappings
+-- NYI (not yet implemented) in fact, as cmap is ridiculously stupid and complicated
local function process_font(task, pdf, font, fname)
local dict = font
if font.dict then
@@ -470,6 +492,48 @@ local function process_font(task, pdf, font, fname)
end
end
+-- Extract interesting stuff, e.g. javascript: resolves the /JS entry of an
+-- action object and appends the script to `pdf.scripts` (created on demand).
+-- @param task task used for logging and object dereferencing
+-- @param pdf PDF state table; `pdf.scripts` is modified in place
+-- @param obj action object; ignored unless `obj.dict.JS` is set
+local function process_action(task, pdf, obj)
+  if not (obj.dict and obj.dict.JS) then
+    return
+  end
+
+  local js = maybe_dereference_object(obj.dict.JS, pdf, task)
+  if not js then
+    lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
+        obj.major, obj.minor)
+    return
+  end
+
+  -- A table is treated as a stream object. The extractor indexes both `dict`
+  -- and `stream`, so call it only when both exist rather than raising on a
+  -- malformed reference; on failure `js` stays a table and is rejected below.
+  if type(js) == 'table' and type(js.dict) == 'table' and js.stream then
+    js = maybe_extract_object_stream(js, pdf, task) or js
+  end
+
+  local js_type = type(js)
+  if js_type ~= 'string' and js_type ~= 'userdata' then
+    -- Logged once, including the case when stream extraction failed
+    lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
+        obj.major, obj.minor, js)
+    return
+  end
+
+  lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
+      obj.major, obj.minor, js)
+  pdf.scripts = pdf.scripts or {}
+  -- Plain strings are wrapped with rspamd_text; userdata is stored unchanged
+  if js_type == 'string' then
+    js = rspamd_text.fromstring(js)
+  end
+  pdf.scripts[#pdf.scripts + 1] = js
+end
+
local function process_dict(task, pdf, obj, dict)
if not obj.type and type(dict) == 'table' then
if dict.Type and type(dict.Type) == 'string' then
@@ -499,6 +563,10 @@ local function process_dict(task, pdf, obj, dict)
end
end
+ if not obj.type then
+ return
+ end
+
lua_util.debugm(N, task, 'process stream dictionary for object %s:%s -> %s',
obj.major, obj.minor, obj.type)
local contents = dict.Contents
@@ -568,7 +636,7 @@ local function process_dict(task, pdf, obj, dict)
lua_util.debugm(N, task, 'found resources for object %s:%s: %s',
obj.major, obj.minor, obj.resources)
- if dict.Type == 'FontDescriptor' then
+ if obj.type == 'FontDescriptor' then
lua_util.debugm(N, task, "obj %s:%s is a font descriptor",
obj.major, obj.minor)
@@ -594,6 +662,8 @@ local function process_dict(task, pdf, obj, dict)
lua_util.debugm(N, task, "obj %s:%s is a font data stream",
stream_ref.major, stream_ref.minor)
end
+ elseif obj.type == 'Action' then
+ process_action(task, pdf, obj)
end
end
end
@@ -925,7 +995,7 @@ local function search_urls(task, pdf)
end
for _,obj in ipairs(pdf.objects) do
- if obj.dict then
+ if obj.dict and type(obj.dict) == 'table' then
recursive_object_traverse(obj, obj.dict, 0)
end
end
More information about the Commits mailing list