commit 8ae435c: [Project] Lua_content: Implement JS extraction from PDF

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Jan 18 21:28:17 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-18 21:20:50 +0000
URL: https://github.com/rspamd/rspamd/commit/8ae435c92608b0adde09910c2e741c1dbf87a834 (HEAD -> master)

[Project] Lua_content: Implement JS extraction from PDF

---
 lualib/lua_content/pdf.lua | 76 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 3 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 9b6ee2622..85f939869 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -89,6 +89,21 @@ local pdf_text_patterns = {
   }
 }
 
+local pdf_cmap_patterns = { -- markers delimiting bfchar/bfrange sections
+  start = {
+    patterns = {
+      [[\d\s+beginbfchar\s]],
+      [[\d\s+beginbfrange\s]]
+    }
+  },
+  stop = {
+    patterns = {
+      [[\sendbfrange\b]],
+      [[\sendbfchar\b]] -- counterpart of beginbfchar
+    }
+  }
+}
+
 -- index[n] ->
 --  t[1] - pattern,
 --  t[2] - key in patterns table,
@@ -96,9 +111,11 @@ local pdf_text_patterns = {
 --  t[4] - local pattern index
 local pdf_indexes = {}
 local pdf_text_indexes = {}
+local pdf_cmap_indexes = {}
 
 local pdf_trie
 local pdf_text_trie
+local pdf_cmap_trie
 
 local exports = {}
 
@@ -150,6 +167,9 @@ local function compile_tries()
   if not pdf_text_trie then
     pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
   end
+  if not pdf_cmap_trie then
+    pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
+  end
 end
 
 -- Returns a table with generic grammar elements for PDF
@@ -313,6 +333,7 @@ local function gen_text_grammar()
   }
 end
 
+
 -- Call immediately on require
 compile_tries()
 config_module()
@@ -392,7 +413,7 @@ end
 -- Conditionally extract stream data from object and attach it as obj.uncompressed
 local function maybe_extract_object_stream(obj, pdf, task)
   local dict = obj.dict
-  if dict.Filter and dict.Length then
+  if dict.Length then
     local len = math.min(obj.stream.len,
         tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
     local real_stream = obj.stream.data:span(1, len)
@@ -453,6 +474,7 @@ local function parse_object_grammar(obj, task, pdf)
 end
 
 -- Extracts font data and process /ToUnicode mappings
+-- NYI (not yet implemented) in fact, as cmap is ridiculously stupid and complicated
 local function process_font(task, pdf, font, fname)
   local dict = font
   if font.dict then
@@ -470,6 +492,48 @@ local function process_font(task, pdf, font, fname)
   end
 end
 
+-- Extracts JavaScript referenced by the /JS key of an action object and
+-- appends it to pdf.scripts (the list is created on first use).
+-- Does nothing when the object has no dictionary or no JS entry.
+-- @param task task passed to the logger and to the dereference/extract helpers
+-- @param pdf pdf state table; pdf.scripts receives the extracted scripts
+-- @param obj action object; obj.dict.JS is dereferenced to get the script
+local function process_action(task, pdf, obj)
+  if not (obj.dict and obj.dict.JS) then
+    return
+  end
+
+  local js = maybe_dereference_object(obj.dict.JS, pdf, task)
+  if not js then
+    lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
+        obj.major, obj.minor)
+    return
+  end
+
+  -- A table is treated as a stream object: only try to extract when it has
+  -- both a dictionary and stream data, as the extractor indexes both
+  if type(js) == 'table' and js.dict and js.stream then
+    js = maybe_extract_object_stream(js, pdf, task) or js
+  end
+
+  local js_type = type(js)
+  if js_type ~= 'string' and js_type ~= 'userdata' then
+    -- Logged once, including the case where stream extraction failed
+    lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
+        obj.major, obj.minor, js)
+    return
+  end
+
+  lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
+      obj.major, obj.minor, js)
+  if js_type == 'string' then
+    -- Plain strings are converted to text objects; userdata is stored as is
+    js = rspamd_text.fromstring(js)
+  end
+  pdf.scripts = pdf.scripts or {}
+  pdf.scripts[#pdf.scripts + 1] = js
+end
+
 local function process_dict(task, pdf, obj, dict)
   if not obj.type and type(dict) == 'table' then
     if dict.Type and type(dict.Type) == 'string' then
@@ -499,6 +563,10 @@ local function process_dict(task, pdf, obj, dict)
       end
     end
 
+    if not obj.type then
+      return
+    end
+
     lua_util.debugm(N, task, 'process stream dictionary for object %s:%s -> %s',
         obj.major, obj.minor, obj.type)
     local contents = dict.Contents
@@ -568,7 +636,7 @@ local function process_dict(task, pdf, obj, dict)
     lua_util.debugm(N, task, 'found resources for object %s:%s: %s',
         obj.major, obj.minor, obj.resources)
 
-    if dict.Type == 'FontDescriptor' then
+    if obj.type == 'FontDescriptor' then
 
       lua_util.debugm(N, task, "obj %s:%s is a font descriptor",
          obj.major, obj.minor)
@@ -594,6 +662,8 @@ local function process_dict(task, pdf, obj, dict)
         lua_util.debugm(N, task, "obj %s:%s is a font data stream",
             stream_ref.major, stream_ref.minor)
       end
+    elseif obj.type == 'Action' then
+      process_action(task, pdf, obj)
     end
   end
 end
@@ -925,7 +995,7 @@ local function search_urls(task, pdf)
   end
 
   for _,obj in ipairs(pdf.objects) do
-    if obj.dict then
+    if obj.dict and type(obj.dict) == 'table' then
       recursive_object_traverse(obj, obj.dict, 0)
     end
   end


More information about the Commits mailing list