commit 8ede2e0: [Project] Lua_content: More work to process pdf objects

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jan 7 16:49:07 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-07 16:46:24 +0000
URL: https://github.com/rspamd/rspamd/commit/8ede2e032a99b83e31f5486bcac46857d5625660 (HEAD -> master)

[Project] Lua_content: More work to process pdf objects

---
 lualib/lua_content/pdf.lua | 104 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 92 insertions(+), 12 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 88c14e9bf..2ded30045 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -323,13 +323,17 @@ local function obj_ref(major, minor)
 end
 
 -- Return indirect object reference (if needed)
-local function maybe_dereference_object(elt, pdf)
+local function maybe_dereference_object(elt, pdf, task)
   if type(elt) == 'table' and elt[1] == '%REF%' then
     local ref = obj_ref(elt[2], elt[3])
 
     if pdf.ref[ref] then
       -- No recursion!
-      return pdf.ref[ref].dict
+      return pdf.ref[ref]
+    else
+      lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s',
+          elt[2], elt[3], obj_ref(elt[2], elt[3]))
+      return nil
     end
   end
 
@@ -351,10 +355,75 @@ local function dereference_object(elt, pdf)
 end
 
 local function process_dict(task, pdf, obj, dict)
-  if type(dict) == 'table' and dict.Type then
+  if type(dict) == 'table' then
+    if dict.Type and type(dict.Type) == 'string' then
+      -- Common stuff
+      obj.type = dict.Type
+    else
+      -- Fucking pdf, we need to guess a type...
+      lua_util.debugm(N, task, 'no explicit type for %s:%s',
+          obj.major, obj.minor)
+      if dict.Parent then
+        -- Guess by parent
+        local parent = dereference_object(dict.Parent, pdf)
+
+        if parent then
+          lua_util.debugm(N, task, 'guess type for %s:%s from parent %s:%s',
+              obj.major, obj.minor, parent.major, parent.minor)
+        end
+      end
+    end
+
+    lua_util.debugm(N, task, 'process stream dictionary for object %s:%s -> %s',
+        obj.major, obj.minor, obj.type)
+    local contents = dict.Contents
+    if contents then
+      if type(contents) == 'table' and contents[1] == '%REF%' then
+        contents = {contents}
+      end
+      obj.contents = {}
+
+      for _,c in ipairs(contents) do
+        obj.contents[#obj.contents + 1] = maybe_dereference_object(c, pdf, task)
+      end
+    end
+    local resources = dict.Resources
+    if resources and type(resources) == 'table' then
+      obj.resources = resources
+    else
+      -- Fucking pdf: we need to inherit from parent
+      resources = {}
+      if dict.Parent then
+        local parent = maybe_dereference_object(dict.Parent, pdf, task)
+
+        if parent and type(parent) == 'table' and parent.dict then
+          if parent.resources then
+            lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
+                parent.major, parent.minor, obj.major, obj.minor)
+            resources = parent.resources
+          end
+        end
+      end
+
+      obj.resources = resources
+    end
+
+    local fonts = obj.resources.Font
+
+    if fonts and type(fonts) == 'table' then
+      obj.fonts = {}
+      for k,v in pairs(fonts) do
+        obj.fonts[k] = maybe_dereference_object(v, pdf, task)
+
+        if obj.fonts[k] then
+          local font = obj.fonts[k]
+          lua_util.debugm(N, task, 'found font for object %s:%s -> %s',
+              obj.major, obj.minor, font)
+        end
+      end
+    end
+
     if dict.Type == 'FontDescriptor' then
-      obj.type = 'font'
-      obj.ignore = true
 
       lua_util.debugm(N, task, "obj %s:%s is a font descriptor",
          obj.major, obj.minor)
@@ -489,13 +558,24 @@ local function postprocess_pdf_objects(task, input, pdf)
           obj_dict_span = obj.data
         end
 
-        if obj_dict_span:len() < 1024 * 128 then
+        if obj_dict_span:len() < config.max_processing_size then
           local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
 
           if ret then
-            obj.dict = obj_or_err
-            lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
-                obj.major, obj.minor, obj_or_err)
+            if obj.stream then
+              obj.dict = obj_or_err
+              lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
+                  obj.major, obj.minor, obj_or_err)
+            else
+              -- Direct object
+              pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
+              if type(obj_or_err) == 'table' then
+                obj.dict = obj_or_err
+              end
+              obj.uncompressed = obj_or_err
+              lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
+                  obj.major, obj.minor, obj_or_err)
+            end
           else
             lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
                 obj.major, obj.minor, obj_or_err)
@@ -510,14 +590,14 @@ local function postprocess_pdf_objects(task, input, pdf)
 
   end
 
+  pdf.objects = objects
+
   for _,obj in ipairs(objects) do
     if obj.dict then
       -- Types processing
       process_dict(task, pdf, obj, obj.dict)
     end
   end
-
-  pdf.objects = objects
 end
 
 local function apply_pdf_filter(input, filt)
@@ -552,7 +632,7 @@ local function extract_pdf_objects(task, pdf)
     local dict = obj.dict
     if dict.Filter and dict.Length then
       local len = math.min(obj.stream.len,
-          tonumber(maybe_dereference_object(dict.Length, pdf)) or 0)
+          tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
       local real_stream = obj.stream.data:span(1, len)
 
       local uncompressed = maybe_apply_filter(dict, real_stream)


More information about the Commits mailing list