commit 6b84660: [Project] Lua_content: Attach text to page objects via contents

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jan 7 18:49:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-07 18:10:53 +0000
URL: https://github.com/rspamd/rspamd/commit/6b8466002ff771897b780111e3b33fdd2bddf4e4 (HEAD -> master)

[Project] Lua_content: Attach text to page objects via contents

---
 lualib/lua_content/pdf.lua | 121 ++++++++++++++++++++++++++++-----------------
 1 file changed, 76 insertions(+), 45 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 2ded30045..f29e1e781 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -355,21 +355,30 @@ local function dereference_object(elt, pdf)
 end
 
 local function process_dict(task, pdf, obj, dict)
-  if type(dict) == 'table' then
+  if not obj.type and type(dict) == 'table' then
     if dict.Type and type(dict.Type) == 'string' then
       -- Common stuff
       obj.type = dict.Type
     else
-      -- Fucking pdf, we need to guess a type...
+      -- Fucking pdf, we need to guess a type (or ignore that crap)...
       lua_util.debugm(N, task, 'no explicit type for %s:%s',
           obj.major, obj.minor)
       if dict.Parent then
         -- Guess by parent
         local parent = dereference_object(dict.Parent, pdf)
 
-        if parent then
-          lua_util.debugm(N, task, 'guess type for %s:%s from parent %s:%s',
-              obj.major, obj.minor, parent.major, parent.minor)
+        if parent and parent.type then
+          if parent.type == 'Catalog' then
+            obj.type = 'Pages'
+          elseif parent.type == 'Pages' then
+            obj.type = 'Page'
+          end
+
+          if obj.type then
+            lua_util.debugm(N, task, 'guessed type for %s:%s (%s) from parent %s:%s (%s)',
+                obj.major, obj.minor, obj.type, parent.major, parent.minor, parent.type)
+
+          end
         end
       end
     end
@@ -384,9 +393,18 @@ local function process_dict(task, pdf, obj, dict)
       obj.contents = {}
 
       for _,c in ipairs(contents) do
-        obj.contents[#obj.contents + 1] = maybe_dereference_object(c, pdf, task)
+        local cobj = maybe_dereference_object(c, pdf, task)
+        if cobj then
+          obj.contents[#obj.contents + 1] = cobj
+          cobj.parent = obj
+          cobj.type = 'content'
+        end
       end
+
+      lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
+          obj.major, obj.minor, #obj.contents)
     end
+
     local resources = dict.Resources
     if resources and type(resources) == 'table' then
       obj.resources = resources
@@ -423,6 +441,9 @@ local function process_dict(task, pdf, obj, dict)
       end
     end
 
+    lua_util.debugm(N, task, 'found resources for object %s:%s: %s',
+        obj.major, obj.minor, obj.resources)
+
     if dict.Type == 'FontDescriptor' then
 
       lua_util.debugm(N, task, "obj %s:%s is a font descriptor",
@@ -639,8 +660,8 @@ local function extract_pdf_objects(task, pdf)
 
       if uncompressed then
         obj.uncompressed = uncompressed
-        lua_util.debugm(N, task, 'extracted object %s:%s: %s (%s -> %s)',
-            obj.major, obj.minor, uncompressed, len, uncompressed:len())
+        lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
+            obj.major, obj.minor, len, uncompressed:len())
       else
         lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
             obj.major, obj.minor, len, dict.Filter)
@@ -681,52 +702,62 @@ end
 
 local function search_text(task, pdf)
   for _,obj in ipairs(pdf.objects) do
-    if obj.uncompressed then
-      local matches = pdf_text_trie:match(obj.uncompressed)
-      if matches then
-        local text_blocks = {}
-        local starts = {}
-        local ends = {}
-
-        for npat,matched_positions in pairs(matches) do
-          if npat == 1 then
-            for _,pos in ipairs(matched_positions) do
-              starts[#starts + 1] = pos
-            end
-          else
-            for _,pos in ipairs(matched_positions) do
-              ends[#ends + 1] = pos
+    if obj.type == 'Page' and obj.contents then
+      local text = {}
+      for _,tobj in ipairs(obj.contents) do
+        local matches = pdf_text_trie:match(tobj.uncompressed or '')
+        if matches then
+          local text_blocks = {}
+          local starts = {}
+          local ends = {}
+
+          for npat,matched_positions in pairs(matches) do
+            if npat == 1 then
+              for _,pos in ipairs(matched_positions) do
+                starts[#starts + 1] = pos
+              end
+            else
+              for _,pos in ipairs(matched_positions) do
+                ends[#ends + 1] = pos
+              end
             end
           end
-        end
-
-        offsets_to_blocks(starts, ends, text_blocks)
-        for _,bl in ipairs(text_blocks) do
-          if bl.len > 2 then
-            -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
-            bl.len = bl.len - 2
-          end
 
-          bl.data = obj.uncompressed:span(bl.start, bl.len)
-          lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
-              obj.major, obj.minor, bl.data)
+          offsets_to_blocks(starts, ends, text_blocks)
+          for _,bl in ipairs(text_blocks) do
+            if bl.len > 2 then
+              -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
+              bl.len = bl.len - 2
+            end
 
-          if bl.len < config.max_processing_size then
-            local ret,obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
-              bl.data)
+            bl.data = tobj.uncompressed:span(bl.start, bl.len)
+            --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
+            --    tobj.major, tobj.minor, bl.data)
+
+            if bl.len < config.max_processing_size then
+              local ret,obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
+                  bl.data)
+
+              if ret then
+                text[#text + 1] = obj_or_err
+                lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
+                    obj_or_err, tobj.major, tobj.minor, obj.major, obj.minor)
+              else
+                lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
+                    obj.major, obj.minor, obj_or_err)
+              end
 
-            if ret then
-              obj.text = rspamd_text.fromtable(obj_or_err)
-              lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
-                  obj.major, obj.minor, obj.text)
-            else
-              lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
-                  obj.major, obj.minor, obj_or_err)
             end
-
           end
         end
       end
+
+      -- Join all text data together
+      if #text > 0 then
+        obj.text = rspamd_text.fromtable(text)
+        lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
+            obj.major, obj.minor, obj.text)
+      end
     end
   end
 end


More information about the Commits mailing list