commit e7bc102: [Project] Lua_content: Add preliminary fonts handling

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Jan 16 12:28:08 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-16 12:23:44 +0000
URL: https://github.com/rspamd/rspamd/commit/e7bc102cd470e7cb36335a65c33eaf76707295ea

[Project] Lua_content: Add preliminary fonts handling

---
 lualib/lua_content/pdf.lua | 127 ++++++++++++++++++++++++++-------------------
 1 file changed, 75 insertions(+), 52 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index da434b501..460938f8a 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -359,6 +359,58 @@ local function dereference_object(elt, pdf)
   return nil
 end
 
+-- Apply a single PDF stream filter to input; only FlateDecode is handled, anything else returns nil
+local function apply_pdf_filter(input, filt)
+  if filt == 'FlateDecode' then
+    return rspamd_util.inflate(input, config.max_extraction_size)
+  end
+
+  return nil
+end
+
+-- Apply dict.Filter (one name or a list) to data in order; data is returned as is without a Filter, nil on failure
+local function maybe_apply_filter(dict, data)
+  local uncompressed = data
+
+  if dict.Filter then
+    local filt = dict.Filter
+    if type(filt) == 'string' then
+      filt = {filt}
+    end
+
+    for _,f in ipairs(filt) do
+      uncompressed = apply_pdf_filter(uncompressed, f)
+
+      if not uncompressed then break end
+    end
+  end
+
+  return uncompressed
+end
+
+-- Extract stream data from obj when it has a stream, a Filter and a Length; result is stored in obj.uncompressed
+local function maybe_extract_object_stream(obj, pdf, task)
+  local dict = obj.dict
+  -- An object may carry a dict but no stream part: skip it instead of indexing a nil obj.stream
+  if obj.stream and dict.Filter and dict.Length then
+    local len = math.min(obj.stream.len,
+        tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
+    local real_stream = obj.stream.data:span(1, len)
+    local uncompressed = maybe_apply_filter(dict, real_stream)
+
+    if uncompressed then
+      obj.uncompressed = uncompressed
+      lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
+          obj.major, obj.minor, len, uncompressed:len())
+      return obj.uncompressed
+    else
+      lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
+          obj.major, obj.minor, len, dict.Filter)
+    end
+  end
+end
+
+
 local function parse_object_grammar(obj, task, pdf)
   -- Parse grammar
   local obj_dict_span
@@ -385,8 +437,8 @@ local function parse_object_grammar(obj, task, pdf)
           lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
               obj.major, obj.minor, obj_or_err)
         else
-          lua_util.debugm(N, task, 'direct object %s:%s cannot be parsed: %s',
-              obj.major, obj.minor, obj_dict_span)
+          obj.dict = {}
+          obj.uncompressed = obj_or_err
         end
       end
     else
@@ -399,6 +451,24 @@ local function parse_object_grammar(obj, task, pdf)
   end
 end
 
+-- Process a font (an object or a bare dict): dereference /ToUnicode and try to extract its cmap stream
+local function process_font(task, pdf, font, fname)
+  local dict = font
+  if font.dict then
+    dict = font.dict
+  end
+
+  if type(dict) == 'table' and dict.ToUnicode then
+    local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)
+
+    if cmap and cmap.dict then
+      maybe_extract_object_stream(cmap, pdf, task)
+      lua_util.debugm(N, task, 'found cmap for font %s: %s',
+          fname, cmap.uncompressed)
+    end
+  end
+end
+
 local function process_dict(task, pdf, obj, dict)
   if not obj.type and type(dict) == 'table' then
     if dict.Type and type(dict.Type) == 'string' then
@@ -481,8 +551,9 @@ local function process_dict(task, pdf, obj, dict)
 
         if obj.fonts[k] then
           local font = obj.fonts[k]
-          lua_util.debugm(N, task, 'found font for object %s:%s -> %s',
-              obj.major, obj.minor, font)
+          process_font(task, pdf, font, k)
+          lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
+              k, obj.major, obj.minor, font)
         end
       end
     end
@@ -520,54 +591,6 @@ local function process_dict(task, pdf, obj, dict)
   end
 end
 
-local function apply_pdf_filter(input, filt)
-  if filt == 'FlateDecode' then
-    return rspamd_util.inflate(input, config.max_extraction_size)
-  end
-
-  return nil
-end
-
-local function maybe_apply_filter(dict, data)
-  local uncompressed = data
-
-  if dict.Filter then
-    local filt = dict.Filter
-    if type(filt) == 'string' then
-      filt = {filt}
-    end
-
-    for _,f in ipairs(filt) do
-      uncompressed = apply_pdf_filter(uncompressed, f)
-
-      if not uncompressed then break end
-    end
-  end
-
-  return uncompressed
-end
-
-local function maybe_extract_object_stream(obj, pdf, task)
-  local dict = obj.dict
-  if dict.Filter and dict.Length then
-    local len = math.min(obj.stream.len,
-        tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
-    local real_stream = obj.stream.data:span(1, len)
-
-    local uncompressed = maybe_apply_filter(dict, real_stream)
-
-    if uncompressed then
-      obj.uncompressed = uncompressed
-      lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
-          obj.major, obj.minor, len, uncompressed:len())
-      return obj.uncompressed
-    else
-      lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
-          obj.major, obj.minor, len, dict.Filter)
-    end
-  end
-end
-
 -- This function is intended to unpack objects from ObjStm crappy structure
 local compound_obj_grammar
 local function compound_obj_grammar_gen()


More information about the Commits mailing list