commit adff551: [Project] Lua_content: Massive rework of the parsing structure

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jan 13 21:56:09 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-13 19:59:48 +0000
URL: https://github.com/rspamd/rspamd/commit/adff55155bdc85fc69ca96d3222703b3d379929b

[Project] Lua_content: Massive rework of the parsing structure

---
 lualib/lua_content/pdf.lua | 191 +++++++++++++++++++++++++++------------------
 1 file changed, 115 insertions(+), 76 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index fa3aee501..d3d4b9d85 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -279,15 +279,17 @@ local function gen_text_grammar()
       C(gen_graphics_unary()) / empty
   local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
       gen_graphics_binary()
-  local ternary_ops = P("TD") + P("Td") + P("Tf") + gen_graphics_ternary()
+  local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
   local nary_op = P("Tm") + gen_graphics_nary()
   local text_binary_op = P("Tj") + P("TJ") + P("'")
   local text_quote_op = P('"')
+  local font_op = P("Tf")
 
   return lpeg.P{
     "EXPR";
     EXPR = gen.ws^0 * lpeg.Ct(V("COMMAND")^0),
-    COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") + gen.comment) * gen.ws^0,
+    COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
+        V("FONT") + gen.comment) * gen.ws^0,
     UNARY = unary_ops,
     BINARY = V("ARG") / empty * gen.ws^1 * binary_ops,
     TERNARY = V("ARG") / empty * gen.ws^1 * V("ARG") / empty * gen.ws^1 * ternary_ops,
@@ -301,6 +303,8 @@ local function gen_text_grammar()
     STRING = lpeg.P{gen.str + gen.hexstr},
     TEXT = (V("TEXT_ARG") * gen.ws^1 * text_binary_op) +
         (V("ARG") / 0 * gen.ws^1 * V("ARG") / 0 * gen.ws^1 * V("TEXT_ARG") * gen.ws^1 * text_quote_op),
+    FONT = (V("FONT_ARG") * gen.ws^1 * (gen.number / 0) * gen.ws^1 * font_op),
+    FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
     TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
     TEXT_ARRAY = "[" *
         lpeg.Ct(((gen.ws^0 * (gen.ws^0 * (gen.number / 0)^0 * gen.ws^0 * (gen.str + gen.hexstr)))^1)) * gen.ws^0 * "]",
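
The new FONT/FONT_ARG rules mirror the existing TEXT handling: the name operand of a "Tf" (select font) command is kept as a capture tagged with "%font%", while the size operand is discarded like the other graphic operands. A minimal standalone LPeg sketch of the same idea, using simplified stand-ins for the gen.* generators defined in pdf.lua:

    local lpeg = require 'lpeg'
    local P, R, S, C, Ct, Cc = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cc

    local ws = S(' \t\r\n')^1                                        -- stand-in for gen.ws
    local id = P('/') * C((R('az', 'AZ', '09') + S('._-'))^1)       -- stand-in for gen.id
    local number = S('+-')^-1 * R('09')^1 * (P('.') * R('09')^0)^-1 -- stand-in for gen.number

    local font_arg = Ct(Cc('%font%') * id)                    -- like FONT_ARG above
    local font = font_arg * ws * (number / 0) * ws * P('Tf')  -- like FONT above

    local res = font:match('/F1 12 Tf')                       -- res -> { '%font%', 'F1' }
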
@@ -475,15 +479,34 @@ local function process_dict(task, pdf, obj, dict)
   end
 end
 
--- Processes PDF objects: extracts streams, object numbers, process outer grammar,
--- augment object types
-local function postprocess_pdf_objects(task, input, pdf)
-  local start_pos, end_pos = 1, 1
+-- PDF 1.5 ObjStm (object streams)
+local function extract_pdf_compound_objects(task, pdf)
+  for _,obj in ipairs(pdf.objects or {}) do
+    if obj.stream and obj.dict and type(obj.dict) == 'table' then
+      local t = obj.dict.Type
+      if t and t == 'ObjStm' then
+        -- We are in trouble, sir...
+        local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
+        local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))
+
+        if nobjs and first then
+          local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
+          lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
+              nobjs, first, extend)
+        else
+          lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
+              obj.major, obj.minor, obj.dict)
+        end
+      end
+    end
+  end
+end
 
-  local objects = {}
+-- This function pairs up the starts and ends of all objects and processes them into
+-- the initial set of objects
+local function extract_outer_objects(task, input, pdf)
+  local start_pos, end_pos = 1, 1
   local obj_count = 0
-  pdf.ref = {} -- references table
-
   while start_pos <= #pdf.start_objects and end_pos <= #pdf.end_objects do
     local first = pdf.start_objects[start_pos]
     local last = pdf.end_objects[end_pos]
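
The new extract_pdf_compound_objects() above only detects PDF 1.5 object streams for now: an ObjStm is itself a stream object whose dictionary carries /Type /ObjStm, /N (how many objects are packed inside) and /First (byte offset of the first packed object in the decoded data), and whose decoded payload starts with N pairs of "object-number offset" integers (ISO 32000-1, 7.5.7). A hypothetical sketch, not part of this commit, of how those header pairs could be read once such a stream has been uncompressed:

    -- Hypothetical helper (assumed name); offsets in the pairs are relative to /First
    local function parse_objstm_header(uncompressed, nobjs, first)
      local offsets = {}
      for objnum, off in uncompressed:sub(1, first):gmatch('(%d+)%s+(%d+)') do
        offsets[#offsets + 1] = {
          major = tonumber(objnum),
          pos = first + tonumber(off) + 1, -- 1-based position of the packed object
        }
        if #offsets == nobjs then break end
      end
      return offsets
    end
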
@@ -504,14 +527,20 @@ local function postprocess_pdf_objects(task, input, pdf)
       local matches = object_re:search(obj_line_span, true, true)
 
       if matches and matches[1] then
-        objects[obj_count + 1] = {
+        local nobj = {
           start = first,
           len = len,
           data = input:span(first, len),
           major = tonumber(matches[1][2]),
           minor = tonumber(matches[1][3]),
         }
-
+        pdf.objects[obj_count + 1] = nobj
+        if nobj.major and nobj.minor then
+          -- Add reference
+          local ref = obj_ref(nobj.major, nobj.minor)
+          nobj.ref = ref -- Our internal reference
+          pdf.ref[ref] = nobj
+        end
       end
 
       obj_count = obj_count + 1
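
In the rewritten extract_outer_objects() above, every object with a valid major/minor pair is now registered immediately in pdf.ref, keyed by obj_ref() (defined elsewhere in pdf.lua), so that later passes can resolve indirect references such as "5 0 R" through maybe_dereference_object() without waiting for the grammar stage.
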
@@ -524,12 +553,51 @@ local function postprocess_pdf_objects(task, input, pdf)
       end_pos = end_pos + 1
     end
   end
+end
 
-  -- Now we have objects and we need to attach streams that are in bounds
+local function parse_object_grammar(obj, task, pdf)
+  -- Parse grammar
+  local obj_dict_span
+  if obj.stream then
+    obj_dict_span = obj.data:span(1, obj.stream.start - obj.start)
+  else
+    obj_dict_span = obj.data
+  end
+
+  if obj_dict_span:len() < config.max_processing_size then
+    local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
+
+    if ret then
+      if obj.stream then
+        obj.dict = obj_or_err
+        lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
+            obj.major, obj.minor, obj_or_err)
+      else
+        -- Direct object
+        pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
+        if type(obj_or_err) == 'table' then
+          obj.dict = obj_or_err
+        end
+        obj.uncompressed = obj_or_err
+        lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
+            obj.major, obj.minor, obj_or_err)
+      end
+    else
+      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
+          obj.major, obj.minor, obj_or_err)
+    end
+  else
+    lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
+        obj.major, obj.minor, obj_dict_span:len())
+  end
+end
+
+-- This function attaches streams that fall within an object's bounds to that object
+local function attach_pdf_streams(task, input, pdf)
   if pdf.start_streams and pdf.end_streams then
-    start_pos, end_pos = 1, 1
+    local start_pos, end_pos = 1, 1
 
-    for _,obj in ipairs(objects) do
+    for _,obj in ipairs(pdf.objects) do
       while start_pos <= #pdf.start_streams and end_pos <= #pdf.end_streams do
         local first = pdf.start_streams[start_pos]
         local last = pdf.end_streams[end_pos]
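
parse_object_grammar() keeps the defensive pattern of the old inline code: objects larger than config.max_processing_size are skipped, and the LPeg match runs under pcall so a malformed or deeply nested object produces a debug line rather than a runtime error. A standalone sketch of that pattern, with placeholder limit and grammar:

    local lpeg = require 'lpeg'
    local max_processing_size = 64 * 1024        -- placeholder for config.max_processing_size
    local grammar = lpeg.Ct(lpeg.C(lpeg.P(1)^0)) -- placeholder for pdf_outer_grammar

    local function safe_match(input)
      if #input >= max_processing_size then
        return nil, ('too large: %d bytes'):format(#input)
      end
      -- grammar.match is lpeg.match; pcall guards against errors raised while matching
      local ok, res = pcall(grammar.match, grammar, input)
      if not ok then
        return nil, res
      end
      return res
    end
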
@@ -574,50 +642,29 @@ local function postprocess_pdf_objects(task, input, pdf)
         lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
             obj.major, obj.minor, obj.start, obj.len)
       end
-      if obj.major and obj.minor then
-        -- Parse grammar
-        local obj_dict_span
-        if obj.stream then
-          obj_dict_span = obj.data:span(1, obj.stream.start - obj.start)
-        else
-          obj_dict_span = obj.data
-        end
+    end
+  end
+end
 
-        if obj_dict_span:len() < config.max_processing_size then
-          local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
+-- Processes PDF objects: extracts streams and object numbers, parses the outer grammar,
+-- and augments object types
+local function postprocess_pdf_objects(task, input, pdf)
+  pdf.objects = {} -- objects table
+  pdf.ref = {} -- references table
+  extract_outer_objects(task, input, pdf)
 
-          if ret then
-            if obj.stream then
-              obj.dict = obj_or_err
-              lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
-                  obj.major, obj.minor, obj_or_err)
-            else
-              -- Direct object
-              pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
-              if type(obj_or_err) == 'table' then
-                obj.dict = obj_or_err
-              end
-              obj.uncompressed = obj_or_err
-              lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
-                  obj.major, obj.minor, obj_or_err)
-            end
-          else
-            lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
-                obj.major, obj.minor, obj_or_err)
-          end
-        else
-          lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
-              obj.major, obj.minor, obj_dict_span:len())
-        end
-        pdf.ref[obj_ref(obj.major, obj.minor)] = obj
-      end
+  -- Now we have objects and we need to attach streams that are in bounds
+  attach_pdf_streams(task, input, pdf)
+  -- Parse grammar for outer objects
+  for _,obj in ipairs(pdf.objects) do
+    if obj.ref then
+      parse_object_grammar(obj, task, pdf)
     end
-
   end
+  extract_pdf_compound_objects(task, pdf)
 
-  pdf.objects = objects
-
-  for _,obj in ipairs(objects) do
+  -- By now all objects should have been processed
+  for _,obj in ipairs(pdf.objects) do
     if obj.dict then
       -- Types processing
       process_dict(task, pdf, obj, obj.dict)
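
After this refactoring postprocess_pdf_objects() is a thin driver: extract_outer_objects() finds object boundaries and seeds pdf.ref with the object tables, attach_pdf_streams() binds in-bounds streams to them, parse_object_grammar() then runs once per referenced object (for direct objects it replaces the pdf.ref entry with the parsed value, for stream objects it fills obj.dict), extract_pdf_compound_objects() looks for PDF 1.5 object streams, and process_dict() augments the object types as before.
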
@@ -652,30 +699,22 @@ local function maybe_apply_filter(dict, data)
   return uncompressed
 end
 
-local function extract_pdf_objects(task, pdf)
-  local function maybe_extract_object(obj)
-    local dict = obj.dict
-    if dict.Filter and dict.Length then
-      local len = math.min(obj.stream.len,
-          tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
-      local real_stream = obj.stream.data:span(1, len)
+local function maybe_extract_object_stream(obj, pdf, task)
+  local dict = obj.dict
+  if dict.Filter and dict.Length then
+    local len = math.min(obj.stream.len,
+        tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
+    local real_stream = obj.stream.data:span(1, len)
 
-      local uncompressed = maybe_apply_filter(dict, real_stream)
+    local uncompressed = maybe_apply_filter(dict, real_stream)
 
-      if uncompressed then
-        obj.uncompressed = uncompressed
-        lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
-            obj.major, obj.minor, len, uncompressed:len())
-      else
-        lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
-            obj.major, obj.minor, len, dict.Filter)
-      end
-    end
-  end
-
-  for _,obj in ipairs(pdf.objects or {}) do
-    if obj.stream and obj.dict and type(obj.dict) == 'table' and not obj.dict.ignore then
-      maybe_extract_object(obj)
+    if uncompressed then
+      obj.uncompressed = uncompressed
+      lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
+          obj.major, obj.minor, len, uncompressed:len())
+    else
+      lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
+          obj.major, obj.minor, len, dict.Filter)
     end
   end
 end
@@ -709,6 +748,7 @@ local function search_text(task, pdf)
     if obj.type == 'Page' and obj.contents then
       local text = {}
       for _,tobj in ipairs(obj.contents) do
+        maybe_extract_object_stream(tobj, pdf, task)
         local matches = pdf_text_trie:match(tobj.uncompressed or '')
         if matches then
           local text_blocks = {}
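
Together with the final hunk below, which drops the eager extract_pdf_objects() call from process_pdf(), this makes stream decompression lazy: search_text() now calls maybe_extract_object_stream() only for the content streams of Page objects it actually scans, instead of inflating every filtered stream in the document up front.
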
@@ -812,7 +852,6 @@ local function process_pdf(input, _, task)
     if pdf_output.start_objects and pdf_output.end_objects then
       -- Postprocess objects
       postprocess_pdf_objects(task, input, pdf_output)
-      extract_pdf_objects(task, pdf_output)
       search_text(task, pdf_output)
     else
       pdf_output.flags.no_objects = true

