commit a15303b: [Project] Lua_content: Improve JS extraction logic

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jan 21 12:49:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-21 12:30:56 +0000
URL: https://github.com/rspamd/rspamd/commit/a15303b3ed20d8bf14b861042fa53a153acf181a

[Project] Lua_content: Improve JS extraction logic

---
 lualib/lua_content/pdf.lua | 152 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 116 insertions(+), 36 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 4c6039d06..4fe1736e0 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -44,7 +44,7 @@ local pdf_patterns = {
   },
   start_object = {
     patterns = {
-      [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\r\n<]]=]
+      [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=]
     }
   },
   end_object = {
@@ -503,9 +503,40 @@ end
 -- Forward declaration
 local process_dict
 
+-- This function processes javascript string and returns JS hash and JS rspamd_text
+local function process_javascript(task, pdf, js)
+  local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"
+  if type(js) == 'string' then
+    js = rspamd_text.fromstring(js):exclude_chars('%n%c')
+  elseif type(js) == 'userdata' then
+    js = js:exclude_chars('%n%c')
+  else
+    return nil
+  end
+
+  local hash = rspamd_cryptobox_hash.create(js)
+  local bin_hash = hash:bin()
+
+  if not pdf.scripts then
+    pdf.scripts = {}
+  end
+
+  if pdf.scripts[bin_hash] then
+    -- Duplicate
+    return pdf.scripts[bin_hash]
+  end
+
+  local njs = {
+    data = js,
+    hash = rspamd_util.encode_base32(bin_hash),
+  }
+  pdf.scripts[bin_hash] = njs
+  return njs
+end
+
 -- Extract interesting stuff from /Action, e.g. javascript
 local function process_action(task, pdf, obj)
-  if not obj.js and (obj.dict and obj.dict.JS) then
+  if not (obj.js or obj.launch) and (obj.dict and obj.dict.JS) then
     local js = maybe_dereference_object(obj.dict.JS, pdf, task)
 
     if js then
@@ -520,27 +551,34 @@ local function process_action(task, pdf, obj)
         end
       end
 
-      if type(js) == 'string' then
-        if not pdf.scripts then
-          pdf.scripts = {}
-        end
-        obj.js = rspamd_text.fromstring(js):exclude_chars('%n%c')
-        pdf.scripts[#pdf.scripts + 1] = obj.js
-        lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
-            obj.major, obj.minor, obj.js)
-      elseif type(js) == 'userdata' then
-        if not pdf.scripts then
-          pdf.scripts = {}
-        end
-        obj.js = js:exclude_chars('%n%c')
-        pdf.scripts[#pdf.scripts + 1] = obj.js
+      js = process_javascript(task, pdf, js)
+      if js then
+        obj.js = js
         lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
             obj.major, obj.minor, obj.js)
       else
         lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
             obj.major, obj.minor, js)
       end
+    elseif obj.dict.F then
+      local launch = maybe_dereference_object(obj.dict.F, pdf, task)
+
+      if launch then
+        if type(launch) == 'string' then
+          obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
+          lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
+              obj.major, obj.minor, obj.launch)
+        elseif type(launch) == 'userdata' then
+          obj.launch = launch:exclude_chars('%n%c')
+          lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
+              obj.major, obj.minor, obj.launch)
+        else
+          lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
+              obj.major, obj.minor, launch)
+        end
+      end
     else
+
       lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
           obj.major, obj.minor)
     end
@@ -549,27 +587,37 @@ end
 
 -- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
 local function process_catalog(task, pdf, obj)
-  if obj.dict and obj.dict.OpenAction then
-    local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
-
-    if action and type(action) == 'table' then
-      -- This also processes action js (if not already processed)
-      process_dict(task, pdf, action, action.dict)
-      if action.js then
-        lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
-            obj.major, obj.minor, action.js)
-        pdf.openaction = action.js
+  if obj.dict then
+    if obj.dict.OpenAction then
+      local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
+
+      if action and type(action) == 'table' then
+        -- This also processes action js (if not already processed)
+        process_dict(task, pdf, action, action.dict)
+        if action.js then
+          lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
+              obj.major, obj.minor, action.js)
+          pdf.openaction = action.js
+        elseif action.launch then
+          lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
+              obj.major, obj.minor, action.launch)
+          pdf.launch = action.launch
+        else
+          lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
+              obj.major, obj.minor, action)
+        end
       else
-        lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
-            obj.major, obj.minor, action)
+        lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
+            obj.major, obj.minor, obj.dict.OpenAction, action)
       end
     else
-      lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
-          obj.major, obj.minor, obj.dict.OpenAction, action)
+      lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
+          obj.major, obj.minor)
+    end
+    if obj.dict.AA then
+      -- Additional action
+
     end
-  else
-    lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
-        obj.major, obj.minor)
   end
 end
 
@@ -581,9 +629,16 @@ process_dict = function(task, pdf, obj, dict)
     end
 
     if not obj.type then
-      lua_util.debugm(N, task, 'no type for %s:%s',
-          obj.major, obj.minor)
-      return
+
+      if obj.dict.S and obj.dict.JS then
+        obj.type = 'Javascript'
+        lua_util.debugm(N, task, 'implicit type for Javascript object %s:%s',
+            obj.major, obj.minor)
+      else
+        lua_util.debugm(N, task, 'no type for %s:%s',
+            obj.major, obj.minor)
+        return
+      end
     end
 
     lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
@@ -689,6 +744,31 @@ process_dict = function(task, pdf, obj, dict)
       process_action(task, pdf, obj)
     elseif obj.type == 'Catalog' then
       process_catalog(task, pdf, obj)
+    elseif obj.type == 'Javascript' then
+      local js = maybe_dereference_object(obj.dict.JS, pdf, task)
+
+      if js then
+        if type(js) == 'table' then
+          local extracted_js = maybe_extract_object_stream(js, pdf, task)
+
+          if not extracted_js then
+            lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
+                obj.major, obj.minor, js)
+          else
+            js = extracted_js
+          end
+        end
+
+        js = process_javascript(task, pdf, js)
+        if js then
+          obj.js = js
+          lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
+              obj.major, obj.minor, obj.js)
+        else
+          lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
+              obj.major, obj.minor, js)
+        end
+      end
     end
   end -- Already processed dict (obj.type is not empty)
 end


More information about the Commits mailing list