commit d5f70b3: [Project] Lua_content: Add preliminary support of fuzzy hashes from PDF scripts

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jan 22 12:21:08 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-22 11:40:33 +0000
URL: https://github.com/rspamd/rspamd/commit/d5f70b34e0a4f01722ea1020d1e7dac03b134bf2

[Project] Lua_content: Add preliminary support of fuzzy hashes from PDF scripts

---
 lualib/lua_content/pdf.lua | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 53625065a..d4ad892e0 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -113,6 +113,9 @@ local config = {
   text_extraction = false, -- NYI feature
   url_extraction = true,
   enabled = true,
+  js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
+  min_js_fuzzy = 32, -- Minimum size of js to be considered as a fuzzy
+  openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
 }
 
 -- Used to process patterns found in PDF
@@ -529,6 +532,7 @@ local function process_javascript(task, pdf, js)
   local njs = {
     data = js,
     hash = rspamd_util.encode_base32(bin_hash),
+    bin_hash = bin_hash,
   }
   pdf.scripts[bin_hash] = njs
   return njs
@@ -555,7 +559,7 @@ local function process_action(task, pdf, obj)
       if js then
         obj.js = js
         lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
-            obj.major, obj.minor, obj.js)
+            obj.major, obj.minor, obj.js.data)
       else
         lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
             obj.major, obj.minor, js)
@@ -759,7 +763,7 @@ process_dict = function(task, pdf, obj, dict)
         if js then
           obj.js = js
           lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
-              obj.major, obj.minor, obj.js)
+              obj.major, obj.minor, obj.js.data)
         else
           lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
               obj.major, obj.minor, js)
@@ -1154,6 +1158,36 @@ local function process_pdf(input, _, task)
       if config.url_extraction then
         search_urls(task, pdf_output)
       end
+
+      if config.js_fuzzy and pdf_output.scripts then
+        pdf_output.fuzzy_hashes = {}
+        if config.openaction_fuzzy_only then
+          -- OpenAction only
+          if pdf_output.openaction and pdf_output.openaction.bin_hash then
+            if config.min_js_fuzzy and #pdf_output.openaction.data >= config.min_js_fuzzy then
+              lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
+                  pdf_output.openaction.hash)
+              table.insert(pdf_output.fuzzy_hashes, pdf_output.openaction.bin_hash)
+            else
+              lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
+                  pdf_output.openaction.hash, #pdf_output.openaction.data)
+            end
+          end
+        else
+          -- All hashes
+          for h,sc in pairs(pdf_output.scripts) do
+            if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then
+              lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
+                  sc.hash)
+              table.insert(pdf_output.fuzzy_hashes, h)
+            else
+              lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
+                  sc.hash, #sc.data)
+            end
+          end
+
+        end
+      end
     else
       pdf_output.flags.no_objects = true
     end


More information about the Commits mailing list