commit ef54307: [Project] Lua_magic: Add Oasis documents detection

Vsevolod Stakhov vsevolod at highsecure.ru
Sun Sep 8 08:56:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-09-08 09:50:27 +0100
URL: https://github.com/rspamd/rspamd/commit/ef54307ee4621ee2645c7cf9456e2542f51875f6 (HEAD -> master)

[Project] Lua_magic: Add Oasis documents detection

---
 lualib/lua_magic/heuristics.lua | 53 +++++++++++++++++++++++++++++++++++++++++
 lualib/lua_magic/types.lua      | 31 +++++++++++++++++-------
 2 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
index 167edd0c9..b30f95794 100644
--- a/lualib/lua_magic/heuristics.lua
+++ b/lualib/lua_magic/heuristics.lua
@@ -167,6 +167,20 @@ end
 
 exports.ole_format_heuristic = detect_ole_format
 
+-- Returns the best-scored extension in `res` and its weight, or nil if empty
+local function process_detected(res)
+  local extensions = lua_util.keys(res)
+  if #extensions > 0 then
+    -- Order by descending confidence so the best candidate comes first
+    table.sort(extensions, function(ex1, ex2)
+      return res[ex1] > res[ex2]
+    end)
+    -- Yield a single extension string (not the whole ranking) for callers
+    return extensions[1],res[extensions[1]]
+  end
+  return nil
+end
+
 local function detect_archive_flaw(part, arch)
   local arch_type = arch:get_type()
   local res = {
@@ -174,6 +188,9 @@ local function detect_archive_flaw(part, arch)
     xlsx = 0,
     pptx = 0,
     jar = 0,
+    odt = 0,
+    odp = 0,
+    ods = 0
   } -- ext + confidence pairs
 
   -- General msoffice patterns
@@ -195,8 +212,44 @@ local function detect_archive_flaw(part, arch)
         res.xlsx = res.docx + 30
       elseif file == 'ppt/' then
         res.xlsx = res.pptx + 30
+      elseif file == 'META-INF/manifest.xml' then
+        -- Apply ODT detection logic
+        local content = part:get_content()
+
+        if #content > 80 then
+          -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
+          local start_span = content:span(30, 50)
+
+          local mp = tostring(start_span:span(1, 8))
+          if mp == 'mimetype' then
+            local spec_type = tostring(start_span:span(9))
+            if spec_type:find('vnd.oasis.opendocument.text') then
+              res.odt = 40
+            elseif spec_type:find('vnd.oasis.opendocument.spreadsheet') then
+              res.ods = 40
+            elseif spec_type:find('vnd.oasis.opendocument.formula') then
+              res.ods = 40
+            elseif spec_type:find('vnd.oasis.opendocument.chart') then
+              res.ods = 40
+            elseif spec_type:find('vnd.oasis.opendocument.presentation') then
+              res.odp = 40
+            elseif spec_type:find('vnd.oasis.opendocument.image') then
+              -- Assume image as odt
+              res.odt = 40
+            elseif spec_type:find('vnd.oasis.opendocument.graphics') then
+              -- Assume image as odt
+              res.odt = 40
+            end
+          end
+        end
       end
     end
+
+    local ext,weight = process_detected(res)
+
+    if weight >= 40 then
+      return ext,weight
+    end
   end
 
   return arch_type:lower(),40
diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua
index c5de552c8..299dc1924 100644
--- a/lualib/lua_magic/types.lua
+++ b/lualib/lua_magic/types.lua
@@ -133,23 +133,23 @@ local types = {
   -- Ole files
   ole = {
     ct = 'application/octet-stream',
-    type = 'msoffice'
+    type = 'office'
   },
   doc = {
     ct = 'application/msword',
-    type = 'msoffice'
+    type = 'office'
   },
   xls = {
     ct = 'application/vnd.ms-excel',
-    type = 'msoffice'
+    type = 'office'
   },
   ppt = {
     ct = 'application/vnd.ms-powerpoint',
-    type = 'msoffice'
+    type = 'office'
   },
   vsd = {
     ct = 'application/vnd.visio',
-    type = 'msoffice'
+    type = 'office'
   },
   msi = {
     ct = 'application/x-msi',
@@ -157,20 +157,33 @@ local types = {
   },
   msg = {
     ct = 'application/vnd.ms-outlook',
-    type = 'msoffice'
+    type = 'office'
   },
   -- newer office (2007+)
   docx = {
     ct = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-    type = 'msoffice'
+    type = 'office'
   },
   xlsx = {
     ct = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-    type = 'msoffice'
+    type = 'office'
   },
   pptx = {
     ct = 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-    type = 'msoffice'
+    type = 'office'
+  },
+  -- OpenOffice formats
+  odt = {
+    ct = 'application/vnd.oasis.opendocument.text',
+    type = 'office'
+  },
+  ods = {
+    ct = 'application/vnd.oasis.opendocument.spreadsheet',
+    type = 'office'
+  },
+  odp = {
+    ct = 'application/vnd.oasis.opendocument.presentation',
+    type = 'office'
   },
   -- other
   pgp = {


More information about the Commits mailing list