commit 7cf7a88: [Project] Lua_magic: Add mime parts detection function

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Sep 7 15:42:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-09-07 16:37:58 +0100
URL: https://github.com/rspamd/rspamd/commit/7cf7a889a60efb651fdf2062b1773ad17c9eec7f (HEAD -> master)

[Project] Lua_magic: Add mime parts detection function

---
 lualib/lua_magic/heuristics.lua | 26 ++++++++++++++++++++++++
 lualib/lua_magic/init.lua       | 11 +++++++++++
 lualib/lua_magic/types.lua      | 44 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)

diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
index d60c87162..6a407f5e9 100644
--- a/lualib/lua_magic/heuristics.lua
+++ b/lualib/lua_magic/heuristics.lua
@@ -52,6 +52,7 @@ local function compile_msoffice_trie(log_obj)
     local strs = {}
     for ext,pats in pairs(msoffice_patterns) do
       for _,pat in ipairs(pats) do
+        -- These are utf16 strings in fact...
         strs[#strs + 1] = '^' ..
             table.concat(
                 fun.totable(
@@ -66,6 +67,7 @@ local function compile_msoffice_trie(log_obj)
     strs = {}
     for ext,pats in pairs(msoffice_clsids) do
       for _,pat in ipairs(pats) do
+        -- Convert hex to re
         local hex_table = {}
         for i=1,#pat,2 do
           local subc = pat:sub(i, i + 1)
@@ -163,6 +165,30 @@ local function detect_ole_format(input, log_obj)
   until directory_offset >= inplen
 end
 
+
 exports.ole_format_heuristic = detect_ole_format
 
+-- Guesses the type of a mime part from what has already been parsed for it.
+-- Text, image and archive parts are checked in that order.
+-- @param part mime part object (is_text, is_image, is_archive and getters)
+-- @return extension string and weight (always 60), or nil if nothing matched
+exports.mime_part_heuristic = function(part)
+  local weight = 60
+
+  if part:is_text() then
+    local kind = part:get_text():is_html() and 'html' or 'txt'
+    return kind,weight
+  end
+  if part:is_image() then
+    -- The image type name is lowercased and used as the extension as is
+    return part:get_image():get_type():lower(),weight
+  end
+  if part:is_archive() then
+    -- TODO: add files heuristics
+    return part:get_archive():get_type():lower(),weight
+  end
+
+  return nil
+end
+
 return exports
\ No newline at end of file
diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua
index 59e2a6e36..8b5064bfe 100644
--- a/lualib/lua_magic/init.lua
+++ b/lualib/lua_magic/init.lua
@@ -21,6 +21,7 @@ limitations under the License.
 
 local patterns = require "lua_magic/patterns"
 local types = require "lua_magic/types"
+local heuristics = require "lua_magic/heuristics"
 local fun = require "fun"
 local lua_util = require "lua_util"
 
@@ -317,6 +318,16 @@ exports.detect = function(input, log_obj)
   return nil
 end
 
+-- Detects a mime part type: parsed-part heuristic first, then its raw content
+exports.detect_mime_part = function(part, log_obj)
+  local guess,score = heuristics.mime_part_heuristic(part)
+  local confident = guess and score and score > 20
+  if not confident then
+    return exports.detect(part:get_content(), log_obj)
+  end
+  return guess,types[guess] -- types[guess] is nil for a key missing in types
+end
+
 -- This parameter specifies how many bytes are checked in the input
 -- Rspamd checks 2 chunks at start and 1 chunk at the end
 exports.chunk_size = 32768
diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua
index 8255af663..c8850cd18 100644
--- a/lualib/lua_magic/types.lua
+++ b/lualib/lua_magic/types.lua
@@ -168,6 +168,50 @@ local types = {
     ct = 'application/x-uuencoded',
     type = 'binary',
   },
+  -- Types that are detected by Rspamd itself
+  -- Archives
+  zip = {
+    ct = 'application/zip',
+    type = 'archive',
+  },
+  rar = {
+    ct = 'application/x-rar',
+    type = 'archive',
+  },
+  ['7z'] = {
+    ct = 'application/x-7z-compressed', -- full type/subtype, like the other entries
+    type = 'archive',
+  },
+  gz = {
+    ct = 'application/gzip',
+    type = 'archive',
+  },
+  -- Images
+  png = {
+    ct = 'image/png',
+    type = 'image',
+  },
+  gif = {
+    ct = 'image/gif',
+    type = 'image',
+  },
+  jpg = {
+    ct = 'image/jpeg',
+    type = 'image',
+  },
+  bmp = {
+    type = 'image',
+    ct = 'image/bmp',
+  },
+  -- Text
+  txt = {
+    type = 'text',
+    ct = 'text/plain',
+  },
+  html = {
+    type = 'text',
+    ct = 'text/html',
+  },
 }
 
 return types
\ No newline at end of file


More information about the Commits mailing list