commit da99d2d: [Project] Lua_magic: Add heuristics for text parts

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Sep 9 14:14:05 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-09-09 15:01:28 +0100
URL: https://github.com/rspamd/rspamd/commit/da99d2d9118c0e67e40d8e568fcda998ad329d45

[Project] Lua_magic: Add heuristics for text parts

---
 lualib/lua_magic/heuristics.lua | 126 +++++++++++++++++++++++++++++++++++-----
 lualib/lua_magic/init.lua       |  12 +++-
 lualib/lua_magic/types.lua      |  12 ++++
 3 files changed, 136 insertions(+), 14 deletions(-)

diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
index 8469fa9f8..d8c134e57 100644
--- a/lualib/lua_magic/heuristics.lua
+++ b/lualib/lua_magic/heuristics.lua
@@ -44,20 +44,46 @@ local msoffice_clsids = {
 local zip_trie
 local zip_patterns = {
   -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
-  odt = {[[mimetypeapplication/vnd\.oasis\.opendocument.text]],
-         [[mimetypeapplication/vnd\.oasis.opendocument\.image]],
-         [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]},
-  ods = {[[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]],
-         [[mimetypeapplication/vnd\.oasis\.opendocument.formula]],
-         [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]},
+  odt = {
+    [[mimetypeapplication/vnd\.oasis\.opendocument.text]],
+    [[mimetypeapplication/vnd\.oasis.opendocument\.image]],
+    [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]
+  },
+  ods = {
+    [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]],
+    [[mimetypeapplication/vnd\.oasis\.opendocument.formula]],
+    [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]
+  },
   odp = {[[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]]},
   epub = {[[epub\+zip]]}
 }
 
+local txt_trie
+local txt_patterns = {
+  html = {
+    [[(?i)\s*<html]],
+    [[(?i)\s*<\!DOCTYPE HTML]],
+    [[(?i)\s*<xml]],
+    [[(?i)\s*<body]],
+    [[(?i)\s*<table]],
+    [[(?i)\s*<a]],
+    [[(?i)\s*<p]],
+    [[(?i)\s*<div]],
+    [[(?i)\s*<span]],
+  },
+  csv = {
+    [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+[\r\n])]]
+  },
+  js = {
+    [[\s*function\s*\(]],
+  },
+}
+
 -- Used to match pattern index and extension
 local msoffice_clsid_indexes = {}
 local msoffice_patterns_indexes = {}
 local zip_patterns_indexes = {}
+local txt_patterns_indexes = {}
 
 local exports = {}
 
@@ -102,6 +128,9 @@ local function compile_tries()
     -- Misc zip patterns at the initial fragment
     zip_trie = compile_pats(zip_patterns, zip_patterns_indexes,
         function(pat) return pat end)
+    -- Text patterns at the initial fragment
+    txt_trie = compile_pats(txt_patterns, txt_patterns_indexes,
+        function(pat) return pat end)
   end
 end
 
@@ -271,13 +300,6 @@ local function detect_archive_flaw(part, arch, log_obj)
 end
 
 exports.mime_part_heuristic = function(part, log_obj)
-  if part:is_text() then
-    if part:get_text():is_html() then
-      return 'html',60
-    else
-      return 'txt',60
-    end
-  end
 
   if part:is_image() then
     local img = part:get_image()
@@ -292,4 +314,82 @@ exports.mime_part_heuristic = function(part, log_obj)
   return nil
 end
 
+exports.text_part_heuristic = function(part, log_obj)
+  -- We get some span of data and check it
+  local function is_span_text(span)
+    local function rough_utf8_check(b)
+      if b >= 127 then
+        if bit.band(b, 0xe0) == 0xc0 or bit.band(b, 0xf0) == 0xe0 or bit.band(b, 0xf8) == 0xf0 then
+          return true
+        end
+        return false
+      else
+        return true
+      end
+    end
+
+    -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
+    local tlen = #span
+    local non_printable = 0
+    for _,b in ipairs(span:bytes()) do
+      if ((b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09))
+          or (not rough_utf8_check(b)) then
+        non_printable = non_printable + 1
+      end
+    end
+    lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
+        tlen - non_printable, non_printable, tlen)
+    if non_printable / tlen > 0.0625 then
+      return false
+    end
+
+    return true
+  end
+
+  local content = part:get_content()
+  local clen = #content
+  local is_text
+
+  if clen > 0 then
+    if clen > 80 * 3 then
+      -- Use chunks
+      is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80))
+    else
+      is_text = is_span_text(content)
+    end
+
+    if is_text then
+      -- Try patterns
+      local span_len = math.min(160, clen)
+      local start_span = content:span(1, span_len)
+      local matches = txt_trie:match(start_span)
+      local res = {}
+      if matches then
+        -- Require at least 2 occurrences of those patterns
+        for n,positions in pairs(matches) do
+          local ext = txt_patterns_indexes[n]
+          if ext then
+            res[ext] = (res[ext] or 0) + 20 * #positions
+            lua_util.debugm(N, log_obj, "found txt pattern for %s: %s",
+                ext, #positions)
+          end
+        end
+
+        if res.html and res.html >= 40 then
+          -- HTML has priority over something like js...
+          return 'html',res.html
+        end
+
+        local ext,weight = process_top_detected(res)
+
+        if weight and weight >= 40 then
+          return ext,weight
+        end
+      end
+
+      return 'txt',40
+    end
+  end
+end
+
 return exports
\ No newline at end of file
diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua
index e9e0297e9..27f968149 100644
--- a/lualib/lua_magic/init.lua
+++ b/lualib/lua_magic/init.lua
@@ -326,7 +326,17 @@ exports.detect_mime_part = function(part, log_obj)
     return ext,types[ext]
   end
 
-  return exports.detect(part:get_content(), log_obj)
+  ext,weight = exports.detect(part:get_content(), log_obj)
+
+  if ext and weight and weight > 20 then
+    return ext,types[ext]
+  end
+
+  -- Text/html and other parts
+  ext,weight = heuristics.text_part_heuristic(part, log_obj)
+  if ext and weight and weight > 20 then
+    return ext,types[ext]
+  end
 end
 
 -- This parameter specifies how many bytes are checked in the input
diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua
index 3ecd0575a..93bfa6641 100644
--- a/lualib/lua_magic/types.lua
+++ b/lualib/lua_magic/types.lua
@@ -250,6 +250,18 @@ local types = {
     type = 'text',
     ct = 'text/html',
   },
+  csv = {
+    type = 'text',
+    ct = 'text/csv',
+  },
+  eml = {
+    type = 'message',
+    ct = 'message/rfc822',
+  },
+  js = {
+    type = 'application',
+    ct = 'application/javascript',
+  },
 }
 
 return types
\ No newline at end of file


More information about the Commits mailing list