commit 12fdcf7: [Minor] Fix text parts heuristic

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Oct 7 13:56:06 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-10-07 14:33:50 +0100
URL: https://github.com/rspamd/rspamd/commit/12fdcf7bad30d4c5f9110f618e147562cefbea48

[Minor] Fix text parts heuristic

---
 lualib/lua_magic/heuristics.lua | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
index 306b3e188..07b1ef76a 100644
--- a/lualib/lua_magic/heuristics.lua
+++ b/lualib/lua_magic/heuristics.lua
@@ -314,26 +314,50 @@ end
 exports.text_part_heuristic = function(part, log_obj)
   -- We get some span of data and check it
   local function is_span_text(span)
-    local function rough_utf8_check(b)
+    local function rough_utf8_check(bytes, idx, remain)
+      local b = bytes[idx]
       if b >= 127 then
-        if bit.band(b, 0xe0) == 0xc0 or bit.band(b, 0xf0) == 0xe0 or bit.band(b, 0xf8) == 0xf0 then
-          return true
+        if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
+            bit.band(bytes[idx + 1], 0xc0) == 0x80 then
+          return true,1
+        elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
+            bit.band(bytes[idx + 1], 0xc0) == 0x80 and
+            bit.band(bytes[idx + 2], 0xc0) == 0x80 then
+          return true,2
+        elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
+            bit.band(bytes[idx + 1], 0xc0) == 0x80 and
+            bit.band(bytes[idx + 2], 0xc0) == 0x80 and
+            bit.band(bytes[idx + 3], 0xc0) == 0x80 then
+          return true,3
         end
         return false
       else
-        return true
+        return true,0
       end
     end
 
     -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
     local tlen = #span
     local non_printable = 0
-    for _,b in ipairs(span:bytes()) do
-      if ((b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09))
-          or (not rough_utf8_check(b)) then
+    local bytes = span:bytes()
+    local i = 1
+    repeat
+      local b = bytes[i]
+
+      if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
         non_printable = non_printable + 1
+      elseif b >= 127 then
+        local c,nskip = rough_utf8_check(bytes, i, tlen - i)
+
+        if not c then
+          non_printable = non_printable + 1
+        else
+          i = i + nskip
+        end
       end
-    end
+      i = i + 1
+    until i > tlen
+
     lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
         tlen - non_printable, non_printable, tlen)
     if non_printable / tlen > 0.0078125 then


More information about the Commits mailing list