commit a0f7afd: [Fix] Fuzzy_check: Disable shingles for short texts (really)
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Sep 28 10:07:09 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-09-28 11:05:12 +0100
URL: https://github.com/rspamd/rspamd/commit/a0f7afdc2c97c742a92fd77bd3927b0c8fa422c7 (HEAD -> master)
[Fix] Fuzzy_check: Disable shingles for short texts (really)
---
conf/modules.d/fuzzy_check.conf | 3 ++-
lualib/lua_fuzzy.lua | 10 +++++++---
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/conf/modules.d/fuzzy_check.conf b/conf/modules.d/fuzzy_check.conf
index 5f02d864b..73e280f79 100644
--- a/conf/modules.d/fuzzy_check.conf
+++ b/conf/modules.d/fuzzy_check.conf
@@ -25,7 +25,8 @@ fuzzy_check {
max_score = 20.0;
read_only = yes;
skip_unknown = yes;
- short_text_direct_hash = true;
+ short_text_direct_hash = true; # If a text part has fewer than min_length words, use a direct hash instead of shingles
+ min_length = 64; # Minimum word count required to use shingles
fuzzy_map = {
FUZZY_DENIED {
max_score = 20.0;
diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua
index d2733d5d6..0131ef8e2 100644
--- a/lualib/lua_fuzzy.lua
+++ b/lualib/lua_fuzzy.lua
@@ -157,14 +157,18 @@ local function check_text_part(task, part, rule, text)
if rule.text_shingles then
-- Check number of words
- if rule.min_length > 0 and wcnt < rule.min_length then
+ local min_words = rule.min_length or 0
+ if min_words < 32 then
+ min_words = 32 -- Minimum for shingles
+ end
+ if wcnt < min_words then
lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
rule.min_length, wcnt)
allow_shingles = false
else
lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
id, wcnt)
- allow_shingles = wcnt > 0
+ allow_shingles = true
end
if not rule.short_text_direct_hash and not allow_shingles then
@@ -191,7 +195,7 @@ end
local function has_sane_text_parts(task)
local text_parts = task:get_text_parts() or {}
- return fun.any(function(tp) return tp:get_words_count() > 10 end, text_parts)
+ return fun.any(function(tp) return tp:get_words_count() > 32 end, text_parts)
end
local function check_image_part(task, part, rule, image)
More information about the Commits
mailing list