commit cd08c88: [Minor] Fix performance issue with is_utf_outside_range

Miecio Za miecio at miecio.net
Tue Mar 19 10:42:04 UTC 2019


Author: Miecio Za
Date: 2019-03-18 14:06:56 +0100
URL: https://github.com/rspamd/rspamd/commit/cd08c8845f6ea0bac789ea8a49f7d8537f598b7d

[Minor] Fix performance issue with is_utf_outside_range
Fix performance issue, add some checking and add a few tests

---
 src/lua/lua_util.c            | 67 +++++++++++++++++++++++++++++++------------
 test/lua/unit/rspamd_util.lua | 67 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 18 deletions(-)

diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 71d61da62..7c98a0989 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -22,6 +22,7 @@
 #include "libmime/email_addr.h"
 #include "libmime/content_type.h"
 #include "libmime/mime_headers.h"
+#include "libutil/hash.h"
 #include "linenoise.h"
 #include <math.h>
 #include <glob.h>
@@ -2458,6 +2459,12 @@ lua_util_is_utf_spoofed (lua_State *L)
 			uspoof_setChecks (spc_sgl,
 					USPOOF_INVISIBLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE,
 					&uc_err);
+			if (uc_err != U_ZERO_ERROR) {
+				msg_err ("Cannot set proper checks for uspoof: %s", u_errorName (uc_err));
+				lua_pushboolean (L, false);
+				uspoof_close(spc);
+				return 1;
+			}
 		}
 
 		ret = uspoof_checkUTF8 (spc_sgl, s1, l1, NULL, &uc_err);
@@ -2533,28 +2540,52 @@ lua_util_is_utf_outside_range(lua_State *L)
 	guint32 range_start = lua_tointeger (L, 2);
 	guint32 range_end = lua_tointeger (L, 3);
 
-	USpoofChecker *spc_sgl;
-	USet * allowed_chars;
-	UErrorCode uc_err = U_ZERO_ERROR;
+	static rspamd_lru_hash_t *validators;
+
+	if (validators == NULL) {
+		validators = rspamd_lru_hash_new(16, g_free, (GDestroyNotify)uspoof_close);
+	}
 
 	if (string_to_check) {
-		spc_sgl = uspoof_open (&uc_err);
-		if (uc_err != U_ZERO_ERROR) {
-			msg_err ("cannot init spoof checker: %s", u_errorName (uc_err));
-			lua_pushboolean (L, false);
-			uspoof_close(spc_sgl);
-			return 1;
-		}
+		guint64 hash_key = (guint64)range_end << 32 || range_start;
+
+		USpoofChecker *validator = rspamd_lru_hash_lookup(validators, &hash_key, time(NULL));
+
+		UErrorCode uc_err = U_ZERO_ERROR;
+
+		if (validator == NULL) {
+			USet * allowed_chars;
+			guint64 * creation_hash_key = g_malloc(sizeof(guint64));
+			*creation_hash_key = hash_key;
+
+			validator = uspoof_open (&uc_err);
+			if (uc_err != U_ZERO_ERROR) {
+				msg_err ("cannot init spoof checker: %s", u_errorName (uc_err));
+				lua_pushboolean (L, false);
+				uspoof_close(validator);
+				return 1;
+			}
+
+			allowed_chars = uset_openEmpty();
+			uset_addRange(allowed_chars, range_start, range_end);
+			uspoof_setAllowedChars(validator, allowed_chars, &uc_err);
+
+			uspoof_setChecks (validator,
+				USPOOF_CHAR_LIMIT | USPOOF_ANY_CASE, &uc_err);
 
-		allowed_chars = uset_openEmpty();
-		uset_addRange(allowed_chars, range_start, range_end);
-		uspoof_setAllowedChars(spc_sgl, allowed_chars, &uc_err);
+			uset_close(allowed_chars);
+
+			if (uc_err != U_ZERO_ERROR) {
+				msg_err ("Cannot configure uspoof: %s", u_errorName (uc_err));
+				lua_pushboolean (L, false);
+				uspoof_close(validator);
+				return 1;
+			}
+
+			rspamd_lru_hash_insert(validators, creation_hash_key, validator, time(NULL), 0);
+		}
 
-		uspoof_setChecks (spc_sgl,
-			USPOOF_CHAR_LIMIT | USPOOF_ANY_CASE, &uc_err);
-		ret = uspoof_checkUTF8 (spc_sgl, string_to_check, len_of_string, NULL, &uc_err);
-		uset_close(allowed_chars);
-		uspoof_close(spc_sgl);
+		ret = uspoof_checkUTF8 (validator, string_to_check, len_of_string, NULL, &uc_err);
 	}
 	else {
 		return luaL_error (L, "invalid arguments");
diff --git a/test/lua/unit/rspamd_util.lua b/test/lua/unit/rspamd_util.lua
new file mode 100644
index 000000000..802b400d2
--- /dev/null
+++ b/test/lua/unit/rspamd_util.lua
@@ -0,0 +1,67 @@
+context("Rspamd util for lua - check generic functions", function()
+    local util  = require 'rspamd_util'
+
+    local cases = {
+        {
+            input = "test1",
+            result = false,
+            range_start = 0x0000,
+            range_end = 0x017f
+        },
+        {
+            input = "test test xxx",
+            result = false,
+            range_start = 0x0000,
+            range_end = 0x017f
+        },
+        {
+            input = "АбЫрвАлг",
+            result = true,
+            range_start = 0x0000,
+            range_end = 0x017f
+        },
+        {
+            input = "АбЫрвАлг example",
+            result = true,
+            range_start = 0x0000,
+            range_end = 0x017f
+        },
+        {
+            input = "example ąłśćżłóę",
+            result = false,
+            range_start = 0x0000,
+            range_end = 0x017f
+        },
+        {
+            input = "ąłśćżłóę АбЫрвАлг",
+            result = true,
+            range_start = 0x0000,
+            range_end = 0x017f
+        },
+    }
+
+    for i,c in ipairs(cases) do
+        test("is_utf_outside_range, test case #" .. i, function()
+          local actual = util.is_utf_outside_range(c.input, c.range_start, c.range_end)
+
+          assert_equal(c.result, actual)
+        end)
+    end
+
+    test("is_utf_outside_range, check cache", function ()
+        cache_size = 20
+        for i = 1,cache_size do
+            local res = util.is_utf_outside_range("a", 0x0000, 0x0000+i)
+        end
+    end)
+
+    test("is_utf_outside_range, check empty string", function ()
+        assert_error(util.is_utf_outside_range)
+    end)
+
+    test("get_string_stats, test case", function()
+        local res = util.get_string_stats("this is test 99")
+        assert_equal(res["letters"], 10)
+        assert_equal(res["digits"], 2)
+    end)
+end)


More information about the Commits mailing list