commit 89c5fe4: [Minor] Rewrite is_utf_mixed_script to use libicu only

Miecio Za miecio at miecio.net
Wed Mar 27 11:07:05 UTC 2019


Author: Miecio Za
Date: 2019-03-26 20:23:13 +0100
URL: https://github.com/rspamd/rspamd/commit/89c5fe4c05012315e9229e033ae3ded8c31b1cd7 (refs/pull/2813/head)

[Minor] Rewrite is_utf_mixed_script to use libicu only
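Rewrite the string iteration to use ICU's U8_NEXT macro instead of the GLib g_utf8_* helpers; invalid UTF-8 input now raises a Lua error.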

---
 src/lua/lua_util.c            | 15 +++++++++------
 test/lua/unit/rspamd_util.lua | 22 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 881257ed3..af4673af8 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -2513,15 +2513,18 @@ lua_util_is_utf_mixed_script(lua_State *L)
 {
 	LUA_TRACE_POINT;
 	gsize len_of_string;
-	const gchar *end, *string_to_check = lua_tolstring (L, 1, &len_of_string);
+	const gchar *string_to_check = lua_tolstring (L, 1, &len_of_string);
 	UScriptCode last_script_code = USCRIPT_INVALID_CODE;
 	UErrorCode uc_err = U_ZERO_ERROR;
 
-	if (string_to_check && g_utf8_validate (string_to_check, len_of_string, &end)) {
-		len_of_string = g_utf8_strlen (string_to_check, len_of_string);
-
-		for(; *string_to_check; string_to_check = g_utf8_next_char(string_to_check)){
-			gunichar char_to_check = g_utf8_get_char(string_to_check);
+	if (string_to_check) {
+		uint index = 0;
+		UChar32 char_to_check = 0;
+		while(index < len_of_string) {
+			U8_NEXT(string_to_check, index, len_of_string, char_to_check);
+			if (char_to_check < 0 ) {
+				return luaL_error (L, "passed string is not valid utf");
+			}
 			UScriptCode current_script_code = uscript_getScript(char_to_check, &uc_err);
 			if (uc_err != U_ZERO_ERROR){
 				msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err));
diff --git a/test/lua/unit/rspamd_util.lua b/test/lua/unit/rspamd_util.lua
index 802b400d2..859316be7 100644
--- a/test/lua/unit/rspamd_util.lua
+++ b/test/lua/unit/rspamd_util.lua
@@ -5,36 +5,42 @@ context("Rspamd util for lua - check generic functions", function()
         {
             input = "test1",
             result = false,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "test test xxx",
             result = false,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "АбЫрвАлг",
             result = true,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "АбЫрвАлг example",
             result = true,
+            mixed_script = true,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "example ąłśćżłóę",
             result = false,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "ąłśćżłóę АбЫрвАлг",
             result = true,
+            mixed_script = true,
             range_start = 0x0000,
             range_end = 0x017f
         },
@@ -64,4 +70,20 @@ context("Rspamd util for lua - check generic functions", function()
         assert_equal(res["letters"], 10)
         assert_equal(res["digits"], 2)
     end)
+
+    for i,c in ipairs(cases) do
+        test("is_utf_mixed_script, test case #" .. i, function()
+          local actual = util.is_utf_mixed_script(c.input)
+
+          assert_equal(c.mixed_script, actual)
+        end)
+    end
+
+    test("is_utf_mixed_script, invalid utf str should return errror", function()
+        assert_error(util.is_utf_mixed_script,'\200\213\202')
+    end)
+
+    test("is_utf_mixed_script, empty str should return errror", function()
+        assert_error(util.is_utf_mixed_script,'\200\213\202')
+    end)
 end)


More information about the Commits mailing list