commit 89c5fe4: [Minor] Rewrite is_utf_mixed_script to use libicu only
Miecio Za
miecio at miecio.net
Wed Mar 27 11:07:05 UTC 2019
Author: Miecio Za
Date: 2019-03-26 20:23:13 +0100
URL: https://github.com/rspamd/rspamd/commit/89c5fe4c05012315e9229e033ae3ded8c31b1cd7 (refs/pull/2813/head)
[Minor] Rewrite is_utf_mixed_script to use libicu only
Rewrite to use U8_NEXT
---
src/lua/lua_util.c | 15 +++++++++------
test/lua/unit/rspamd_util.lua | 22 ++++++++++++++++++++++
2 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 881257ed3..af4673af8 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -2513,15 +2513,18 @@ lua_util_is_utf_mixed_script(lua_State *L)
{
LUA_TRACE_POINT;
gsize len_of_string;
- const gchar *end, *string_to_check = lua_tolstring (L, 1, &len_of_string);
+ const gchar *string_to_check = lua_tolstring (L, 1, &len_of_string);
UScriptCode last_script_code = USCRIPT_INVALID_CODE;
UErrorCode uc_err = U_ZERO_ERROR;
- if (string_to_check && g_utf8_validate (string_to_check, len_of_string, &end)) {
- len_of_string = g_utf8_strlen (string_to_check, len_of_string);
-
- for(; *string_to_check; string_to_check = g_utf8_next_char(string_to_check)){
- gunichar char_to_check = g_utf8_get_char(string_to_check);
+ if (string_to_check) {
+ uint index = 0;
+ UChar32 char_to_check = 0;
+ while(index < len_of_string) {
+ U8_NEXT(string_to_check, index, len_of_string, char_to_check);
+ if (char_to_check < 0 ) {
+ return luaL_error (L, "passed string is not valid utf");
+ }
UScriptCode current_script_code = uscript_getScript(char_to_check, &uc_err);
if (uc_err != U_ZERO_ERROR){
msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err));
diff --git a/test/lua/unit/rspamd_util.lua b/test/lua/unit/rspamd_util.lua
index 802b400d2..859316be7 100644
--- a/test/lua/unit/rspamd_util.lua
+++ b/test/lua/unit/rspamd_util.lua
@@ -5,36 +5,42 @@ context("Rspamd util for lua - check generic functions", function()
{
input = "test1",
result = false,
+ mixed_script = false,
range_start = 0x0000,
range_end = 0x017f
},
{
input = "test test xxx",
result = false,
+ mixed_script = false,
range_start = 0x0000,
range_end = 0x017f
},
{
input = "АбЫрвАлг",
result = true,
+ mixed_script = false,
range_start = 0x0000,
range_end = 0x017f
},
{
input = "АбЫрвАлг example",
result = true,
+ mixed_script = true,
range_start = 0x0000,
range_end = 0x017f
},
{
input = "example ąłśćżłóę",
result = false,
+ mixed_script = false,
range_start = 0x0000,
range_end = 0x017f
},
{
input = "ąłśćżłóę АбЫрвАлг",
result = true,
+ mixed_script = true,
range_start = 0x0000,
range_end = 0x017f
},
@@ -64,4 +70,20 @@ context("Rspamd util for lua - check generic functions", function()
assert_equal(res["letters"], 10)
assert_equal(res["digits"], 2)
end)
+
+ for i,c in ipairs(cases) do
+ test("is_utf_mixed_script, test case #" .. i, function()
+ local actual = util.is_utf_mixed_script(c.input)
+
+ assert_equal(c.mixed_script, actual)
+ end)
+ end
+
+ test("is_utf_mixed_script, invalid utf str should return errror", function()
+ assert_error(util.is_utf_mixed_script,'\200\213\202')
+ end)
+
+ test("is_utf_mixed_script, empty str should return errror", function()
+ assert_error(util.is_utf_mixed_script,'\200\213\202')
+ end)
end)
More information about the Commits mailing list