commit f1e9625: [Minor] Rework utf8 lowercasing
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Aug 13 08:49:05 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-08-13 09:46:18 +0100
URL: https://github.com/rspamd/rspamd/commit/f1e9625920e4e9add168e30c0441a4312b23c890 (HEAD -> master)
[Minor] Rework utf8 lowercasing
---
src/libserver/html.c | 2 +-
src/libserver/url.c | 3 ++-
src/libutil/str_util.c | 46 +++++++++++++++++++---------------------------
src/libutil/str_util.h | 4 ++--
test/lua/unit/utf.lua | 8 ++++----
5 files changed, 28 insertions(+), 35 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 8f6b3d291..4ff310f1c 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1018,8 +1018,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
tag->name.len = rspamd_html_decode_entitles_inplace (s,
tag->name.len);
tag->name.start = s;
+ tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len);
s[tag->name.len] = '\0';
- rspamd_str_lc_utf8 (s, tag->name.len);
k = kh_get (tag_by_name, html_tag_by_name, s);
diff --git a/src/libserver/url.c b/src/libserver/url.c
index ef59b6da0..9314ce2bb 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2069,7 +2069,8 @@ rspamd_url_parse (struct rspamd_url *uri,
}
rspamd_str_lc (uri->string, uri->protocollen);
- rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+ unquoted_len = rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+ rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->protocol == PROTOCOL_UNKNOWN) {
for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) {
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 1f2c4629f..4ce84fa65 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -62,7 +62,7 @@ const guchar lc_map[256] = {
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
-void
+guint
rspamd_str_lc (gchar *str, guint size)
{
guint leftover = size % 4;
@@ -93,6 +93,7 @@ rspamd_str_lc (gchar *str, guint size)
*dest = lc_map[(guchar)str[i]];
}
+ return size;
}
gint
@@ -144,42 +145,33 @@ rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l)
* string to lower case, so some locale peculiarities are simply ignored
* If the target string is longer than initial one, then we just trim it
*/
-void
+guint
rspamd_str_lc_utf8 (gchar *str, guint size)
{
- const gchar *s = str, *p;
- gchar *d = str, tst[6];
- gint remain = size;
- gint r;
- gunichar uc;
+ guchar *d = (guchar *)str, tst[6];
+ gint32 i = 0, prev = 0;
+ UChar32 uc;
- while (remain > 0) {
- p = g_utf8_next_char (s);
+ while (i < size) {
+ prev = i;
- if (p - s > remain) {
- break;
- }
+ U8_NEXT ((guint8*)str, i, size, uc);
+ uc = u_tolower (uc);
- uc = g_utf8_get_char (s);
- uc = g_unichar_tolower (uc);
+ gint32 olen = 0;
+ U8_APPEND_UNSAFE (tst, olen, uc);
- if (remain >= 6) {
- r = g_unichar_to_utf8 (uc, d);
+ if (olen <= (i - prev)) {
+ memcpy (d, tst, olen);
+ d += olen;
}
else {
- /* We must be cautious here to avoid broken unicode being append */
- r = g_unichar_to_utf8 (uc, tst);
- if (r > remain) {
- break;
- }
- else {
- memcpy (d, tst, r);
- }
+ /* Lowercasing has increased the length, so we need to ignore it */
+ d += i - prev;
}
- remain -= r;
- s = p;
- d += r;
}
+
+ return d - (guchar *)str;
}
gboolean
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index a1f980526..b255c125b 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -41,12 +41,12 @@ gint rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l);
/**
* Convert string to lowercase in-place using ASCII conversion
*/
-void rspamd_str_lc (gchar *str, guint size);
+guint rspamd_str_lc (gchar *str, guint size);
/**
* Convert string to lowercase in-place using utf (limited) conversion
*/
-void rspamd_str_lc_utf8 (gchar *str, guint size);
+guint rspamd_str_lc_utf8 (gchar *str, guint size);
/*
* Hash table utility functions for case insensitive hashing
diff --git a/test/lua/unit/utf.lua b/test/lua/unit/utf.lua
index 277d99e41..75dd33977 100644
--- a/test/lua/unit/utf.lua
+++ b/test/lua/unit/utf.lua
@@ -3,8 +3,8 @@
context("UTF8 check functions", function()
local ffi = require("ffi")
ffi.cdef[[
- void rspamd_str_lc_utf8 (char *str, unsigned int size);
- void rspamd_str_lc (char *str, unsigned int size);
+ unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
+ unsigned int rspamd_str_lc (char *str, unsigned int size);
char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen);
]]
@@ -19,8 +19,8 @@ context("UTF8 check functions", function()
test("UTF lowercase " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
- ffi.C.rspamd_str_lc_utf8(buf, #c[1])
- local s = ffi.string(buf)
+ local nlen = ffi.C.rspamd_str_lc_utf8(buf, #c[1])
+ local s = ffi.string(buf, nlen)
assert_equal(s, c[2])
end)
end
More information about the Commits mailing list