commit 803a906: [Project] Use own utf8 validation instead of glib
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Nov 15 18:56:08 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-11-15 16:40:54 +0000
URL: https://github.com/rspamd/rspamd/commit/803a9062065ccf7a3dde90db5adb872e86d4be5b
[Project] Use own utf8 validation instead of glib
---
src/libmime/mime_encoding.c | 42 +++++++++++++++++++++++-------------------
src/libserver/protocol.c | 10 ++++------
src/libserver/re_cache.c | 4 +++-
src/libutil/map_helpers.c | 5 +++--
src/libutil/str_util.c | 4 +++-
src/lua/lua_util.c | 28 ++++++++++++++++++++++++++--
6 files changed, 62 insertions(+), 31 deletions(-)
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 0fbba54b2..942358d11 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -22,6 +22,7 @@
#include "libserver/task.h"
#include "mime_encoding.h"
#include "message.h"
+#include "contrib/fastutf8/fastutf8.h"
#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
#if U_ICU_VERSION_MAJOR_NUM >= 44
@@ -468,36 +469,39 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
void
rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
{
- const gchar *end, *p;
- gsize remain = len;
+ gchar *p, *end; /* p: start of the not yet validated tail, end: one past the buffer */
+ goffset err_offset; /* offset of the first bad byte, relative to p */
+ UChar32 uc = 0;
/* Now we validate input and replace bad characters with '?' symbol */
p = in;
+ end = in + len;
- while (remain > 0 && !g_utf8_validate (p, remain, &end)) {
- gchar *valid;
+ while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
+ goffset cur_offset = --err_offset; /* 0 means "valid", so reported offsets are 1-based */
- if (end >= in + len) {
- if (p < in + len) {
- memset ((gchar *)p, '?', (in + len) - p);
- }
- break;
- }
+ while (cur_offset < len) {
+ goffset tmp = cur_offset; /* start of the sequence U8_NEXT is about to decode */
- valid = g_utf8_find_next_char (end, in + len);
+ U8_NEXT (p, cur_offset, len, uc); /* offsets and len are relative to p, not in */
- if (!valid) {
- valid = in + len;
+ if (uc > 0) {
+ /* Valid character found at tmp: replace the bad run [err_offset, tmp) with `?` */
+ memset (p + err_offset, '?',
+ tmp - err_offset);
+ break;
+ }
}
- if (valid > end) {
- memset ((gchar *)end, '?', valid - end);
- p = valid;
- remain = (in + len) - p;
- }
- else {
+ if (uc < 0) {
+ /* No valid character follows: replace everything up to the end */
+ memset (p + err_offset, '?',
+ len - err_offset);
break;
}
+
+ p += cur_offset; /* resume validation right after the character just accepted */
+ len = end - p;
}
}
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 0786f4860..c457fc455 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -26,6 +26,7 @@
#include "unix-std.h"
#include "protocol_internal.h"
#include "libserver/mempool_vars_internal.h"
+#include "contrib/fastutf8/fastutf8.h"
#include "task.h"
#include <math.h>
@@ -922,16 +923,13 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
return;
}
- const gchar *end = NULL;
+ goffset err_offset; /* 0 if the host is valid UTF-8, else 1-based offset of the first bad byte */
- if (g_utf8_validate (url->host, url->hostlen, &end)) {
+ if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) {
obj = ucl_object_fromlstring (url->host, url->hostlen);
}
- else if (end - url->host > 0) {
- obj = ucl_object_fromlstring (url->host, end - url->host);
- }
else {
- return;
+ obj = ucl_object_fromlstring (url->host, err_offset - 1); /* keep only the valid prefix */
}
}
else {
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index a9fc2270b..a495dfdd5 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -41,6 +41,8 @@
#include <pcre2.h>
#endif
+#include "contrib/fastutf8/fastutf8.h"
+
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
@@ -988,7 +990,7 @@ rspamd_re_cache_process_headers_list (struct rspamd_task *task,
in = (const guchar *)cur->value;
lenvec[i] = strlen (cur->value);
- if (!g_utf8_validate (in, lenvec[i], NULL)) {
+ if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) {
raw = TRUE;
}
}
diff --git a/src/libutil/map_helpers.c b/src/libutil/map_helpers.c
index a9bd8d70e..d67e2fc4d 100644
--- a/src/libutil/map_helpers.c
+++ b/src/libutil/map_helpers.c
@@ -20,6 +20,7 @@
#include "radix.h"
#include "rspamd.h"
#include "cryptobox.h"
+#include "contrib/fastutf8/fastutf8.h"
#ifdef WITH_HYPERSCAN
#include "hs.h"
@@ -1189,7 +1190,7 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map,
}
if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
- if (g_utf8_validate (in, len, NULL)) {
+ if (rspamd_fast_utf8_validate (in, len) == 0) {
validated = TRUE;
}
}
@@ -1280,7 +1281,7 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map,
g_assert (in != NULL);
if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
- if (g_utf8_validate (in, len, NULL)) {
+ if (rspamd_fast_utf8_validate (in, len) == 0) {
validated = TRUE;
}
}
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 866ef52d8..90924f8d1 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -27,6 +27,8 @@
#endif
#include <math.h>
+#include "contrib/fastutf8/fastutf8.h"
+
const guchar lc_map[256] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
@@ -2932,7 +2934,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
}
if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
- if (!g_utf8_validate (pattern, slen, NULL)) {
+ if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
}
}
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1ea8d380c..ef9c3105e 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -34,6 +34,7 @@
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
+#include "contrib/fastutf8/fastutf8.h"
/***
* @module rspamd_util
@@ -2855,10 +2856,33 @@ lua_util_is_valid_utf8 (lua_State *L)
const gchar *str;
gsize len;
- str = lua_tolstring (L, 1, &len);
+ if (lua_isstring (L, 1)) {
+ str = lua_tolstring (L, 1, &len);
+ }
+ else {
+ struct rspamd_lua_text *t = lua_check_text (L, 1);
+
+ if (t) {
+ str = t->start;
+ len = t->len;
+ }
+ else {
+ return luaL_error (L, "invalid arguments (text expected)");
+ }
+ }
if (str) {
- lua_pushboolean (L, g_utf8_validate (str, len, NULL));
+ goffset error_offset = rspamd_fast_utf8_validate (str, len);
+
+ if (error_offset == 0) {
+ lua_pushboolean (L, true);
+ }
+ else {
+ lua_pushboolean (L, false);
+ lua_pushnumber (L, error_offset);
+
+ return 2;
+ }
}
else {
return luaL_error (L, "invalid arguments");
More information about the Commits mailing list