commit 803a906: [Project] Use own utf8 validation instead of glib

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Nov 15 18:56:08 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-11-15 16:40:54 +0000
URL: https://github.com/rspamd/rspamd/commit/803a9062065ccf7a3dde90db5adb872e86d4be5b

[Project] Use own utf8 validation instead of glib

---
 src/libmime/mime_encoding.c | 42 +++++++++++++++++++++++-------------------
 src/libserver/protocol.c    | 10 ++++------
 src/libserver/re_cache.c    |  4 +++-
 src/libutil/map_helpers.c   |  5 +++--
 src/libutil/str_util.c      |  4 +++-
 src/lua/lua_util.c          | 28 ++++++++++++++++++++++++++--
 6 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 0fbba54b2..942358d11 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -22,6 +22,7 @@
 #include "libserver/task.h"
 #include "mime_encoding.h"
 #include "message.h"
+#include "contrib/fastutf8/fastutf8.h"
 #include <unicode/ucnv.h>
 #include <unicode/ucsdet.h>
 #if U_ICU_VERSION_MAJOR_NUM >= 44
@@ -468,36 +469,39 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
 void
 rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
 {
-	const gchar *end, *p;
-	gsize remain = len;
+	gchar *p, *end;
+	goffset err_offset;
+	UChar32 uc = 0;
 
 	/* Now we validate input and replace bad characters with '?' symbol */
 	p = in;
+	end = in + len;
 
-	while (remain > 0 && !g_utf8_validate (p, remain, &end)) {
-		gchar *valid;
+	/* Validator returns 0 when valid, so a failure is a 1-based offset relative to p */
+	while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
+		err_offset--; /* convert to a 0-based index into p */
+		goffset cur_offset = err_offset;
 
-		if (end >= in + len) {
-			if (p < in + len) {
-				memset ((gchar *)p, '?', (in + len) - p);
-			}
-			break;
-		}
+		while (cur_offset < len) {
+			goffset tmp = cur_offset; /* start of the sequence decoded by U8_NEXT below */
 
-		valid = g_utf8_find_next_char (end, in + len);
+			U8_NEXT (p, cur_offset, len, uc);
 
-		if (!valid) {
-			valid = in + len;
+			if (uc > 0) {
+				/* Next valid character starts at tmp: replace bytes [err_offset, tmp) with `?` */
+				memset (p + err_offset, '?', tmp - err_offset);
+				break;
+			}
 		}
 
-		if (valid > end) {
-			memset ((gchar *)end, '?', valid - end);
-			p = valid;
-			remain = (in + len) - p;
-		}
-		else {
+		if (uc < 0) {
+			/* No valid character follows the error: fill till the end */
+			memset (p + err_offset, '?', len - err_offset);
 			break;
 		}
+
+		p += cur_offset; /* cur_offset is relative to p, not to in */
+		len = end - p;
 	}
 }
 
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 0786f4860..c457fc455 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -26,6 +26,7 @@
 #include "unix-std.h"
 #include "protocol_internal.h"
 #include "libserver/mempool_vars_internal.h"
+#include "contrib/fastutf8/fastutf8.h"
 #include "task.h"
 #include <math.h>
 
@@ -922,16 +923,13 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 				return;
 			}
 
-			const gchar *end = NULL;
+			goffset err_offset;
 
-			if (g_utf8_validate (url->host, url->hostlen, &end)) {
+			if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen) == 0)) {
 				obj = ucl_object_fromlstring (url->host, url->hostlen);
 			}
-			else if (end - url->host > 0) {
-				obj = ucl_object_fromlstring (url->host, end - url->host);
-			}
 			else {
-				return;
+				obj = ucl_object_fromlstring (url->host, err_offset);
 			}
 		}
 		else {
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index a9fc2270b..a495dfdd5 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -41,6 +41,8 @@
 #include <pcre2.h>
 #endif
 
+#include "contrib/fastutf8/fastutf8.h"
+
 #ifdef HAVE_SYS_WAIT_H
 #include <sys/wait.h>
 #endif
@@ -988,7 +990,7 @@ rspamd_re_cache_process_headers_list (struct rspamd_task *task,
 			in = (const guchar *)cur->value;
 			lenvec[i] = strlen (cur->value);
 
-			if (!g_utf8_validate (in, lenvec[i], NULL)) {
+			if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) {
 				raw = TRUE;
 			}
 		}
diff --git a/src/libutil/map_helpers.c b/src/libutil/map_helpers.c
index a9bd8d70e..d67e2fc4d 100644
--- a/src/libutil/map_helpers.c
+++ b/src/libutil/map_helpers.c
@@ -20,6 +20,7 @@
 #include "radix.h"
 #include "rspamd.h"
 #include "cryptobox.h"
+#include "contrib/fastutf8/fastutf8.h"
 
 #ifdef WITH_HYPERSCAN
 #include "hs.h"
@@ -1189,7 +1190,7 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map,
 	}
 
 	if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
-		if (g_utf8_validate (in, len, NULL)) {
+		if (rspamd_fast_utf8_validate (in, len) == 0) {
 			validated = TRUE;
 		}
 	}
@@ -1280,7 +1281,7 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map,
 	g_assert (in != NULL);
 
 	if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
-		if (g_utf8_validate (in, len, NULL)) {
+		if (rspamd_fast_utf8_validate (in, len) == 0) {
 			validated = TRUE;
 		}
 	}
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 866ef52d8..90924f8d1 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -27,6 +27,8 @@
 #endif
 #include <math.h>
 
+#include "contrib/fastutf8/fastutf8.h"
+
 const guchar lc_map[256] = {
 		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
@@ -2932,7 +2934,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 	}
 
 	if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
-		if (!g_utf8_validate (pattern, slen, NULL)) {
+		if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
 			tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
 		}
 	}
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1ea8d380c..ef9c3105e 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -34,6 +34,7 @@
 
 #include "unicode/uspoof.h"
 #include "unicode/uscript.h"
+#include "contrib/fastutf8/fastutf8.h"
 
 /***
  * @module rspamd_util
@@ -2855,10 +2856,33 @@ lua_util_is_valid_utf8 (lua_State *L)
 	const gchar *str;
 	gsize len;
 
-	str = lua_tolstring (L, 1, &len);
+	if (lua_isstring (L, 1)) {
+		str = lua_tolstring (L, 1, &len);
+	}
+	else {
+		struct rspamd_lua_text *t = lua_check_text (L, 1);
+
+		if (t) {
+			str = t->start;
+			len = t->len;
+		}
+		else {
+			return luaL_error (L, "invalid arguments (text expected)");
+		}
+	}
 
 	if (str) {
-		lua_pushboolean (L, g_utf8_validate (str, len, NULL));
+		goffset error_offset = rspamd_fast_utf8_validate (str, len);
+
+		if (error_offset == 0) {
+			lua_pushboolean (L, true);
+		}
+		else {
+			lua_pushboolean (L, false);
+			lua_pushnumber (L, error_offset);
+
+			return 2;
+		}
 	}
 	else {
 		return luaL_error (L, "invalid arguments");


More information about the Commits mailing list