commit 58d60ea: [Rework] Use C++ version for unicode normalisation

Mon May 17 15:35:06 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-05-17 16:34:35 +0100
URL: https://github.com/rspamd/rspamd/commit/58d60ea07f1478fe5fcc1d152d54237169e3bd43 (HEAD -> master)

[Rework] Use C++ version for unicode normalisation

---
 src/libserver/html.c          |   2 +-
 src/libserver/url.h           |   3 +-
 src/libutil/cxx/utf8_util.cxx | 100 ++++++++++++++++++++++++++++++++
 src/libutil/cxx/utf8_util.h   |  17 ++++++
 src/libutil/str_util.c        | 129 ------------------------------------------
 src/libutil/str_util.h        |  16 ------
 6 files changed, 120 insertions(+), 147 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 8d7b722a5..cfdd0acef 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -2667,7 +2667,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
 		}
 	}
 
-	rspamd_normalise_unicode_inplace (pool, url->visible_part, &dlen);
+	rspamd_normalise_unicode_inplace (url->visible_part, &dlen);
 }
 
 static gboolean
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 72fce5f9e..4ace18f1a 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -6,6 +6,7 @@
 #include "mem_pool.h"
 #include "khash.h"
 #include "fstring.h"
+#include "libutil/cxx/utf8_util.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -356,7 +357,7 @@ int rspamd_url_cmp_qsort(const void *u1, const void *u2);
 #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
   do {                                                                            \
      enum rspamd_normalise_result norm_res;                                       \
-     norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out));     \
+     norm_res = rspamd_normalise_unicode_inplace((input), (len_out));     \
      if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {                               \
        url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED;                             \
      }                                                                            \
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
index f44d02671..6bca4b18e 100644
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -18,6 +18,8 @@
 #include <unicode/utypes.h>
 #include <unicode/utf8.h>
 #include <unicode/uchar.h>
+#include <unicode/normalizer2.h>
+#include <unicode/schriter.h>
 #include <utility>
 #include <string>
 
@@ -98,3 +100,101 @@ TEST_SUITE("utf8 utils") {
 }
 
 
+
+enum rspamd_normalise_result
+rspamd_normalise_unicode_inplace(char *start, size_t *len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
+	static icu::UnicodeSet zw_spaces{};
+
+	if (!zw_spaces.isFrozen()) {
+		/* Filled on the first call only: invisible code points that are stripped from the output */
+		zw_spaces.add(0x200B); /* ZERO WIDTH SPACE */
+		zw_spaces.add(0x200C); /* ZERO WIDTH NON-JOINER */
+		zw_spaces.add(0x200D); /* ZERO WIDTH JOINER */
+		zw_spaces.add(0xFEFF); /* ZERO WIDTH NO-BREAK SPACE (BOM) */
+		zw_spaces.add(0x00AD); /* SOFT HYPHEN */
+		zw_spaces.freeze();
+	}
+
+	int ret = RSPAMD_UNICODE_NORM_NORMAL;
+
+	g_assert (U_SUCCESS (uc_err)); /* the NFKC normaliser instance must be available */
+
+	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len)); /* separate copy of the input */
+	auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		return RSPAMD_UNICODE_NORM_ERROR;
+	}
+
+	/* Re-encodes input as UTF-8 over start, skipping zero width code points; returns the bytes written */
+	const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
+		icu::StringCharacterIterator it{input};
+		size_t i = 0;
+
+		while(it.hasNext()) {
+			auto uc = it.next32PostInc();
+
+			if (zw_spaces.contains(uc)) {
+				ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+			}
+			else {
+				UBool err = 0;
+				U8_APPEND(start, i, *len, uc, err); /* capacity is the original input length in bytes */
+
+				if (err) {
+					ret = RSPAMD_UNICODE_NORM_ERROR; /* replaces any flags collected so far */
+
+					return i;
+				}
+			}
+		}
+
+		return i;
+	};
+
+	if (is_normal != UNORM_YES) {
+		/* Not known to be in NFKC form (UNORM_NO or UNORM_MAYBE): normalise before filtering */
+		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+
+		auto normalised = nfkc_norm->normalize(uc_string, uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			return RSPAMD_UNICODE_NORM_ERROR;
+		}
+
+		*len = filter_zw_spaces_and_push_back(normalised);
+	}
+	else {
+		*len = filter_zw_spaces_and_push_back(uc_string);
+	}
+
+	return static_cast<enum rspamd_normalise_result>(ret);
+}
+
+TEST_SUITE("utf8 utils") {
+	TEST_CASE("utf8 normalise") {
+		std::tuple<const char *, const char *, int> cases[] = { /* input, expected output, expected result flags */
+				{"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+				{"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+				/* Zero width spaces (U+200B) are removed and reported */
+				{"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+				/* Combining diacritics after a space: input is reported as not normalised */
+				{"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+				/* Same input with zero width characters added: both flags are expected */
+				{"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
+	 							RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
+		};
+
+		for (const auto &c : cases) {
+			std::string cpy{std::get<0>(c)}; /* mutable copy: the function rewrites its buffer in place */
+			auto ns = cpy.size();
+			auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
+			cpy.resize(ns); /* keep only the bytes the function reported back */
+			CHECK(cpy == std::string(std::get<1>(c)));
+			CHECK(res == std::get<2>(c));
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
index 40bb53bf0..21add9bae 100644
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -34,6 +34,23 @@ extern "C" {
  */
 char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
 
+enum rspamd_normalise_result { /* bit flags, may be combined in a single result */
+	RSPAMD_UNICODE_NORM_NORMAL = 0, /* no flag set */
+	RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), /* input was not in normal form */
+	RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), /* zero width characters were found */
+	RSPAMD_UNICODE_NORM_ERROR = (1 << 2), /* normalisation failed */
+	RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) /* normalised output did not fit into the buffer */
+};
+
+/**
+ * Normalises a UTF-8 string in place to NFKC form and strips zero width characters
+ * @param start UTF-8 buffer that is rewritten in place
+ * @param len in: byte length of the input, out: number of bytes written back
+ * @return bitmask of rspamd_normalise_result flags describing the input;
+ * RSPAMD_UNICODE_NORM_ERROR when ICU fails or the output cannot be written back
+ */
+enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 00774d588..1e92c8e54 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -3020,135 +3020,6 @@ rspamd_get_unicode_normalizer (void)
 #endif
 }
 
-
-enum rspamd_normalise_result
-rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
-		gsize *len)
-{
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-	UErrorCode uc_err = U_ZERO_ERROR;
-	UConverter *utf8_conv = rspamd_get_utf8_converter ();
-	const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
-	gint32 nsym, end;
-	UChar *src = NULL, *dest = NULL;
-	enum rspamd_normalise_result ret = 0;
-	gboolean has_invisible = FALSE;
-
-	/* We first need to convert data to UChars :( */
-	src = g_malloc ((*len + 1) * sizeof (*src));
-	nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
-			start, *len, &uc_err);
-
-	if (!U_SUCCESS (uc_err)) {
-		msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
-				u_errorName (uc_err));
-		ret |= RSPAMD_UNICODE_NORM_ERROR;
-		goto out;
-	}
-
-	/* We can now check if we need to decompose */
-	end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
-
-	if (!U_SUCCESS (uc_err)) {
-		msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
-				u_errorName (uc_err));
-		ret |= RSPAMD_UNICODE_NORM_ERROR;
-		goto out;
-	}
-
-	for (gint32 i = 0; i < nsym; i ++) {
-		if (IS_ZERO_WIDTH_SPACE (src[i])) {
-			has_invisible = TRUE;
-			break;
-		}
-	}
-
-	uc_err = U_ZERO_ERROR;
-
-	if (end != nsym) {
-		/* No normalisation needed, but we may still have invisible spaces */
-		/* We copy sub(src, 0, end) to dest and normalise the rest */
-		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
-		dest = g_malloc (nsym * sizeof (*dest));
-		memcpy (dest, src, end * sizeof (*dest));
-		nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
-				src + end, nsym - end, &uc_err);
-
-		if (!U_SUCCESS (uc_err)) {
-			if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
-				msg_warn_pool_check ("cannot normalise URL: %s",
-						u_errorName (uc_err));
-				ret |= RSPAMD_UNICODE_NORM_ERROR;
-			}
-
-			goto out;
-		}
-	}
-	else if (!has_invisible) {
-		goto out;
-	}
-	else {
-		dest = src;
-		src = NULL;
-	}
-
-	if (has_invisible) {
-		/* Also filter zero width spaces */
-		gint32 new_len = 0;
-		UChar *t = dest, *h = dest;
-
-		ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
-
-		for (gint32 i = 0; i < nsym; i ++) {
-			if (!IS_ZERO_WIDTH_SPACE (*h)) {
-				*t++ = *h++;
-				new_len ++;
-			}
-			else {
-				h ++;
-			}
-		}
-
-		nsym = new_len;
-	}
-
-	/* We now convert it back to utf */
-	nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
-
-	if (!U_SUCCESS (uc_err)) {
-		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
-					   " input length: %d chars, unicode length: %d utf16 symbols",
-				u_errorName (uc_err), (gint)*len, (gint)nsym);
-
-		if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
-			ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
-		}
-		else {
-			ret |= RSPAMD_UNICODE_NORM_ERROR;
-		}
-
-		goto out;
-	}
-
-	*len = nsym;
-
-out:
-
-	if (src) {
-		g_free (src);
-	}
-
-	if (dest) {
-		g_free (dest);
-	}
-
-	return ret;
-#else
-	/* Kill that with fire please */
-	return FALSE;
-#endif
-}
-
 gchar *
 rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 		gsize *dst_len, enum rspamd_regexp_escape_flags flags)
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 427d6b94e..cfa37848f 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -475,23 +475,7 @@ struct UNormalizer2;
 
 const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
 
-enum rspamd_normalise_result {
-	RSPAMD_UNICODE_NORM_NORMAL = 0,
-	RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
-	RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
-	RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
-	RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
-};
 
-/**
- * Gets a string in UTF8 and normalises it to NFKC_Casefold form
- * @param pool optional memory pool used for logging purposes
- * @param start
- * @param len
- * @return TRUE if a string has been normalised
- */
-enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
-															   gchar *start, gsize *len);
 
 enum rspamd_regexp_escape_flags {
 	RSPAMD_REGEXP_ESCAPE_ASCII = 0,