commit 58d60ea: [Rework] Use C++ version for unicode normalisation
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon May 17 15:35:06 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-17 16:34:35 +0100
URL: https://github.com/rspamd/rspamd/commit/58d60ea07f1478fe5fcc1d152d54237169e3bd43 (HEAD -> master)
[Rework] Use C++ version for unicode normalisation
---
src/libserver/html.c | 2 +-
src/libserver/url.h | 3 +-
src/libutil/cxx/utf8_util.cxx | 100 ++++++++++++++++++++++++++++++++
src/libutil/cxx/utf8_util.h | 17 ++++++
src/libutil/str_util.c | 129 ------------------------------------------
src/libutil/str_util.h | 16 ------
6 files changed, 120 insertions(+), 147 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 8d7b722a5..cfdd0acef 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -2667,7 +2667,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
}
}
- rspamd_normalise_unicode_inplace (pool, url->visible_part, &dlen);
+ rspamd_normalise_unicode_inplace (url->visible_part, &dlen);
}
static gboolean
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 72fce5f9e..4ace18f1a 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -6,6 +6,7 @@
#include "mem_pool.h"
#include "khash.h"
#include "fstring.h"
+#include "libutil/cxx/utf8_util.h"
#ifdef __cplusplus
extern "C" {
@@ -356,7 +357,7 @@ int rspamd_url_cmp_qsort(const void *u1, const void *u2);
#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
do { \
enum rspamd_normalise_result norm_res; \
- norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out)); \
+ norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \
if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \
url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \
} \
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
index f44d02671..6bca4b18e 100644
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -18,6 +18,8 @@
#include <unicode/utypes.h>
#include <unicode/utf8.h>
#include <unicode/uchar.h>
+#include <unicode/normalizer2.h>
+#include <unicode/schriter.h>
#include <utility>
#include <string>
@@ -98,3 +100,101 @@ TEST_SUITE("utf8 utils") {
}
+
+/**
+ * Normalises a UTF-8 string to NFKC and strips zero-width characters, writing
+ * the result back over the input buffer.
+ *
+ * The input is first decoded into a separate ICU UnicodeString, so the
+ * caller's buffer can be overwritten while that decoded copy is iterated.
+ *
+ * @param start buffer with the UTF-8 input; overwritten with the result
+ * @param len in: number of input bytes, also used as the output capacity;
+ *            out: number of bytes written to start
+ * @return RSPAMD_UNICODE_NORM_NORMAL (0) when no flag applies, otherwise a
+ *         bitwise OR of RSPAMD_UNICODE_NORM_UNNORMAL and
+ *         RSPAMD_UNICODE_NORM_ZERO_SPACES; RSPAMD_UNICODE_NORM_ERROR alone
+ *         when an ICU call fails or the result cannot be re-encoded within
+ *         the original length (in the latter case start holds a partial
+ *         result and *len the number of bytes written so far)
+ */
+enum rspamd_normalise_result
+rspamd_normalise_unicode_inplace(char *start, size_t *len)
+{
+    UErrorCode uc_err = U_ZERO_ERROR;
+    const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
+
+    /* The NFKC normaliser must be available before it is used below */
+    g_assert (U_SUCCESS (uc_err));
+
+    static icu::UnicodeSet zw_spaces{};
+
+    /* Populate the set once; freezing it doubles as the "initialised" marker */
+    if (!zw_spaces.isFrozen()) {
+        /* Add zw spaces to the set */
+        zw_spaces.add(0x200B); /* ZERO WIDTH SPACE */
+        zw_spaces.add(0x200C); /* ZERO WIDTH NON-JOINER */
+        zw_spaces.add(0x200D); /* ZERO WIDTH JOINER */
+        zw_spaces.add(0xFEFF); /* ZERO WIDTH NO-BREAK SPACE (was mistyped as 0xFEF) */
+        zw_spaces.add(0x00AD); /* SOFT HYPHEN */
+        zw_spaces.freeze();
+    }
+
+    int ret = RSPAMD_UNICODE_NORM_NORMAL;
+
+    auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
+    auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
+
+    if (!U_SUCCESS (uc_err)) {
+        return RSPAMD_UNICODE_NORM_ERROR;
+    }
+
+    /* Filter zero width spaces and push resulting string back */
+    const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
+        icu::StringCharacterIterator it{input};
+        size_t i = 0;
+
+        while(it.hasNext()) {
+            auto uc = it.next32PostInc();
+
+            if (zw_spaces.contains(uc)) {
+                /* Dropped from the output, only recorded in the flags */
+                ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+            }
+            else {
+                UBool err = 0;
+                /* The capacity is the original byte length: output may not grow */
+                U8_APPEND(start, i, *len, uc, err);
+
+                if (err) {
+                    /* Replaces any flags collected so far */
+                    ret = RSPAMD_UNICODE_NORM_ERROR;
+
+                    return i;
+                }
+            }
+        }
+
+        return i;
+    };
+
+    if (is_normal != UNORM_YES) {
+        /* Need to normalise */
+        ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+
+        auto normalised = nfkc_norm->normalize(uc_string, uc_err);
+
+        if (!U_SUCCESS (uc_err)) {
+            return RSPAMD_UNICODE_NORM_ERROR;
+        }
+
+        *len = filter_zw_spaces_and_push_back(normalised);
+    }
+    else {
+        /* Already NFKC, but zero-width characters may still need removing */
+        *len = filter_zw_spaces_and_push_back(uc_string);
+    }
+
+    return static_cast<enum rspamd_normalise_result>(ret);
+}
+
+TEST_SUITE("utf8 utils") {
+    TEST_CASE("utf8 normalise") {
+        /* One row per scenario: raw input, expected output, expected flag mask */
+        struct norm_case {
+            const char *input;
+            const char *expected;
+            int flags;
+        };
+
+        const norm_case cases[] = {
+            {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+            {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+            /* Zero width spaces */
+            {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+            /* Special case of diacritic */
+            {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+            /* Same with zw spaces */
+            {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
+                RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
+        };
+
+        for (const auto &tc : cases) {
+            /* Work on a private copy because normalisation rewrites the buffer */
+            std::string buf{tc.input};
+            auto out_len = buf.size();
+            auto res = rspamd_normalise_unicode_inplace(buf.data(), &out_len);
+
+            /* Shrink to the number of bytes the function reported */
+            buf.resize(out_len);
+            CHECK(buf == std::string(tc.expected));
+            CHECK(res == tc.flags);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
index 40bb53bf0..21add9bae 100644
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -34,6 +34,23 @@ extern "C" {
*/
char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
+/**
+ * Result of rspamd_normalise_unicode_inplace. The non-zero values are
+ * distinct bits, so a result may carry several of them OR-ed together.
+ */
+enum rspamd_normalise_result {
+ /* No flag set */
+ RSPAMD_UNICODE_NORM_NORMAL = 0,
+ /* The normaliser's quick check did not report the input as already normalised */
+ RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
+ /* At least one zero-width character was removed from the string */
+ RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
+ /* An ICU call failed or the result could not be written back */
+ RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
+ /*
+  * Set by the previous C implementation when converting back to UTF-8
+  * overflowed the buffer; NOTE(review): the C++ implementation added in
+  * utf8_util.cxx does not appear to set it - confirm whether it is still needed
+  */
+ RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
+};
+
+/**
+ * Gets a string in UTF8, normalises it to NFKC form and removes zero-width
+ * characters, writing the result back into the same buffer
+ * @param start buffer with the UTF8 input, overwritten with the result
+ * @param len input: length of the string in bytes; output: length of the result
+ * @return a combination of rspamd_normalise_result flags describing what was
+ * found in the string, or RSPAMD_UNICODE_NORM_ERROR on failure
+ */
+enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 00774d588..1e92c8e54 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -3020,135 +3020,6 @@ rspamd_get_unicode_normalizer (void)
#endif
}
-
-enum rspamd_normalise_result
-rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
- gsize *len)
-{
-#if U_ICU_VERSION_MAJOR_NUM >= 44
- UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *utf8_conv = rspamd_get_utf8_converter ();
- const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
- gint32 nsym, end;
- UChar *src = NULL, *dest = NULL;
- enum rspamd_normalise_result ret = 0;
- gboolean has_invisible = FALSE;
-
- /* We first need to convert data to UChars :( */
- src = g_malloc ((*len + 1) * sizeof (*src));
- nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
- start, *len, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
- u_errorName (uc_err));
- ret |= RSPAMD_UNICODE_NORM_ERROR;
- goto out;
- }
-
- /* We can now check if we need to decompose */
- end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
- u_errorName (uc_err));
- ret |= RSPAMD_UNICODE_NORM_ERROR;
- goto out;
- }
-
- for (gint32 i = 0; i < nsym; i ++) {
- if (IS_ZERO_WIDTH_SPACE (src[i])) {
- has_invisible = TRUE;
- break;
- }
- }
-
- uc_err = U_ZERO_ERROR;
-
- if (end != nsym) {
- /* No normalisation needed, but we may still have invisible spaces */
- /* We copy sub(src, 0, end) to dest and normalise the rest */
- ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
- dest = g_malloc (nsym * sizeof (*dest));
- memcpy (dest, src, end * sizeof (*dest));
- nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
- src + end, nsym - end, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
- msg_warn_pool_check ("cannot normalise URL: %s",
- u_errorName (uc_err));
- ret |= RSPAMD_UNICODE_NORM_ERROR;
- }
-
- goto out;
- }
- }
- else if (!has_invisible) {
- goto out;
- }
- else {
- dest = src;
- src = NULL;
- }
-
- if (has_invisible) {
- /* Also filter zero width spaces */
- gint32 new_len = 0;
- UChar *t = dest, *h = dest;
-
- ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
-
- for (gint32 i = 0; i < nsym; i ++) {
- if (!IS_ZERO_WIDTH_SPACE (*h)) {
- *t++ = *h++;
- new_len ++;
- }
- else {
- h ++;
- }
- }
-
- nsym = new_len;
- }
-
- /* We now convert it back to utf */
- nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
- " input length: %d chars, unicode length: %d utf16 symbols",
- u_errorName (uc_err), (gint)*len, (gint)nsym);
-
- if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
- ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
- }
- else {
- ret |= RSPAMD_UNICODE_NORM_ERROR;
- }
-
- goto out;
- }
-
- *len = nsym;
-
-out:
-
- if (src) {
- g_free (src);
- }
-
- if (dest) {
- g_free (dest);
- }
-
- return ret;
-#else
- /* Kill that with fire please */
- return FALSE;
-#endif
-}
-
gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
gsize *dst_len, enum rspamd_regexp_escape_flags flags)
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 427d6b94e..cfa37848f 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -475,23 +475,7 @@ struct UNormalizer2;
const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
-enum rspamd_normalise_result {
- RSPAMD_UNICODE_NORM_NORMAL = 0,
- RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
- RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
- RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
- RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
-};
-/**
- * Gets a string in UTF8 and normalises it to NFKC_Casefold form
- * @param pool optional memory pool used for logging purposes
- * @param start
- * @param len
- * @return TRUE if a string has been normalised
- */
-enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
- gchar *start, gsize *len);
enum rspamd_regexp_escape_flags {
RSPAMD_REGEXP_ESCAPE_ASCII = 0,
More information about the Commits
mailing list