commit 06ba232: [Feature] Add rspamd_utf8_strcmp utility
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Aug 2 20:35:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-08-02 17:00:14 +0100
URL: https://github.com/rspamd/rspamd/commit/06ba232b45946fc52c5d812551ac50c2343e3b99
[Feature] Add rspamd_utf8_strcmp utility
---
src/libutil/cxx/utf8_util.cxx | 173 +++++++++++++++++++++++++++++-------------
src/libutil/cxx/utf8_util.h | 9 +++
2 files changed, 130 insertions(+), 52 deletions(-)
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
index e42ef917f..8b99d1f35 100644
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -20,9 +20,11 @@
#include <unicode/uchar.h>
#include <unicode/normalizer2.h>
#include <unicode/schriter.h>
+#include <unicode/coll.h>
#include <utility>
#include <tuple>
#include <string>
+#include <limits>
#include "utf8_util.h"
#include "str_util.h"
@@ -73,35 +75,6 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len)
return ret;
}
-TEST_SUITE("utf8 utils") {
- TEST_CASE("utf8 trim") {
- std::pair<const char *, const char *> cases[] = {
- {" \u200B""abc ", "abc"},
- {" ", ""},
- {" a", "a"},
- {"a ", "a"},
- {"a a", "a a"},
- {"abc", "abc"},
- {"a ", "a"},
- {" abc ", "abc"},
- {" abc ", "abc"},
- {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
- {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
- {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"},
- };
-
- for (const auto &c : cases) {
- std::string cpy{c.first};
- auto ns = cpy.size();
- auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
- std::string res{nstart, ns};
- CHECK(res == std::string{c.second});
- }
- }
-}
-
-
-
enum rspamd_normalise_result
rspamd_normalise_unicode_inplace(char *start, size_t *len)
{
@@ -184,30 +157,126 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
return static_cast<enum rspamd_normalise_result>(ret);
}
+/**
+ * Owns the ICU collator shared by rspamd_utf8_strcmp.
+ *
+ * The collator is created for the English locale and switched to PRIMARY
+ * strength. If ICU cannot create it, `collator` is left as nullptr instead of
+ * being dereferenced, so users of `collate_storage` must check it.
+ */
+struct rspamd_icu_collate_storage {
+	icu::Collator* collator = nullptr;
+
+	rspamd_icu_collate_storage() {
+		UErrorCode success = U_ZERO_ERROR;
+		collator = icu::Collator::createInstance(icu::Locale::getEnglish(), success);
+
+		if (U_FAILURE(success) || collator == nullptr) {
+			/* Creation failed: never keep (or touch) a half-made collator */
+			delete collator;
+			collator = nullptr;
+			return;
+		}
+
+		/* Ignore all differences except the primary ones */
+		collator->setStrength(icu::Collator::PRIMARY);
+	}
+
+	/* The raw pointer is owned by this object: a copy would delete it twice */
+	rspamd_icu_collate_storage(const rspamd_icu_collate_storage &) = delete;
+	rspamd_icu_collate_storage &operator=(const rspamd_icu_collate_storage &) = delete;
+
+	~rspamd_icu_collate_storage() {
+		/* Deleting nullptr is a no-op, so no check is required */
+		delete collator;
+	}
+};
+
+static rspamd_icu_collate_storage collate_storage;
+
+/**
+ * Compare the first `n` bytes of two UTF-8 strings using the shared ICU collator.
+ *
+ * Falls back to g_ascii_strncasecmp when `n` does not fit the length type
+ * used by ICU, when the collator is not available, or when ICU reports an
+ * error for the comparison.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param n length used for both s1 and s2
+ * @return 0 if the strings are equal for the collator, 1 if s1 is greater,
+ * -1 if s1 is less; the g_ascii_strncasecmp result when the fallback is used
+ */
+int
+rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
+{
+	if (n >= std::numeric_limits<int>::max() || collate_storage.collator == nullptr) {
+		/*
+		 * It's hard to say what to do here... But libicu wants int, so we fall
+		 * back to g_ascii_strncasecmp which can deal with size_t. The same
+		 * fallback is used if the collator could not be created.
+		 */
+		return g_ascii_strncasecmp(s1, s2, n);
+	}
+
+	UErrorCode success = U_ZERO_ERROR;
+	/* Safe: n is strictly below the int maximum at this point */
+	const auto len = static_cast<int32_t>(n);
+	const auto res = collate_storage.collator->compareUTF8({s1, len}, {s2, len},
+			success);
+
+	if (U_FAILURE(success)) {
+		/* Do not report the outcome of a failed comparison as a real result */
+		return g_ascii_strncasecmp(s1, s2, n);
+	}
+
+	switch (res) {
+	case UCOL_EQUAL:
+		return 0;
+	case UCOL_GREATER:
+		return 1;
+	case UCOL_LESS:
+	default:
+		return -1;
+	}
+}
+
TEST_SUITE("utf8 utils") {
- TEST_CASE("utf8 normalise") {
- std::tuple<const char *, const char *, int> cases[] = {
- {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
- {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
- /* Zero width spaces */
- {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
- /* Special case of diacritic */
- {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
- /* Same with zw spaces */
- {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
- RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
- /* Buffer overflow case */
- {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������",
- RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
- };
-
- for (const auto &c : cases) {
- std::string cpy{std::get<0>(c)};
- auto ns = cpy.size();
- auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
- cpy.resize(ns);
- CHECK(cpy == std::string(std::get<1>(c)));
- CHECK(res == std::get<2>(c));
+/*
+ * Table-driven test for rspamd_normalise_unicode_inplace.
+ * Each case is {input, expected text after normalisation, expected result flags}.
+ * The input is copied into a std::string, normalised in place, resized to the
+ * length written back by the function, and then both text and flags are checked.
+ */
+TEST_CASE("utf8 normalise") {
+ std::tuple<const char *, const char *, int> cases[] = {
+ {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+ {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+ /* Zero width spaces */
+ {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+ /* Special case of diacritic */
+ {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+ /* Same with zw spaces */
+ {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
+ RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
+ /* Buffer overflow case */
+ {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������",
+ RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
+ };
+
+ for (const auto &c : cases) {
+ std::string cpy{std::get<0>(c)};
+ auto ns = cpy.size();
+ auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
+ cpy.resize(ns);
+ CHECK(cpy == std::string(std::get<1>(c)));
+ CHECK(res == std::get<2>(c));
+ }
+}
+
+/*
+ * Table-driven test for rspamd_string_unicode_trim_inplace.
+ * Each case is {input, expected text after trimming}. The function gets a
+ * mutable copy of the input plus its length, and the result is rebuilt from
+ * the returned start pointer and the length written back through `ns`.
+ */
+TEST_CASE("utf8 trim") {
+ std::pair<const char *, const char *> cases[] = {
+ {" \u200B""abc ", "abc"},
+ {" ", ""},
+ {" a", "a"},
+ {"a ", "a"},
+ {"a a", "a a"},
+ {"abc", "abc"},
+ {"a ", "a"},
+ {" abc ", "abc"},
+ {" abc ", "abc"},
+ {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+ {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+ {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"},
+ };
+
+ for (const auto &c : cases) {
+ std::string cpy{c.first};
+ auto ns = cpy.size();
+ auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
+ std::string res{nstart, ns};
+ CHECK(res == std::string{c.second});
+ }
+}
+
+
+/*
+ * Table-driven test for rspamd_utf8_strcmp.
+ * Each case is {s1, s2, n, expected}; n == -1 means "use the shorter of the
+ * two strlen() values" as the compared length.
+ */
+TEST_CASE("utf8 strcmp") {
+	std::tuple<const char *, const char *, int, int> cases[] = {
+		{"abc", "abc", -1, 0},
+		{"", "", -1, 0},
+		{"aBc", "AbC", -1, 0},
+		{"abc", "ab", 2, 0},
+		{"теСт", "ТесТ", -1, 0},
+		{"теСт", "Тезт", 4, 0},
+		{"теСт", "Тезт", -1, 1},
+		{"abc", "ABD", -1, -1},
+		{"\0a\0", "\0a\1", 2, 0},
+		{"\0a\0", "\0b\1", 3, -1},
+	};
+
+	/*
+	 * Every subcase is declared on the same source line, so its name is the
+	 * only thing that tells the rows apart. Names built from s1/s2 alone
+	 * collide (same strings with a different n, or strings starting with
+	 * '\0' that truncate the name), hence the row index is part of the name.
+	 */
+	int case_idx = 0;
+
+	for (const auto &c : cases) {
+		auto [s1, s2, n, expected] = c;
+		if (n == -1) {
+			n = MIN(strlen(s1), strlen(s2));
+		}
+		const auto subcase_name = std::string("test case ") +
+				std::to_string(case_idx++) + ": " + s1 + " <=> " + s2;
+		SUBCASE(subcase_name.c_str()) {
+			auto ret = rspamd_utf8_strcmp(s1, s2, n);
+			CHECK(ret == expected);
}
}
+}
}
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
index 242e03f00..28bd6a144 100644
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -51,6 +51,15 @@ enum rspamd_normalise_result {
*/
enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
+/**
+ * Compare two strings using libicu collator
+ * The collator is set to primary strength, so e.g. "aBc" and "AbC" compare equal
+ * @param s1 first string to compare
+ * @param s2 second string to compare
+ * @param n length passed to the collator for both s1 and s2 (presumably a byte count, so both buffers should hold at least n bytes - confirm with callers)
+ * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2.
+ */
+int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n);
+
#ifdef __cplusplus
}
#endif
More information about the Commits
mailing list