commit 06ba232: [Feature] Add rspamd_utf8_strcmp utility

Mon Aug 2 20:35:04 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-08-02 17:00:14 +0100
URL: https://github.com/rspamd/rspamd/commit/06ba232b45946fc52c5d812551ac50c2343e3b99

[Feature] Add rspamd_utf8_strcmp utility

---
 src/libutil/cxx/utf8_util.cxx | 173 +++++++++++++++++++++++++++++-------------
 src/libutil/cxx/utf8_util.h   |   9 +++
 2 files changed, 130 insertions(+), 52 deletions(-)

diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
index e42ef917f..8b99d1f35 100644
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -20,9 +20,11 @@
 #include <unicode/uchar.h>
 #include <unicode/normalizer2.h>
 #include <unicode/schriter.h>
+#include <unicode/coll.h>
 #include <utility>
 #include <tuple>
 #include <string>
+#include <limits>
 
 #include "utf8_util.h"
 #include "str_util.h"
@@ -73,35 +75,6 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len)
 	return ret;
 }
 
-TEST_SUITE("utf8 utils") {
-	TEST_CASE("utf8 trim") {
-		std::pair<const char *, const char *> cases[] = {
-				{" \u200B""abc ", "abc"},
-				{"   ",  ""},
-				{"   a", "a"},
-				{"a   ", "a"},
-				{"a a",  "a a"},
-				{"abc",  "abc"},
-				{"a ", "a"},
-				{"   abc      ", "abc"},
-				{" abc ", "abc"},
-				{" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
-				{" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
-				{" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
-		};
-
-		for (const auto &c : cases) {
-			std::string cpy{c.first};
-			auto ns = cpy.size();
-			auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
-			std::string res{nstart, ns};
-			CHECK(res == std::string{c.second});
-		}
-	}
-}
-
-
-
 enum rspamd_normalise_result
 rspamd_normalise_unicode_inplace(char *start, size_t *len)
 {
@@ -184,30 +157,126 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
 	return static_cast<enum rspamd_normalise_result>(ret);
 }
 
+struct rspamd_icu_collate_storage { /* owns the ICU collator shared by rspamd_utf8_strcmp */
+	icu::Collator* collator = nullptr;
+	rspamd_icu_collate_storage() {
+		UErrorCode uc_err = U_ZERO_ERROR;
+		collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);
+		if (U_FAILURE(uc_err) || collator == nullptr) { /* stop here instead of dereferencing null below */
+			g_error("fatal error: cannot init libicu collation engine: %s", u_errorName(uc_err));
+		}
+		/* Primary strength: ignore all differences except the base characters themselves */
+		collator->setStrength(icu::Collator::PRIMARY);
+	}
+	rspamd_icu_collate_storage(const rspamd_icu_collate_storage &) = delete; /* a copy would delete twice */
+	rspamd_icu_collate_storage &operator=(const rspamd_icu_collate_storage &) = delete;
+	~rspamd_icu_collate_storage() { delete collator; }
+};
+
+static rspamd_icu_collate_storage collate_storage; /* single file-local instance; its constructor creates the collator */
+
+/* Compares the first n bytes of s1 and s2 with the shared ICU collator; yields -1, 0 or 1 on that path */
+int
+rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
+{
+	if (n >= static_cast<gsize>(std::numeric_limits<int>::max())) {
+		/* libicu wants int lengths, so fall back to g_ascii_strncasecmp which can deal with gsize */
+		return g_ascii_strncasecmp(s1, s2, n);
+	}
+
+	const auto len = static_cast<int>(n);
+	UErrorCode uc_err = U_ZERO_ERROR;
+	auto res = collate_storage.collator->compareUTF8({s1, len}, {s2, len}, uc_err);
+	if (U_FAILURE(uc_err)) {
+		/* Do not trust the collator result after an error: use the ASCII comparison instead */
+		return g_ascii_strncasecmp(s1, s2, n);
+	}
+	switch (res) {
+	case UCOL_EQUAL:
+		return 0;
+	case UCOL_GREATER:
+		return 1;
+	default: /* UCOL_LESS */
+		return -1;
+	}
+}
+
 TEST_SUITE("utf8 utils") {
-	TEST_CASE("utf8 normalise") {
-		std::tuple<const char *, const char *, int> cases[] = {
-				{"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
-				{"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
-				/* Zero width spaces */
-				{"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
-				/* Special case of diacritic */
-				{"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
-				/* Same with zw spaces */
-				{"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
-	 							RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
-				/* Buffer overflow case */
-				{"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������",
-	 							RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
-		};
-
-		for (const auto &c : cases) {
-			std::string cpy{std::get<0>(c)};
-			auto ns = cpy.size();
-			auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
-			cpy.resize(ns);
-			CHECK(cpy == std::string(std::get<1>(c)));
-			CHECK(res == std::get<2>(c));
+TEST_CASE("utf8 normalise") { /* table-driven checks for rspamd_normalise_unicode_inplace */
+	std::tuple<const char *, const char *, int> cases[] = { /* input, expected text, expected result flags */
+			{"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+			{"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+			/* Zero width spaces */
+			{"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			/* Special case of diacritic */
+			{"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+			/* Same with zw spaces */
+			{"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
+					RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			/* Buffer overflow case */
+			{"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������",
+					RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
+	};
+
+	for (const auto &c : cases) {
+		std::string cpy{std::get<0>(c)}; /* mutable copy handed to the in-place call */
+		auto ns = cpy.size();
+		auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
+		cpy.resize(ns); /* keep only the length written back through &ns */
+		CHECK(cpy == std::string(std::get<1>(c)));
+		CHECK(res == std::get<2>(c));
+	}
+}
+
+TEST_CASE("utf8 trim") { /* table-driven checks for rspamd_string_unicode_trim_inplace */
+	std::pair<const char *, const char *> cases[] = { /* input, expected text after trimming */
+			{" \u200B""abc ", "abc"},
+			{"   ",  ""},
+			{"   a", "a"},
+			{"a   ", "a"},
+			{"a a",  "a a"},
+			{"abc",  "abc"},
+			{"a ", "a"},
+			{"   abc      ", "abc"},
+			{" abc ", "abc"},
+			{" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+			{" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+			{" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
+	};
+
+	for (const auto &c : cases) {
+		std::string cpy{c.first}; /* mutable copy handed to the in-place call */
+		auto ns = cpy.size();
+		auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
+		std::string res{nstart, ns}; /* result = returned start pointer plus the updated length */
+		CHECK(res == std::string{c.second});
+	}
+}
+
+
+TEST_CASE("utf8 strcmp") { /* table-driven checks for rspamd_utf8_strcmp */
+	/* Columns: s1, s2, n (bytes to compare, -1 = shorter strlen of the two), expected result */
+	std::tuple<const char *, const char *, int, int> cases[] = {
+			{"abc", "abc", -1, 0},
+			{"",  "", -1, 0},
+			{"aBc", "AbC", -1, 0},
+			{"abc", "ab", 2, 0},
+			{"теСт", "ТесТ", -1, 0},
+			{"теСт", "Тезт", 4, 0},
+			{"теСт", "Тезт", -1, 1},
+			{"abc", "ABD", -1, -1},
+			{"\0a\0", "\0a\1", 2, 0},
+			{"\0a\0", "\0b\1", 3, -1},
+	};
+	for (const auto &c : cases) {
+		auto [s1, s2, n, expected] = c;
+		if (n == -1) {
+			n = MIN(strlen(s1), strlen(s2));
+		}
+		/* Subcases created in a loop are told apart only by name; rows may share s1/s2, so add n */
+		SUBCASE((std::string("test case: ") + s1 + " <=> " + s2 + " (n=" + std::to_string(n) + ")").c_str()) {
+			CHECK(rspamd_utf8_strcmp(s1, s2, n) == expected);
+		}
+	}
+}
 }
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
index 242e03f00..28bd6a144 100644
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -51,6 +51,15 @@ enum rspamd_normalise_result {
  */
 enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
 
+/**
+ * Compare two strings using libicu collator (falls back to g_ascii_strncasecmp when n >= INT_MAX)
+ * @param s1 first string
+ * @param s2 second string
+ * @param n number of bytes of each string to compare
+ * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2.
+ */
+int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n);
+
 #ifdef  __cplusplus
 }
 #endif