commit a59e81c: [Rework] Use C++ utf8 library with unit tests to trim whitespaces

Vsevolod Stakhov vsevolod at highsecure.ru
Fri May 14 20:00:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-14 20:58:28 +0100
URL: https://github.com/rspamd/rspamd/commit/a59e81ca90c986725107c8c013ccf33a91b07d45 (HEAD -> master)

[Rework] Use C++ utf8 library with unit tests to trim whitespaces

---
 src/libserver/html.c                               |  40 +--------
 src/libutil/CMakeLists.txt                         |   3 +-
 src/libutil/cxx/utf8_util.cxx                      | 100 +++++++++++++++++++++
 .../lua_compress.h => libutil/cxx/utf8_util.h}     |  24 ++---
 4 files changed, 119 insertions(+), 48 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 30c2c022b..8d7b722a5 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -25,6 +25,7 @@
 #include "contrib/libucl/khash.h"
 #include "libmime/images.h"
 #include "css/css.h"
+#include "libutil/cxx/utf8_util.h"
 
 #include <unicode/uversion.h>
 #include <unicode/ucnv.h>
@@ -2619,43 +2620,8 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
 	dlen = dest->len - href_offset;
 
 	/* Strip unicode spaces from the start and the end */
-	gchar *p = url->visible_part, *end = url->visible_part + dlen;
-	gint i = 0;
-
-	while (i < dlen) {
-		UChar32 uc;
-		gint prev_i = i;
-
-		U8_NEXT(p, i, dlen, uc);
-
-		if (!u_isspace (uc)) {
-			i = prev_i;
-			break;
-		}
-	}
-
-	p += i;
-	dlen -= i;
-	url->visible_part = p;
-	i = end - url->visible_part - 1;
-
-	if (i > 0) {
-		gint32 dl = dlen;
-
-		while (i > 0) {
-			UChar32 uc;
-
-			U8_PREV(p, i, dl, uc);
-
-			if (!u_isspace (uc)) {
-				break;
-			}
-		}
-
-		dlen = i;
-	}
-
-
+	url->visible_part = rspamd_string_unicode_trim_inplace (url->visible_part,
+			&dlen);
 	rspamd_html_url_is_phished (pool, url,
 			url->visible_part,
 			dlen,
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 64cc8ee1e..5160dfe7b 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -16,6 +16,7 @@ SET(LIBRSPAMDUTILSRC
 				${CMAKE_CURRENT_SOURCE_DIR}/upstream.c
 				${CMAKE_CURRENT_SOURCE_DIR}/util.c
 				${CMAKE_CURRENT_SOURCE_DIR}/heap.c
-				${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c)
+				${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c
+				${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx)
 # Rspamdutil
 SET(RSPAMD_UTIL ${LIBRSPAMDUTILSRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
new file mode 100644
index 000000000..f44d02671
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -0,0 +1,100 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <utility>
+#include <string>
+
+#include "utf8_util.h"
+#include "str_util.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+char *
+rspamd_string_unicode_trim_inplace (char *str, size_t *len)
+{
+	auto *p = str, *end = str + *len;
+	auto i = 0;
+
+	while (i < *len) {
+		UChar32 uc;
+		auto prev_i = i;
+
+		U8_NEXT(p, i, *len, uc);
+
+		if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
+			i = prev_i;
+			break;
+		}
+	}
+
+	p += i;
+	(*len) -= i;
+	i = end - p;
+	auto *ret = p;
+
+	if (i > 0) {
+
+		while (i > 0) {
+			UChar32 uc;
+			auto prev_i = i;
+
+			U8_PREV(p, 0, i, uc);
+
+			if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
+				i = prev_i;
+				break;
+			}
+		}
+
+		*len = i;
+	}
+
+	return ret;
+}
+
+TEST_SUITE("utf8 utils") {
+	TEST_CASE("utf8 trim") {
+		std::pair<const char *, const char *> cases[] = {
+				{" \u200B""abc ", "abc"},
+				{"   ",  ""},
+				{"   a", "a"},
+				{"a   ", "a"},
+				{"a a",  "a a"},
+				{"abc",  "abc"},
+				{"a ", "a"},
+				{"   abc      ", "abc"},
+				{" abc ", "abc"},
+				{" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+				{" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+				{" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
+		};
+
+		for (const auto &c : cases) {
+			std::string cpy{c.first};
+			auto ns = cpy.size();
+			auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
+			std::string res{nstart, ns};
+			CHECK(res == std::string{c.second});
+		}
+	}
+}
+
+
diff --git a/src/lua/lua_compress.h b/src/libutil/cxx/utf8_util.h
similarity index 65%
copy from src/lua/lua_compress.h
copy to src/libutil/cxx/utf8_util.h
index 7ac8d1a66..40bb53bf0 100644
--- a/src/lua/lua_compress.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -14,24 +14,28 @@
  * limitations under the License.
  */
 
-#ifndef RSPAMD_LUA_COMPRESS_H
-#define RSPAMD_LUA_COMPRESS_H
+#pragma once
 
-#include "lua_common.h"
+#ifndef RSPAMD_UTF8_UTIL_H
+#define RSPAMD_UTF8_UTIL_H
+
+#include "config.h"
+#include "mem_pool.h"
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-gint lua_compress_zstd_compress (lua_State *L);
-gint lua_compress_zstd_decompress (lua_State *L);
-gint lua_compress_zlib_compress (lua_State *L);
-gint lua_compress_zlib_decompress (lua_State *L, bool is_gzip);
-
-void luaopen_compress (lua_State *L);
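+/**
+ * Trims unicode whitespace and zero-width spaces from both ends of a string
+ * @param str start of the string
+ * @param len in: length of the input; out: length of the trimmed string
+ * @return pointer to the start of the trimmed string (points inside `str`)
+ */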
+char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
 
 #ifdef  __cplusplus
 }
 #endif
 
-#endif //RSPAMD_LUA_COMPRESS_H
+#endif //RSPAMD_UTF8_UTIL_H


More information about the Commits mailing list