commit a59e81c: [Rework] Use C++ utf8 library with unit tests to trim whitespaces
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri May 14 20:00:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-14 20:58:28 +0100
URL: https://github.com/rspamd/rspamd/commit/a59e81ca90c986725107c8c013ccf33a91b07d45 (HEAD -> master)
[Rework] Use C++ utf8 library with unit tests to trim whitespaces
---
src/libserver/html.c | 40 +--------
src/libutil/CMakeLists.txt | 3 +-
src/libutil/cxx/utf8_util.cxx | 100 +++++++++++++++++++++
.../lua_compress.h => libutil/cxx/utf8_util.h} | 24 ++---
4 files changed, 119 insertions(+), 48 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 30c2c022b..8d7b722a5 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -25,6 +25,7 @@
#include "contrib/libucl/khash.h"
#include "libmime/images.h"
#include "css/css.h"
+#include "libutil/cxx/utf8_util.h"
#include <unicode/uversion.h>
#include <unicode/ucnv.h>
@@ -2619,43 +2620,8 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
dlen = dest->len - href_offset;
/* Strip unicode spaces from the start and the end */
- gchar *p = url->visible_part, *end = url->visible_part + dlen;
- gint i = 0;
-
- while (i < dlen) {
- UChar32 uc;
- gint prev_i = i;
-
- U8_NEXT(p, i, dlen, uc);
-
- if (!u_isspace (uc)) {
- i = prev_i;
- break;
- }
- }
-
- p += i;
- dlen -= i;
- url->visible_part = p;
- i = end - url->visible_part - 1;
-
- if (i > 0) {
- gint32 dl = dlen;
-
- while (i > 0) {
- UChar32 uc;
-
- U8_PREV(p, i, dl, uc);
-
- if (!u_isspace (uc)) {
- break;
- }
- }
-
- dlen = i;
- }
-
-
+ url->visible_part = rspamd_string_unicode_trim_inplace (url->visible_part,
+ &dlen);
rspamd_html_url_is_phished (pool, url,
url->visible_part,
dlen,
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 64cc8ee1e..5160dfe7b 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -16,6 +16,7 @@ SET(LIBRSPAMDUTILSRC
${CMAKE_CURRENT_SOURCE_DIR}/upstream.c
${CMAKE_CURRENT_SOURCE_DIR}/util.c
${CMAKE_CURRENT_SOURCE_DIR}/heap.c
- ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c)
+ ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx)
# Rspamdutil
SET(RSPAMD_UTIL ${LIBRSPAMDUTILSRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
new file mode 100644
index 000000000..f44d02671
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -0,0 +1,100 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <utility>
+#include <string>
+
+#include "utf8_util.h"
+#include "str_util.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+char *
+rspamd_string_unicode_trim_inplace (char *str, size_t *len)
+{
+ auto *p = str, *end = str + *len;
+ auto i = 0;
+
+ while (i < *len) {
+ UChar32 uc;
+ auto prev_i = i;
+
+ U8_NEXT(p, i, *len, uc);
+
+ if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
+ i = prev_i;
+ break;
+ }
+ }
+
+ p += i;
+ (*len) -= i;
+ i = end - p;
+ auto *ret = p;
+
+ if (i > 0) {
+
+ while (i > 0) {
+ UChar32 uc;
+ auto prev_i = i;
+
+ U8_PREV(p, 0, i, uc);
+
+ if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
+ i = prev_i;
+ break;
+ }
+ }
+
+ *len = i;
+ }
+
+ return ret;
+}
+
+TEST_SUITE("utf8 utils") {
+ TEST_CASE("utf8 trim") {
+ std::pair<const char *, const char *> cases[] = {
+ {" \u200B""abc ", "abc"},
+ {" ", ""},
+ {" a", "a"},
+ {"a ", "a"},
+ {"a a", "a a"},
+ {"abc", "abc"},
+ {"a ", "a"},
+ {" abc ", "abc"},
+ {" abc ", "abc"},
+ {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+ {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+ {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"},
+ };
+
+ for (const auto &c : cases) {
+ std::string cpy{c.first};
+ auto ns = cpy.size();
+ auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
+ std::string res{nstart, ns};
+ CHECK(res == std::string{c.second});
+ }
+ }
+}
+
+
diff --git a/src/lua/lua_compress.h b/src/libutil/cxx/utf8_util.h
similarity index 65%
copy from src/lua/lua_compress.h
copy to src/libutil/cxx/utf8_util.h
index 7ac8d1a66..40bb53bf0 100644
--- a/src/lua/lua_compress.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -14,24 +14,28 @@
* limitations under the License.
*/
-#ifndef RSPAMD_LUA_COMPRESS_H
-#define RSPAMD_LUA_COMPRESS_H
+#pragma once
-#include "lua_common.h"
+#ifndef RSPAMD_UTF8_UTIL_H
+#define RSPAMD_UTF8_UTIL_H
+
+#include "config.h"
+#include "mem_pool.h"
#ifdef __cplusplus
extern "C" {
#endif
-gint lua_compress_zstd_compress (lua_State *L);
-gint lua_compress_zstd_decompress (lua_State *L);
-gint lua_compress_zlib_compress (lua_State *L);
-gint lua_compress_zlib_decompress (lua_State *L, bool is_gzip);
-
-void luaopen_compress (lua_State *L);
+/**
+ * Removes all unicode spaces from a string
+ * @param str start of the string
+ * @param len length
+ * @return new length of the string trimmed
+ */
+char* rspamd_string_unicode_trim_inplace (char *str, size_t *len);
#ifdef __cplusplus
}
#endif
-#endif //RSPAMD_LUA_COMPRESS_H
+#endif //RSPAMD_UTF8_UTIL_H
More information about the Commits
mailing list