commit 4627303: [Project] Add preliminary language detector based on fasttext library
Vsevolod Stakhov
vsevolod at rspamd.com
Sat Apr 29 17:14:03 UTC 2023
Author: Vsevolod Stakhov
Date: 2023-04-29 14:22:41 +0100
URL: https://github.com/rspamd/rspamd/commit/4627303717edb6c620b6d7855c5fce50a6c84577
[Project] Add preliminary language detector based on fasttext library
---
CMakeLists.txt | 1 +
config.h.in | 1 +
src/libmime/CMakeLists.txt | 1 +
src/libmime/lang_detection_fasttext.cxx | 170 ++++++++++++++++++++++++++++++++
src/libmime/lang_detection_fasttext.h | 70 +++++++++++++
5 files changed, 243 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ab5658d3..ac2585669 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -247,6 +247,7 @@ if(ENABLE_FASTTEXT MATCHES "ON")
ProcessPackage(FASTTEXT LIBRARY fasttext INCLUDE fasttext.h
INCLUDE_SUFFIXES include/fasttext
ROOT ${FASTTEXT_ROOT_DIR} MODULES fasttext)
+ SET(WITH_FASTTEXT "1")
endif()
include (CompilerWarnings)
diff --git a/config.h.in b/config.h.in
index 4fedba724..b70308331 100644
--- a/config.h.in
+++ b/config.h.in
@@ -116,6 +116,7 @@
#cmakedefine WITH_LIBUNWIND 1
#cmakedefine WITH_LUA_TRACE 1
#cmakedefine WITH_LUA_REPL 1
+#cmakedefine WITH_FASTTEXT 1
#cmakedefine DISABLE_PTHREAD_MUTEX 1
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt
index 4a64aac58..09e5dbfca 100644
--- a/src/libmime/CMakeLists.txt
+++ b/src/libmime/CMakeLists.txt
@@ -12,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx
${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
)
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
new file mode 100644
index 000000000..cf6b5c852
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -0,0 +1,170 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection_fasttext.h"
+
+#ifdef WITH_FASTTEXT
+#include "fasttext/fasttext.h"
+#include "libserver/cfg_file.h"
+#include "libserver/logger.h"
+#include "fmt/core.h"
+#include <exception>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <streambuf>
+#endif
+
+#ifdef WITH_FASTTEXT
+namespace rspamd::langdet {
+/**
+ * Language detector backed by a fasttext model.
+ *
+ * The model path is taken from the `fasttext_langdet_model` key of the
+ * `options` section of the configuration. If the key is missing, is not a
+ * string, or the model fails to load, the object stays in the "not loaded"
+ * state and detect_language() always returns nullptr.
+ */
+class fasttext_langdet {
+private:
+	fasttext::FastText ft;
+	/* Initialised here because the constructor assigns it only when a model is configured */
+	bool loaded = false;
+
+	/* Stream buffer over caller-owned memory, so the input is not copied */
+	struct one_shot_buf : public std::streambuf {
+		explicit one_shot_buf(const char *in, std::size_t sz) {
+			/* setg() takes non-const pointers; the buffer is only used for reading here */
+			auto deconst_in = const_cast<char *>(in);
+			setg(deconst_in, deconst_in, deconst_in + sz);
+		}
+	};
+public:
+	/**
+	 * Loads the model named by `options.fasttext_langdet_model`, if configured.
+	 * Failures are logged and leave the detector disabled; nothing is thrown
+	 * to the caller for a std::exception raised by the model loader.
+	 * @param cfg configuration whose parsed UCL tree (rcl_obj) is consulted
+	 */
+	explicit fasttext_langdet(struct rspamd_config *cfg) {
+		const auto *ucl_obj = cfg->rcl_obj;
+		const auto *opts_section = ucl_object_find_key(ucl_obj, "options");
+
+		if (opts_section == nullptr) {
+			return;
+		}
+
+		const auto *model = ucl_object_find_key(opts_section, "fasttext_langdet_model");
+
+		if (model == nullptr) {
+			return;
+		}
+
+		/* A non-string value yields NULL, which must not reach std::string */
+		const char *model_path = ucl_object_tostring(model);
+
+		if (model_path == nullptr) {
+			msg_err_config("fasttext_langdet_model option must be a string");
+			return;
+		}
+
+		try {
+			ft.loadModel(model_path);
+			loaded = true;
+		}
+		catch (const std::exception &e) {
+			auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
+			msg_err_config("%s", err_message.c_str());
+			loaded = false;
+		}
+	}
+
+	/* Disallow multiple initialisation */
+	fasttext_langdet() = delete;
+	fasttext_langdet(const fasttext_langdet &) = delete;
+	fasttext_langdet(fasttext_langdet &&) = delete;
+
+	~fasttext_langdet() = default;
+
+	/**
+	 * Runs the model over a chunk of text.
+	 * @param in input text (not required to be NUL terminated)
+	 * @param len number of bytes available at `in`
+	 * @param k number of predictions requested from fasttext
+	 * @return heap-allocated vector of (probability, label) pairs that the
+	 * caller must delete, or nullptr when the model is not loaded, the input
+	 * is null, fasttext reports no prediction, or a std::exception is raised
+	 */
+	auto detect_language(const char *in, size_t len, int k) -> std::vector<std::pair<fasttext::real, std::string>> *
+	{
+		if (!loaded || in == nullptr) {
+			return nullptr;
+		}
+
+		std::vector<std::pair<fasttext::real, std::string>> *predictions = nullptr;
+
+		try {
+			/* Hack to deal with streams without copies */
+			one_shot_buf buf{in, len};
+			std::istream stream{&buf};
+			predictions = new std::vector<std::pair<fasttext::real, std::string>>;
+
+			/* A non-positive k would be converted to a huge size_t by reserve() */
+			if (k > 0) {
+				predictions->reserve(k);
+			}
+
+			if (ft.predictLine(stream, *predictions, k, 0.0f)) {
+				return predictions;
+			}
+		}
+		catch (const std::exception &) {
+			/* Callers are C code: report the failure as "no result" below */
+		}
+
+		delete predictions;
+
+		return nullptr;
+	}
+};
+}
+#endif
+
+/* C API part */
+G_BEGIN_DECLS
+
+/*
+ * Helpers that turn the opaque pointers used by the C API back into the C++
+ * objects they refer to (despite the `TO_C_API` suffix, the conversion goes
+ * from the C handle to the C++ type). The result macro names fasttext types,
+ * so it may only be expanded when WITH_FASTTEXT is defined.
+ */
+#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p)
+#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res)
+
+/*
+ * Creates the detector object hidden behind the opaque C handle.
+ * Without fasttext support compiled in there is nothing to create, so the
+ * handle is a null pointer.
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
+{
+#ifdef WITH_FASTTEXT
+	auto *detector = new rspamd::langdet::fasttext_langdet(cfg);
+
+	return static_cast<void *>(detector);
+#else
+	return nullptr;
+#endif
+}
+
+/*
+ * C entry point for language detection.
+ * Returns an opaque result (a heap-allocated prediction vector) or a null
+ * pointer when fasttext support is compiled out, when `ud` is null, when the
+ * detector yields nothing, or when the C++ side throws.
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+		const char *in, size_t len, int k)
+{
+#ifndef WITH_FASTTEXT
+	return nullptr;
+#else
+	auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
+
+	/* The other accessors tolerate a null handle; do the same here */
+	if (real_model == nullptr) {
+		return nullptr;
+	}
+
+	try {
+		auto *res = real_model->detect_language(in, len, k);
+
+		return static_cast<rspamd_fasttext_predict_result_t>(res);
+	}
+	catch (...) {
+		/* Exceptions must not unwind into the C callers of this function */
+		return nullptr;
+	}
+#endif
+}
+
+/*
+ * Releases a detector handle. Does nothing when fasttext support is
+ * compiled out.
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud)
+{
+#ifdef WITH_FASTTEXT
+	auto *detector = FASTTEXT_MODEL_TO_C_API(ud);
+
+	/* Deleting a null pointer is a no-op, so no explicit check is needed */
+	delete detector;
+#endif
+}
+
+/*
+ * Returns the label stored in the first prediction of `res`, or a null
+ * pointer when the result is null or empty, or when fasttext support is
+ * compiled out. The string belongs to the result object.
+ */
+const char *
+rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	const auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+
+	if (predictions == nullptr || predictions->empty()) {
+		return nullptr;
+	}
+
+	return predictions->front().second.c_str();
+#else
+	return nullptr;
+#endif
+}
+
+/*
+ * Returns the probability stored in the first prediction of `res`, or 0
+ * when the result is null or empty, or when fasttext support is compiled out.
+ */
+float
+rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	const auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+
+	if (predictions == nullptr || predictions->empty()) {
+		return 0.0f;
+	}
+
+	return predictions->front().first;
+#else
+	return 0.0f;
+#endif
+}
+
+/*
+ * Frees a prediction result. A null result is accepted; nothing happens
+ * when fasttext support is compiled out.
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	delete FASTTEXT_RESULT_TO_C_API(res);
+#endif
+}
+
+G_END_DECLS
\ No newline at end of file
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
new file mode 100644
index 000000000..44bc8bf71
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
+#define RSPAMD_LANG_DETECTION_FASTTEXT_H
+
+#include "config.h"
+
+G_BEGIN_DECLS
+struct rspamd_config;
+/**
+ * Initialize fasttext language detector
+ * @param cfg configuration; the model path is read from the
+ * `fasttext_langdet_model` key of its `options` section
+ * @return opaque pointer to be released with
+ * rspamd_lang_detection_fasttext_destroy(), or NULL when built without
+ * fasttext support
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);
+
+
+/* Opaque handle to a prediction result; use the accessors below to read it */
+typedef void * rspamd_fasttext_predict_result_t;
+/**
+ * Detect language using fasttext
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init()
+ * @param in input text
+ * @param len length of input text
+ * @param k number of results to return
+ * @return opaque result to be released with
+ * rspamd_fasttext_predict_result_destroy(), or NULL if nothing was detected,
+ * no model is loaded, or fasttext support is not compiled in
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+ const char *in, size_t len, int k);
+
+/**
+ * Get language from fasttext result
+ * @param res result returned by rspamd_lang_detection_fasttext_detect()
+ * (NULL is accepted)
+ * @return label of the first prediction in the result (presumably the most
+ * probable one - confirm against fasttext ordering), owned by the result;
+ * NULL if the result is NULL or empty
+ */
+const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Get probability from fasttext result
+ * @param res result returned by rspamd_lang_detection_fasttext_detect()
+ * (NULL is accepted)
+ * @return probability of the first prediction in the result, or 0 if the
+ * result is NULL or empty
+ */
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext result
+ * @param res result returned by rspamd_lang_detection_fasttext_detect();
+ * NULL is accepted. Strings obtained from the result must not be used
+ * afterwards.
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext language detector
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init();
+ * NULL is accepted
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud);
+
+
+G_END_DECLS
+#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */
More information about the Commits
mailing list