commit 4627303: [Project] Add preliminary language detector based on fasttext library

Vsevolod Stakhov vsevolod at rspamd.com
Sat Apr 29 17:14:03 UTC 2023


Author: Vsevolod Stakhov
Date: 2023-04-29 14:22:41 +0100
URL: https://github.com/rspamd/rspamd/commit/4627303717edb6c620b6d7855c5fce50a6c84577

[Project] Add preliminary language detector based on fasttext library

---
 CMakeLists.txt                          |   1 +
 config.h.in                             |   1 +
 src/libmime/CMakeLists.txt              |   1 +
 src/libmime/lang_detection_fasttext.cxx | 170 ++++++++++++++++++++++++++++++++
 src/libmime/lang_detection_fasttext.h   |  70 +++++++++++++
 5 files changed, 243 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ab5658d3..ac2585669 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -247,6 +247,7 @@ if(ENABLE_FASTTEXT MATCHES "ON")
 	ProcessPackage(FASTTEXT LIBRARY fasttext INCLUDE fasttext.h
 			INCLUDE_SUFFIXES include/fasttext
 			ROOT ${FASTTEXT_ROOT_DIR} MODULES fasttext)
+	SET(WITH_FASTTEXT "1")
 endif()
 
 include (CompilerWarnings)
diff --git a/config.h.in b/config.h.in
index 4fedba724..b70308331 100644
--- a/config.h.in
+++ b/config.h.in
@@ -116,6 +116,7 @@
 #cmakedefine WITH_LIBUNWIND      1
 #cmakedefine WITH_LUA_TRACE      1
 #cmakedefine WITH_LUA_REPL       1
+#cmakedefine WITH_FASTTEXT       1
 
 #cmakedefine DISABLE_PTHREAD_MUTEX 1
 
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt
index 4a64aac58..09e5dbfca 100644
--- a/src/libmime/CMakeLists.txt
+++ b/src/libmime/CMakeLists.txt
@@ -12,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
 				${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
 				${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
 				${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
+		${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx
 		${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
 		)
 
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
new file mode 100644
index 000000000..cf6b5c852
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -0,0 +1,170 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection_fasttext.h"
+
+#ifdef WITH_FASTTEXT
+#include "fasttext/fasttext.h"
+#include "libserver/cfg_file.h"
+#include "libserver/logger.h"
+#include "fmt/core.h"
+#include <exception>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <streambuf>
+#endif
+
+#ifdef WITH_FASTTEXT
+namespace rspamd::langdet {
+/* Language detector built around a fasttext model taken from the rspamd configuration */
+class fasttext_langdet {
+private:
+	using result_vec = std::vector<std::pair<fasttext::real, std::string>>;
+	fasttext::FastText ft;
+	bool loaded = false; /* must start as false: the constructor assigns it only when a model path is configured */
+
+	/* Get-only stream buffer over caller memory (setg() needs non-const pointers, hence the cast) */
+	struct one_shot_buf : public std::streambuf {
+		explicit one_shot_buf(const char *in, std::size_t sz) {
+			auto deconst_in = const_cast<char *>(in);
+			setg(deconst_in, deconst_in, deconst_in + sz);
+		}
+	};
+public:
+	/* Loads the model named by `fasttext_langdet_model` in the `options` section; logs on failure */
+	explicit fasttext_langdet(struct rspamd_config *cfg) {
+		const auto *ucl_obj = cfg->rcl_obj;
+		const auto *opts_section = ucl_object_find_key(ucl_obj, "options");
+		const auto *model = opts_section ? ucl_object_find_key(opts_section, "fasttext_langdet_model") : nullptr;
+		/* ucl_object_tostring() gives NULL for a non-string value, which must not reach std::string */
+		const char *model_path = model ? ucl_object_tostring(model) : nullptr;
+		if (model_path) {
+			try {
+				ft.loadModel(model_path);
+				loaded = true;
+			}
+			catch (const std::exception &e) {
+				auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
+				msg_err_config("%s", err_message.c_str());
+			}
+		}
+	}
+
+	/* Disallow multiple initialisation */
+	fasttext_langdet() = delete;
+	fasttext_langdet(const fasttext_langdet &) = delete;
+	fasttext_langdet(fasttext_langdet &&) = delete;
+
+	~fasttext_langdet() = default;
+
+	/* Returns the predictions in a heap vector the caller must delete, or nullptr if there are none */
+	auto detect_language(const char *in, size_t len, int k) -> std::vector<std::pair<fasttext::real, std::string>> *
+	{
+		if (!loaded) {
+			return nullptr;
+		}
+
+		/* Hack to deal with streams without copies */
+		one_shot_buf buf{in, len};
+		auto stream = std::istream{&buf};
+		result_vec found; /* filled locally first, so nothing is leaked if fasttext throws */
+		if (k > 0) { /* a negative count would become a huge size_t inside reserve() */
+			found.reserve(k);
+		}
+
+		if (!ft.predictLine(stream, found, k, 0.0f)) {
+			return nullptr;
+		}
+		auto *predictions = new result_vec;
+		predictions->swap(found);
+		return predictions;
+	}
+};
+}
+#endif
+
+/* C API part */
+G_BEGIN_DECLS
+
+#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p)
+#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res)
+
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
+{
+#ifdef WITH_FASTTEXT
+	return static_cast<void *>(new rspamd::langdet::fasttext_langdet(cfg));
+#else /* built without fasttext support: no detector is created */
+	return nullptr;
+#endif /* WITH_FASTTEXT */
+}
+
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+											   const char *in, size_t len, int k)
+{
+#ifdef WITH_FASTTEXT
+	/* Tolerate a missing detector or input, just as the result accessors tolerate a NULL result */
+	if (ud != nullptr && in != nullptr) {
+		auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
+		return static_cast<rspamd_fasttext_predict_result_t>(real_model->detect_language(in, len, k));
+	}
+#endif
+	return nullptr; /* no fasttext support, no detector, no input, or nothing predicted upstream */
+}
+
+void rspamd_lang_detection_fasttext_destroy(void *ud)
+{
+#if defined(WITH_FASTTEXT)
+	delete reinterpret_cast<rspamd::langdet::fasttext_langdet *>(ud); /* `ud` is expected to come from the init call */
+#endif /* WITH_FASTTEXT */
+}
+
+const char *
+rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	const auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+	const bool has_label = predictions != nullptr && !predictions->empty();
+	/* Label of the first prediction; the returned pointer refers into `res` */
+	return has_label ? predictions->front().second.c_str() : nullptr;
+#else
+	return nullptr;
+#endif
+}
+
+float
+rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	const auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+	const bool has_score = predictions != nullptr && !predictions->empty();
+	/* Value stored with the first prediction, zero when there is none */
+	return has_score ? predictions->front().first : 0.0f;
+#else
+	return 0.0f;
+#endif
+}
+
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
+{
+#if defined(WITH_FASTTEXT)
+	/* Frees the prediction vector behind the opaque handle */
+	using prediction_list = std::vector<std::pair<fasttext::real, std::string>>;
+	delete reinterpret_cast<prediction_list *>(res);
+#endif /* WITH_FASTTEXT */
+}
+
+G_END_DECLS
\ No newline at end of file
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
new file mode 100644
index 000000000..44bc8bf71
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
+#define RSPAMD_LANG_DETECTION_FASTTEXT_H
+
+#include "config.h"
+
+G_BEGIN_DECLS
+struct rspamd_config;
+/**
+ * Initialize fasttext language detector
+ * @param cfg rspamd configuration; its `options` section is searched for `fasttext_langdet_model`
+ * @return opaque detector pointer, or NULL when built without fasttext support
+ */
+void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);
+
+
+typedef  void * rspamd_fasttext_predict_result_t;
+/**
+ * Detect language using fasttext
+ * @param ud opaque detector pointer returned by rspamd_lang_detection_fasttext_init
+ * @param in input text
+ * @param len length of input text
+ * @param k number of results to request
+ * @return opaque result handle, or NULL if no model is loaded or nothing is predicted
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+		const char *in, size_t len, int k);
+
+/**
+ * Get language from fasttext result
+ * @param res result handle from rspamd_lang_detection_fasttext_detect (may be NULL)
+ * @return label of the first prediction (the string is stored inside `res`), or NULL if there is none
+ */
+const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Get probability from fasttext result
+ * @param res result handle from rspamd_lang_detection_fasttext_detect (may be NULL)
+ * @return value stored with the first prediction, or 0.0 if there is none
+ */
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext result
+ * @param res result handle from rspamd_lang_detection_fasttext_detect; it must not be used afterwards
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext language detector (`ud` is the pointer returned by rspamd_lang_detection_fasttext_init)
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud);
+
+
+G_END_DECLS
+#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */


More information about the Commits mailing list