commit 68ea114: [Project] Some further fixes
Vsevolod Stakhov
vsevolod at rspamd.com
Sat Apr 29 17:14:09 UTC 2023
Author: Vsevolod Stakhov
Date: 2023-04-29 17:44:16 +0100
URL: https://github.com/rspamd/rspamd/commit/68ea1140d77cd4dda13247ec300251563a28c176 (refs/pull/4473/head, vstakhov-fasttext-langdet)
[Project] Some further fixes
---
src/libmime/lang_detection.c | 49 ++++++++++++++++++++++-----------
src/libmime/lang_detection_fasttext.cxx | 16 ++++++++---
2 files changed, 45 insertions(+), 20 deletions(-)
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index d8e81e075..62d04975c 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -174,8 +174,10 @@ KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *,
char, false,
rspamd_ftok_hash, rspamd_ftok_equal);
+KHASH_INIT (rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true,
+ rspamd_str_hash, rspamd_str_equal);
struct rspamd_lang_detector {
- GPtrArray *languages;
+ khash_t(rspamd_languages_hash) *languages;
khash_t(rspamd_trigram_hash) *trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */
struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
khash_t(rspamd_stopwords_hash) *stop_words_norm;
@@ -686,7 +688,10 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
skipped, loaded, nelt->stop_words,
rspamd_language_detector_print_flags (nelt));
- g_ptr_array_add (d->languages, nelt);
+ int ret;
+ khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret);
+ g_assert (ret > 0); /* must be unique */
+ kh_value(d->languages, k) = nelt;
ucl_object_unref (top);
}
@@ -764,7 +769,7 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
}
if (d->languages) {
- g_ptr_array_free (d->languages, TRUE);
+ kh_destroy (rspamd_languages_hash, d->languages);
}
kh_destroy (rspamd_stopwords_hash, d->stop_words_norm);
@@ -833,7 +838,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
}
ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret));
- ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
+ ret->languages = kh_init(rspamd_languages_hash);
+ kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc);
ret->uchar_converter = rspamd_get_utf8_converter ();
ret->short_text_limit = short_text_limit;
ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
@@ -894,7 +900,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
msg_info_config ("loaded %d languages, "
"%d trigrams; %s",
- (gint)ret->languages->len,
+ (gint)kh_size(ret->languages),
(gint)total, fasttext_status);
g_free (fasttext_status);
@@ -1810,25 +1816,28 @@ rspamd_language_detector_detect (struct rspamd_task *task,
guint nchinese = 0, nspecial = 0;
rspamd_language_detector_unicode_scripts (task, part, &nchinese, &nspecial);
- /* Apply unicode scripts heuristic */
- if (rspamd_language_detector_try_uniscript (task, part, nchinese, nspecial)) {
- ret = TRUE;
- }
+ /* Disable internal language detection heuristics if we have fasttext */
+ if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
+ /* Apply unicode scripts heuristic */
+ if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
+ ret = TRUE;
+ }
- cat = rspamd_language_detector_get_category (part->unicode_scripts);
+ cat = rspamd_language_detector_get_category(part->unicode_scripts);
- if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) {
- ret = TRUE;
+ if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
+ ret = TRUE;
+ }
}
if (!ret) {
unsigned ndetected = 0;
if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
- rspamd_fasttext_predict_result_t fasttext_predict_result;
- fasttext_predict_result = rspamd_lang_detection_fasttext_detect(d->fasttext_detector,
- part->utf_stripped_content->data,
- part->utf_stripped_content->len, 4);
+ rspamd_fasttext_predict_result_t fasttext_predict_result =
+ rspamd_lang_detection_fasttext_detect(d->fasttext_detector,
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len, 4);
ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
@@ -1851,6 +1860,12 @@ rspamd_language_detector_detect (struct rspamd_task *task,
cand = kh_value(candidates, k);
cand->lang = lang;
cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
+
+ /* Find the corresponding language elt */
+ k = kh_get(rspamd_languages_hash, d->languages, lang);
+ if (k != kh_end(d->languages)) {
+ cand->elt = kh_value(d->languages, k);
+ }
}
}
@@ -1864,6 +1879,8 @@ rspamd_language_detector_detect (struct rspamd_task *task,
r = rs_detect_none;
}
}
+
+ rspamd_fasttext_predict_result_destroy(fasttext_predict_result);
}
if (ndetected == 0) {
if (part->utf_words->len < default_short_text_limit) {
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
index eda4c2850..7e16414bc 100644
--- a/src/libmime/lang_detection_fasttext.cxx
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -23,6 +23,7 @@
#include "fmt/core.h"
#include <exception>
#include <string>
+#include <string_view>
#include <vector>
#include <sstream>
#include <streambuf>
@@ -154,8 +155,10 @@ rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
#ifndef WITH_FASTTEXT
return nullptr;
#else
+ /* Avoid too long inputs */
+ static const size_t max_fasttext_input_len = 1024 * 1024 * 1;
auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
- auto *res = real_model->detect_language(in, len, k);
+ auto *res = real_model->detect_language(in, std::min(max_fasttext_input_len, len), k);
return (rspamd_fasttext_predict_result_t)res;
#endif
@@ -188,8 +191,13 @@ rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res, un
#ifdef WITH_FASTTEXT
auto *real_res = FASTTEXT_RESULT_TO_C_API(res);
- if (real_res && real_res->size() < idx) {
- return real_res->at(idx).second.c_str();
+ if (real_res && real_res->size() > idx) {
+ /* Fasttext returns result in form __label__<lang>, so we need to remove __label__ prefix */
+ auto lang = std::string_view{real_res->at(idx).second};
+ if (lang.size() > sizeof("__label__") && lang.substr(0, sizeof("__label__") - 1) == "__label__") {
+ lang.remove_prefix(sizeof("__label__") - 1);
+ }
+ return lang.data();
}
#endif
return nullptr;
@@ -201,7 +209,7 @@ rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res, un
#ifdef WITH_FASTTEXT
auto *real_res = FASTTEXT_RESULT_TO_C_API(res);
- if (real_res && real_res->size() < idx) {
+ if (real_res && real_res->size() > idx) {
return real_res->at(idx).first;
}
#endif
More information about the Commits
mailing list