commit e92b112: [Feature] Allow to use other methods when fasttext detection is enabled

Vsevolod Stakhov vsevolod at rspamd.com
Tue May 2 19:49:04 UTC 2023


Author: Vsevolod Stakhov
Date: 2023-05-02 18:03:49 +0100
URL: https://github.com/rspamd/rspamd/commit/e92b112a8a2bc41e1157246252482e0604b652eb (HEAD -> master)

[Feature] Allow to use other methods when fasttext detection is enabled

---
 conf/lang_detection.inc      |  3 +++
 src/libmime/lang_detection.c | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/conf/lang_detection.inc b/conf/lang_detection.inc
index 19ece79bd..50c62ad19 100644
--- a/conf/lang_detection.inc
+++ b/conf/lang_detection.inc
@@ -23,3 +23,6 @@
 # Use the following fasttext model for language detection (if Fasttext support is compiled in)
 # fasttext_model = "${RSPAMD_SHAREDIR}/languages/fasttext_model.ftz"
 
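+# Prefer fasttext over all other methods: when fasttext detection is enabled, the built-in heuristics are skipped (default: true; set to false to run them as well)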
+# prefer_fasttext = true;
+
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index d4d10b216..7696c4aed 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -183,6 +183,7 @@ struct rspamd_lang_detector {
 	khash_t(rspamd_stopwords_hash) *stop_words_norm;
 	UConverter *uchar_converter;
 	gsize short_text_limit;
+	bool prefer_fasttext;
 	gsize total_occurrences; /* number of all languages found */
 	gpointer fasttext_detector;
 	ref_entry_t ref;
@@ -792,6 +793,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 	struct rspamd_lang_detector *ret = NULL;
 	struct ucl_parser *parser;
 	ucl_object_t *stop_words;
+	bool prefer_fasttext = true;
 
 	section = ucl_object_lookup (cfg->rcl_obj, "lang_detection");
 
@@ -810,6 +812,11 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 
 		languages_enable = ucl_object_lookup (section, "languages_enable");
 		languages_disable = ucl_object_lookup (section, "languages_disable");
+
+		elt = ucl_object_lookup(section, "prefer_fasttext");
+		if (elt) {
+			prefer_fasttext = ucl_object_toboolean (elt);
+		}
 	}
 
 	languages_pattern = g_string_sized_new (PATH_MAX);
@@ -843,6 +850,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 	ret->uchar_converter = rspamd_get_utf8_converter ();
 	ret->short_text_limit = short_text_limit;
 	ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
+	ret->prefer_fasttext = prefer_fasttext;
 
 	/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
 	for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
@@ -1818,7 +1826,7 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 	rspamd_language_detector_unicode_scripts (task, part, &nchinese, &nspecial);
 
 	/* Disable internal language detection heuristics if we have fasttext */
-	if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
+	if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) {
 		/* Apply unicode scripts heuristic */
 		if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
 			ret = TRUE;


More information about the Commits mailing list