commit 59823dd: [Rework] Convert chartable plugin to c++ for convenience
Vsevolod Stakhov
vsevolod at rspamd.com
Tue Dec 6 22:07:03 UTC 2022
Author: Vsevolod Stakhov
Date: 2022-12-06 22:01:59 +0000
URL: https://github.com/rspamd/rspamd/commit/59823dd77dc52c980ef71a0c1f650dcaede59f69 (HEAD -> master)
[Rework] Convert chartable plugin to c++ for convenience
---
src/CMakeLists.txt | 2 +-
src/plugins/{chartable.c => chartable.cxx} | 79 ++++++++++--------------------
2 files changed, 27 insertions(+), 54 deletions(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e51b1e381..5dee8e610 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -98,7 +98,7 @@ SET(RSPAMDSRC controller.c
rspamd_proxy.c)
SET(PLUGINSSRC plugins/regexp.c
- plugins/chartable.c
+ plugins/chartable.cxx
plugins/fuzzy_check.c
plugins/dkim_check.c
libserver/rspamd_control.c)
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.cxx
similarity index 94%
rename from src/plugins/chartable.c
rename to src/plugins/chartable.cxx
index 7f05d10d6..c5820c606 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.cxx
@@ -31,25 +31,13 @@
#include "unicode/utf8.h"
#include "unicode/uchar.h"
+#include "contrib/ankerl/unordered_dense.h"
#define DEFAULT_SYMBOL "R_MIXED_CHARSET"
#define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL"
#define DEFAULT_THRESHOLD 0.1
-#define msg_err_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
- "chartable", task->task_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
-#define msg_warn_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
- "chartable", task->task_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
-#define msg_info_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
- "chartable", task->task_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
-
-#define msg_debug_chartable(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
+#define msg_debug_chartable(...) rspamd_conditional_debug_fast (nullptr, task->from_addr, \
rspamd_chartable_log_id, "chartable", task->task_pool->tag.uid, \
G_STRFUNC, \
__VA_ARGS__)
@@ -66,7 +54,7 @@ module_t chartable_module = {
chartable_module_init,
chartable_module_config,
chartable_module_reconfig,
- NULL,
+ nullptr,
RSPAMD_MODULE_VER,
(guint)-1,
};
@@ -98,8 +86,8 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
{
struct chartable_ctx *chartable_module_ctx;
- chartable_module_ctx = rspamd_mempool_alloc0 (cfg->cfg_pool,
- sizeof (*chartable_module_ctx));
+ chartable_module_ctx = rspamd_mempool_alloc0_type(cfg->cfg_pool,
+ struct chartable_ctx);
chartable_module_ctx->max_word_len = 10;
*ctx = (struct module_ctx *)chartable_module_ctx;
@@ -120,21 +108,21 @@ chartable_module_config (struct rspamd_config *cfg, bool validate)
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != NULL) {
+ rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != nullptr) {
chartable_module_ctx->symbol = ucl_obj_tostring (value);
}
else {
chartable_module_ctx->symbol = DEFAULT_SYMBOL;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "url_symbol")) != NULL) {
+ rspamd_config_get_module_opt (cfg, "chartable", "url_symbol")) != nullptr) {
chartable_module_ctx->url_symbol = ucl_obj_tostring (value);
}
else {
chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != NULL) {
+ rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != nullptr) {
if (!ucl_obj_todouble_safe (value, &chartable_module_ctx->threshold)) {
msg_warn_config ("invalid numeric value");
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
@@ -144,7 +132,7 @@ chartable_module_config (struct rspamd_config *cfg, bool validate)
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
}
if ((value =
- rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != NULL) {
+ rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != nullptr) {
chartable_module_ctx->max_word_len = ucl_object_toint (value);
}
else {
@@ -155,14 +143,14 @@ chartable_module_config (struct rspamd_config *cfg, bool validate)
chartable_module_ctx->symbol,
0,
chartable_symbol_callback,
- NULL,
+ nullptr,
SYMBOL_TYPE_NORMAL,
-1);
rspamd_symcache_add_symbol (cfg->cache,
chartable_module_ctx->url_symbol,
0,
chartable_url_symbol_callback,
- NULL,
+ nullptr,
SYMBOL_TYPE_NORMAL,
-1);
@@ -177,7 +165,7 @@ chartable_module_reconfig (struct rspamd_config *cfg)
return chartable_module_config (cfg, false);
}
-static gint latin_confusable[] = {
+static const auto latin_confusable = ankerl::unordered_dense::set<int>{
0x02028, 0x02029, 0x01680, 0x02000, 0x02001, 0x02002, 0x02003, 0x02004, 0x02005, 0x02006,
0x02008, 0x02009, 0x0200a, 0x0205f, 0x000a0, 0x02007, 0x0202f, 0x007fa, 0x0fe4d, 0x0fe4e,
0x0fe4f, 0x02010, 0x02011, 0x02012, 0x02013, 0x0fe58, 0x006d4, 0x02043, 0x002d7, 0x02212,
@@ -332,24 +320,10 @@ static gint latin_confusable[] = {
0x1d689, 0x00396, 0x1d6ad, 0x1d6e7, 0x1d721, 0x1d75b, 0x1d795, 0x013c3, 0x0a4dc, 0x118a9,
};
-GHashTable *latin_confusable_ht = NULL;
-
static gboolean
rspamd_can_alias_latin (gint ch)
{
- if (latin_confusable_ht == NULL) {
- guint i;
-
- /* Build hash table */
- latin_confusable_ht = g_hash_table_new (g_int_hash, g_int_equal);
-
- for (i = 0; i < G_N_ELEMENTS (latin_confusable); i ++) {
- g_hash_table_insert(latin_confusable_ht, &latin_confusable[i],
- GINT_TO_POINTER (-1));
- }
- }
-
- return g_hash_table_lookup (latin_confusable_ht, &ch) != NULL;
+ return latin_confusable.contains(ch);
}
static gdouble
@@ -505,7 +479,6 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
gboolean is_url,
struct chartable_ctx *chartable_module_ctx)
{
- const guchar *p, *end;
gdouble badness = 0.0;
enum {
ascii = 1,
@@ -519,9 +492,9 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
got_unknown,
} state = start_process;
- p = w->normalized.begin;
- end = p + w->normalized.len;
- last_sc = 0;
+ const auto *p = w->normalized.begin;
+ const auto *end = p + w->normalized.len;
+ last_sc = non_ascii;
if (w->normalized.len > chartable_module_ctx->max_word_len) {
return 0.0;
@@ -595,7 +568,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
guint i, ncap = 0;
gdouble cur_score = 0.0;
- if (part == NULL || part->utf_words == NULL ||
+ if (part == nullptr || part->utf_words == nullptr ||
part->utf_words->len == 0 || part->nwords == 0) {
return FALSE;
}
@@ -631,7 +604,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
if (cur_score > chartable_module_ctx->threshold) {
rspamd_task_insert_result (task, chartable_module_ctx->symbol,
- cur_score, NULL);
+ cur_score, nullptr);
return TRUE;
}
@@ -646,7 +619,7 @@ chartable_symbol_callback (struct rspamd_task *task,
guint i;
struct rspamd_mime_text_part *part;
struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg);
- const gchar *language = NULL;
+ const gchar *language = nullptr;
gboolean ignore_diacritics = FALSE, seen_violated_part = FALSE;
/* Check if we have parts with diacritic symbols language */
@@ -674,7 +647,7 @@ chartable_symbol_callback (struct rspamd_task *task,
ignore_diacritics = TRUE;
}
- if (task->meta_words != NULL && task->meta_words->len > 0) {
+ if (task->meta_words != nullptr && task->meta_words->len > 0) {
rspamd_stat_token_t *w;
gdouble cur_score = 0;
gsize arlen = task->meta_words->len;
@@ -682,7 +655,7 @@ chartable_symbol_callback (struct rspamd_task *task,
for (i = 0; i < arlen; i++) {
w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
- NULL, chartable_module_ctx, language, ignore_diacritics);
+ nullptr, chartable_module_ctx, language, ignore_diacritics);
}
cur_score /= (gdouble)arlen;
@@ -736,9 +709,9 @@ chartable_url_symbol_callback (struct rspamd_task *task,
w.stemmed.begin = u->host;
w.stemmed.len = u->hostlen;
- if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) {
+ if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, nullptr)) {
cur_score += rspamd_chartable_process_word_utf (task, &w,
- TRUE, NULL, chartable_module_ctx);
+ TRUE, nullptr, chartable_module_ctx);
}
else {
cur_score += rspamd_chartable_process_word_ascii (task, &w,
@@ -761,9 +734,9 @@ chartable_url_symbol_callback (struct rspamd_task *task,
w.stemmed.begin = u->host;
w.stemmed.len = u->hostlen;
- if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) {
+ if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, nullptr)) {
cur_score += rspamd_chartable_process_word_utf (task, &w,
- TRUE, NULL, chartable_module_ctx);
+ TRUE, nullptr, chartable_module_ctx);
}
else {
cur_score += rspamd_chartable_process_word_ascii (task, &w,
@@ -774,7 +747,7 @@ chartable_url_symbol_callback (struct rspamd_task *task,
if (cur_score > chartable_module_ctx->threshold) {
rspamd_task_insert_result (task, chartable_module_ctx->symbol,
- cur_score, NULL);
+ cur_score, nullptr);
}
#endif
More information about the Commits
mailing list