commit e1b043f: [Feature] Add method task:lookup_words
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Dec 27 18:28:03 UTC 2018
Author: Vsevolod Stakhov
Date: 2018-12-05 18:06:12 +0000
URL: https://github.com/rspamd/rspamd/commit/e1b043f8bf7970278f55ae7ca1a106dee6c4fa98
[Feature] Add method task:lookup_words
---
src/lua/lua_common.c | 153 +++++++++++++++++++++++++++------------------------
src/lua/lua_common.h | 7 +++
src/lua/lua_task.c | 122 +++++++++++++++++++++++++++++++++++++++-
3 files changed, 207 insertions(+), 75 deletions(-)
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index 7bb45f347..01d5dc869 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -2408,12 +2408,90 @@ rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
return FALSE;
}
+/**
+ * Pushes a token onto the Lua stack as an array-like table:
+ * [1] stemmed, [2] normalized and [3] original form (an empty string is
+ * stored when the corresponding part has zero length), and [4] a table
+ * listing the names of the flags that are set on the token.
+ * The resulting table is left on top of the stack.
+ * @param L Lua state
+ * @param w token to convert
+ */
+void
+rspamd_lua_push_full_word (lua_State *L, rspamd_stat_token_t *w)
+{
+	/* Flag names in the order they are appended to the flags table */
+	static const struct {
+		guint64 mask;
+		const gchar *name;
+	} flag_names[] = {
+		{RSPAMD_STAT_TOKEN_FLAG_NORMALISED, "normalised"},
+		{RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE, "broken_unicode"},
+		{RSPAMD_STAT_TOKEN_FLAG_UTF, "utf"},
+		{RSPAMD_STAT_TOKEN_FLAG_TEXT, "text"},
+		{RSPAMD_STAT_TOKEN_FLAG_HEADER, "header"},
+		{RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META, "meta"},
+		{RSPAMD_STAT_TOKEN_FLAG_STOP_WORD, "stop_word"},
+		{RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES, "invisible_spaces"},
+		{RSPAMD_STAT_TOKEN_FLAG_STEMMED, "stemmed"},
+	};
+	const gchar *part_begin[3];
+	gsize part_len[3];
+	gint next_idx = 1;
+	guint i;
+
+	part_begin[0] = w->stemmed.begin;
+	part_len[0] = w->stemmed.len;
+	part_begin[1] = w->normalized.begin;
+	part_len[1] = w->normalized.len;
+	part_begin[2] = w->original.begin;
+	part_len[2] = w->original.len;
+
+	lua_createtable (L, 4, 0);
+
+	/* Textual parts occupy indexes 1..3 */
+	for (i = 0; i < 3; i ++) {
+		if (part_len[i] > 0) {
+			lua_pushlstring (L, part_begin[i], part_len[i]);
+		}
+		else {
+			lua_pushstring (L, "");
+		}
+
+		lua_rawseti (L, -2, i + 1);
+	}
+
+	/* Flags part */
+	lua_createtable (L, 4, 0);
+
+	for (i = 0; i < G_N_ELEMENTS (flag_names); i ++) {
+		if ((w->flags & flag_names[i].mask) != 0) {
+			lua_pushstring (L, flag_names[i].name);
+			lua_rawseti (L, -2, next_idx ++);
+		}
+	}
+
+	/* Attach the flags table as the fourth element of the word table */
+	lua_rawseti (L, -2, 4);
+}
+
gint
rspamd_lua_push_words (lua_State *L, GArray *words,
enum rspamd_lua_words_type how)
{
rspamd_stat_token_t *w;
- guint i, cnt, fl_cnt;
+ guint i, cnt;
lua_createtable (L, words->len, 0);
@@ -2440,78 +2518,7 @@ rspamd_lua_push_words (lua_State *L, GArray *words,
}
break;
case RSPAMD_LUA_WORDS_FULL:
- lua_createtable (L, 4, 0);
-
- if (w->stemmed.len > 0) {
- lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
- lua_rawseti (L, -2, 1);
- }
- else {
- lua_pushstring (L, "");
- lua_rawseti (L, -2, 1);
- }
-
- if (w->normalized.len > 0) {
- lua_pushlstring (L, w->normalized.begin, w->normalized.len);
- lua_rawseti (L, -2, 2);
- }
- else {
- lua_pushstring (L, "");
- lua_rawseti (L, -2, 2);
- }
-
- if (w->original.len > 0) {
- lua_pushlstring (L, w->original.begin, w->original.len);
- lua_rawseti (L, -2, 3);
- }
- else {
- lua_pushstring (L, "");
- lua_rawseti (L, -2, 3);
- }
-
- /* Flags part */
- fl_cnt = 1;
- lua_createtable (L, 4, 0);
-
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
- lua_pushstring (L, "normalised");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
- lua_pushstring (L, "broken_unicode");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
- lua_pushstring (L, "utf");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
- lua_pushstring (L, "text");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
- lua_pushstring (L, "header");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
- lua_pushstring (L, "meta");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
- lua_pushstring (L, "stop_word");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
- lua_pushstring (L, "invisible_spaces");
- lua_rawseti (L, -2, fl_cnt ++);
- }
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
- lua_pushstring (L, "stemmed");
- lua_rawseti (L, -2, fl_cnt ++);
- }
-
- lua_rawseti (L, -2, 4);
-
+ rspamd_lua_push_full_word (L, w);
/* Push to the resulting vector */
lua_rawseti (L, -2, cnt ++);
break;
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 25f5b7ff4..31d7f852b 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -433,6 +433,13 @@ gboolean rspamd_lua_require_function (lua_State *L, const gchar *modname,
gboolean rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
struct rspamd_config *cfg, gint *ref_id);
+struct rspamd_stat_token_s;
+/**
+ * Pushes a single word into Lua as a table of four elements:
+ * [1] stemmed, [2] normalised and [3] original form of the word (an empty
+ * string is used when the corresponding form is empty) and [4] a table of
+ * flag names; the table is left on top of the Lua stack
+ * @param L Lua state to push the table onto
+ * @param word token to be converted
+ */
+void rspamd_lua_push_full_word (lua_State *L, struct rspamd_stat_token_s *word);
enum rspamd_lua_words_type {
RSPAMD_LUA_WORDS_STEM = 0,
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 6f4923dc8..a8a53f517 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -21,11 +21,12 @@
#include "unix-std.h"
#include "libmime/smtp_parsers.h"
#include "libserver/mempool_vars_internal.h"
+#include "libserver/dkim.h"
#include "libserver/task.h"
#include "libstat/stat_api.h"
+#include "libutil/map_helpers.h"
+
#include <math.h>
-#include <src/libserver/task.h>
-#include <src/libserver/dkim.h>
/***
* @module rspamd_task
@@ -958,6 +959,17 @@ LUA_FUNCTION_DEF (task, get_newlines_type);
*/
LUA_FUNCTION_DEF (task, get_stat_tokens);
+/***
+ * @method task:lookup_words(map, function({s, n, o, f}) ... end)
+ * Matches the normalised form of each word in a task (words of text parts and meta words)
+ * against some map (set, hash, regexp or multiple regexp map; other map types raise an error)
+ * and calls the specified function for each matched word with a table containing 4 values:
+ * - [1] - stemmed word
+ * - [2] - normalised word
+ * - [3] - raw word
+ * - [4] - flags (table of strings)
+ * Words whose normalised form is empty are skipped.
+ * @param {map} map map object to look words up in
+ * @param {function} callback function that is called for each matched word
+ * @return {number} number of matched words
+ */
+LUA_FUNCTION_DEF (task, lookup_words);
+
static const struct luaL_reg tasklib_f[] = {
LUA_INTERFACE_DEF (task, load_from_file),
LUA_INTERFACE_DEF (task, load_from_string),
@@ -1060,6 +1072,7 @@ static const struct luaL_reg tasklib_m[] = {
LUA_INTERFACE_DEF (task, get_newlines_type),
LUA_INTERFACE_DEF (task, get_stat_tokens),
LUA_INTERFACE_DEF (task, get_meta_words),
+ LUA_INTERFACE_DEF (task, lookup_words),
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}
};
@@ -5171,6 +5184,111 @@ lua_task_get_meta_words (lua_State *L)
return 1;
}
+/**
+ * Matches the normalized form of every token in `words` against `map` and
+ * calls the Lua function at stack index `cbpos` for each match, passing the
+ * table produced by rspamd_lua_push_full_word. Tokens with an empty
+ * normalized form are skipped. Errors raised by the callback are logged and
+ * do not stop the iteration; the Lua stack is restored after each call.
+ * @param L Lua state
+ * @param cbpos stack index of the callback function
+ * @param task task used for logging
+ * @param map map to match against (set, hash, regexp or multiple regexp)
+ * @param words array of rspamd_stat_token_t
+ * @return number of matched tokens
+ */
+static guint
+lua_lookup_words_array (lua_State *L,
+		gint cbpos,
+		struct rspamd_task *task,
+		struct rspamd_lua_map *map,
+		GArray *words)
+{
+	rspamd_stat_token_t *tok;
+	guint i, nmatched = 0;
+	gint err_idx;
+	gboolean matched;
+	const gchar *key;
+	gsize keylen;
+
+	for (i = 0; i < words->len; i ++) {
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		matched = FALSE;
+
+		if (tok->normalized.len == 0) {
+			continue;
+		}
+
+		key = tok->normalized.begin;
+		keylen = tok->normalized.len;
+
+		switch (map->type) {
+		case RSPAMD_LUA_MAP_SET:
+		case RSPAMD_LUA_MAP_HASH:
+			/* We know that tok->normalized is zero terminated in fact */
+			if (rspamd_match_hash_map (map->data.hash, key)) {
+				matched = TRUE;
+			}
+			break;
+		case RSPAMD_LUA_MAP_REGEXP:
+		case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+			if (rspamd_match_regexp_map_single (map->data.re_map, key,
+					keylen)) {
+				matched = TRUE;
+			}
+			break;
+		default:
+			g_assert_not_reached ();
+			break;
+		}
+
+		if (matched) {
+			nmatched ++;
+
+			lua_pushcfunction (L, &rspamd_lua_traceback);
+			err_idx = lua_gettop (L);
+			lua_pushvalue (L, cbpos); /* Function */
+			rspamd_lua_push_full_word (L, tok);
+
+			if (lua_pcall (L, 1, 0, err_idx) != 0) {
+				GString *tb = lua_touserdata (L, -1);
+
+				if (tb != NULL) {
+					msg_err_task ("cannot call callback function for lookup words: %s",
+							tb->str);
+					g_string_free (tb, TRUE);
+				}
+				else {
+					/*
+					 * The error value is not a traceback object (e.g. the
+					 * error handler was not run or failed itself), so it
+					 * must not be dereferenced as GString
+					 */
+					const gchar *errstr = lua_tostring (L, -1);
+
+					msg_err_task ("cannot call callback function for lookup words: %s",
+							errstr != NULL ? errstr : "unknown error");
+				}
+			}
+
+			/* Drop the error value (if any) and the traceback function */
+			lua_settop (L, err_idx - 1);
+		}
+	}
+
+	return nmatched;
+}
+
+/**
+ * Lua binding for task:lookup_words(map, callback): looks up the words of
+ * all text parts and the meta words of a task in the given map, calling
+ * the callback (stack index 3) for each match, and pushes the total number
+ * of matches. Raises a Lua error on invalid arguments or an unsupported
+ * map type.
+ */
+static gint
+lua_task_lookup_words (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_task *task = lua_check_task (L, 1);
+	struct rspamd_lua_map *map = lua_check_map (L, 2);
+	struct rspamd_mime_text_part *part;
+	guint idx, total = 0;
+
+	if (!(task != NULL && map != NULL && lua_type (L, 3) == LUA_TFUNCTION)) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	/* Only set-like and regexp maps can be matched against words */
+	switch (map->type) {
+	case RSPAMD_LUA_MAP_SET:
+	case RSPAMD_LUA_MAP_REGEXP:
+	case RSPAMD_LUA_MAP_HASH:
+	case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+		break;
+	default:
+		return luaL_error (L, "invalid map type");
+	}
+
+	/* Words of each text part first */
+	PTR_ARRAY_FOREACH (task->text_parts, idx, part) {
+		if (part->utf_words != NULL) {
+			total += lua_lookup_words_array (L, 3, task, map,
+					part->utf_words);
+		}
+	}
+
+	/* Then the meta words of the task itself */
+	if (task->meta_words != NULL) {
+		total += lua_lookup_words_array (L, 3, task, map, task->meta_words);
+	}
+
+	lua_pushinteger (L, total);
+
+	return 1;
+}
+
+
/* Image functions */
static gint
lua_image_get_width (lua_State *L)
More information about the Commits
mailing list