commit e1b043f: [Feature] Add method task:lookup_words

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Dec 27 18:28:03 UTC 2018


Author: Vsevolod Stakhov
Date: 2018-12-05 18:06:12 +0000
URL: https://github.com/rspamd/rspamd/commit/e1b043f8bf7970278f55ae7ca1a106dee6c4fa98

[Feature] Add method task:lookup_words

---
 src/lua/lua_common.c | 153 +++++++++++++++++++++++++++------------------------
 src/lua/lua_common.h |   7 +++
 src/lua/lua_task.c   | 122 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 207 insertions(+), 75 deletions(-)

diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index 7bb45f347..01d5dc869 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -2408,12 +2408,90 @@ rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
 	return FALSE;
 }
 
+void
+rspamd_lua_push_full_word (lua_State *L, rspamd_stat_token_t *w)
+{
+	gint fl_cnt;
+
+	lua_createtable (L, 4, 0);
+
+	if (w->stemmed.len > 0) {
+		lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+		lua_rawseti (L, -2, 1);
+	}
+	else {
+		lua_pushstring (L, "");
+		lua_rawseti (L, -2, 1);
+	}
+
+	if (w->normalized.len > 0) {
+		lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+		lua_rawseti (L, -2, 2);
+	}
+	else {
+		lua_pushstring (L, "");
+		lua_rawseti (L, -2, 2);
+	}
+
+	if (w->original.len > 0) {
+		lua_pushlstring (L, w->original.begin, w->original.len);
+		lua_rawseti (L, -2, 3);
+	}
+	else {
+		lua_pushstring (L, "");
+		lua_rawseti (L, -2, 3);
+	}
+
+	/* Flags part */
+	fl_cnt = 1;
+	lua_createtable (L, 4, 0);
+
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
+		lua_pushstring (L, "normalised");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
+		lua_pushstring (L, "broken_unicode");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+		lua_pushstring (L, "utf");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+		lua_pushstring (L, "text");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
+		lua_pushstring (L, "header");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
+		lua_pushstring (L, "meta");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
+		lua_pushstring (L, "stop_word");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
+		lua_pushstring (L, "invisible_spaces");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+	if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
+		lua_pushstring (L, "stemmed");
+		lua_rawseti (L, -2, fl_cnt ++);
+	}
+
+	lua_rawseti (L, -2, 4);
+}
+
 gint
 rspamd_lua_push_words (lua_State *L, GArray *words,
 							enum rspamd_lua_words_type how)
 {
 	rspamd_stat_token_t *w;
-	guint i, cnt, fl_cnt;
+	guint i, cnt;
 
 	lua_createtable (L, words->len, 0);
 
@@ -2440,78 +2518,7 @@ rspamd_lua_push_words (lua_State *L, GArray *words,
 			}
 			break;
 		case RSPAMD_LUA_WORDS_FULL:
-			lua_createtable (L, 4, 0);
-
-			if (w->stemmed.len > 0) {
-				lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
-				lua_rawseti (L, -2, 1);
-			}
-			else {
-				lua_pushstring (L, "");
-				lua_rawseti (L, -2, 1);
-			}
-
-			if (w->normalized.len > 0) {
-				lua_pushlstring (L, w->normalized.begin, w->normalized.len);
-				lua_rawseti (L, -2, 2);
-			}
-			else {
-				lua_pushstring (L, "");
-				lua_rawseti (L, -2, 2);
-			}
-
-			if (w->original.len > 0) {
-				lua_pushlstring (L, w->original.begin, w->original.len);
-				lua_rawseti (L, -2, 3);
-			}
-			else {
-				lua_pushstring (L, "");
-				lua_rawseti (L, -2, 3);
-			}
-
-			/* Flags part */
-			fl_cnt = 1;
-			lua_createtable (L, 4, 0);
-
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
-				lua_pushstring (L, "normalised");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
-				lua_pushstring (L, "broken_unicode");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
-				lua_pushstring (L, "utf");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
-				lua_pushstring (L, "text");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
-				lua_pushstring (L, "header");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
-				lua_pushstring (L, "meta");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
-				lua_pushstring (L, "stop_word");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
-				lua_pushstring (L, "invisible_spaces");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
-				lua_pushstring (L, "stemmed");
-				lua_rawseti (L, -2, fl_cnt ++);
-			}
-
-			lua_rawseti (L, -2, 4);
-
+			rspamd_lua_push_full_word (L, w);
 			/* Push to the resulting vector */
 			lua_rawseti (L, -2, cnt ++);
 			break;
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 25f5b7ff4..31d7f852b 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -433,6 +433,13 @@ gboolean rspamd_lua_require_function (lua_State *L, const gchar *modname,
 gboolean rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
 		struct rspamd_config *cfg, gint *ref_id);
 
+struct rspamd_stat_token_s;
+/**
+ * Pushes a single word onto the Lua stack as a table {stemmed, normalized, original, flags}
+ * @param L
+ * @param word
+ */
+void rspamd_lua_push_full_word (lua_State *L, struct rspamd_stat_token_s *word);
 
 enum rspamd_lua_words_type {
 	RSPAMD_LUA_WORDS_STEM = 0,
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 6f4923dc8..a8a53f517 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -21,11 +21,12 @@
 #include "unix-std.h"
 #include "libmime/smtp_parsers.h"
 #include "libserver/mempool_vars_internal.h"
+#include "libserver/dkim.h"
 #include "libserver/task.h"
 #include "libstat/stat_api.h"
+#include "libutil/map_helpers.h"
+
 #include <math.h>
-#include <src/libserver/task.h>
-#include <src/libserver/dkim.h>
 
 /***
  * @module rspamd_task
@@ -958,6 +959,17 @@ LUA_FUNCTION_DEF (task, get_newlines_type);
  */
 LUA_FUNCTION_DEF (task, get_stat_tokens);
 
+/***
+ * @method task:lookup_words(map, function({s, n, o, f}) ... end)
+ * Matches words in a task (including meta words) against some map (set, regexp and so on)
+ * and calls the specified function for each matched word with a table containing 4 values:
+ *   - [1] - stemmed word
+ *   - [2] - normalised word
+ *   - [3] - raw word
+ *   - [4] - flags (table of strings)
+ */
+LUA_FUNCTION_DEF (task, lookup_words);
+
 static const struct luaL_reg tasklib_f[] = {
 	LUA_INTERFACE_DEF (task, load_from_file),
 	LUA_INTERFACE_DEF (task, load_from_string),
@@ -1060,6 +1072,7 @@ static const struct luaL_reg tasklib_m[] = {
 	LUA_INTERFACE_DEF (task, get_newlines_type),
 	LUA_INTERFACE_DEF (task, get_stat_tokens),
 	LUA_INTERFACE_DEF (task, get_meta_words),
+	LUA_INTERFACE_DEF (task, lookup_words),
 	{"__tostring", rspamd_lua_class_tostring},
 	{NULL, NULL}
 };
@@ -5171,6 +5184,111 @@ lua_task_get_meta_words (lua_State *L)
 	return 1;
 }
 
+static guint
+lua_lookup_words_array (lua_State *L,
+						gint cbpos,
+						struct rspamd_task *task,
+						struct rspamd_lua_map *map,
+						GArray *words)
+{
+	rspamd_stat_token_t *tok;
+	guint i, nmatched = 0;
+	gint err_idx;
+	gboolean matched;
+	const gchar *key;
+	gsize keylen;
+
+	for (i = 0; i < words->len; i ++) {
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		matched = FALSE;
+
+		if (tok->normalized.len == 0) {
+			continue;
+		}
+
+		key = tok->normalized.begin;
+		keylen = tok->normalized.len;
+
+		switch (map->type) {
+		case RSPAMD_LUA_MAP_SET:
+		case RSPAMD_LUA_MAP_HASH:
+			/* We know that tok->normalized is zero terminated in fact */
+			if (rspamd_match_hash_map (map->data.hash, key)) {
+				matched = TRUE;
+			}
+			break;
+		case RSPAMD_LUA_MAP_REGEXP:
+		case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+			if (rspamd_match_regexp_map_single (map->data.re_map, key,
+					keylen)) {
+				matched = TRUE;
+			}
+			break;
+		default:
+			g_assert_not_reached ();
+			break;
+		}
+
+		if (matched) {
+			nmatched ++;
+
+			lua_pushcfunction (L, &rspamd_lua_traceback);
+			err_idx = lua_gettop (L);
+			lua_pushvalue (L, cbpos); /* Function */
+			rspamd_lua_push_full_word (L, tok);
+
+			if (lua_pcall (L, 1, 0, err_idx) != 0) {
+				GString *tb = lua_touserdata (L, -1);
+				msg_err_task ("cannot call callback function for lookup words: %s",
+						tb->str);
+				g_string_free (tb, TRUE);
+			}
+
+			lua_settop (L, err_idx - 1);
+		}
+	}
+
+	return nmatched;
+}
+
+static gint
+lua_task_lookup_words (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_task *task = lua_check_task (L, 1);
+	struct rspamd_lua_map *map = lua_check_map (L, 2);
+	struct rspamd_mime_text_part *tp;
+
+	guint i, matches = 0;
+
+	if (task == NULL || map == NULL || lua_type (L, 3) != LUA_TFUNCTION) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	if (map->type != RSPAMD_LUA_MAP_SET &&
+		map->type != RSPAMD_LUA_MAP_REGEXP &&
+		map->type != RSPAMD_LUA_MAP_HASH &&
+		map->type != RSPAMD_LUA_MAP_REGEXP_MULTIPLE) {
+		return luaL_error (L, "invalid map type");
+	}
+
+	PTR_ARRAY_FOREACH (task->text_parts, i, tp) {
+		if (tp->utf_words) {
+			matches += lua_lookup_words_array (L, 3, task, map, tp->utf_words);
+		}
+	}
+
+	if (task->meta_words) {
+		matches += lua_lookup_words_array (L, 3, task, map, task->meta_words);
+	}
+
+	lua_pushinteger (L, matches);
+
+	return 1;
+}
+
+
 /* Image functions */
 static gint
 lua_image_get_width (lua_State *L)


More information about the Commits mailing list