commit fa15fa2: [Rework] Move parsers to a separate lua library

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Nov 11 13:49:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-11-11 13:41:36 +0000
URL: https://github.com/rspamd/rspamd/commit/fa15fa29ca5a89e95d5cf90009668814b0032cf9 (HEAD -> master)

[Rework] Move parsers to a separate lua library

---
 src/lua/CMakeLists.txt |   3 +-
 src/lua/lua_common.c   |   4 +-
 src/lua/lua_common.h   |   2 +
 src/lua/lua_parsers.c  | 418 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/lua/lua_parsers.h  |  88 +++++++++++
 src/lua/lua_util.c     | 293 ++--------------------------------
 6 files changed, 521 insertions(+), 287 deletions(-)

diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt
index 84c819c2d..7f31aac98 100644
--- a/src/lua/CMakeLists.txt
+++ b/src/lua/CMakeLists.txt
@@ -32,6 +32,7 @@ SET(LUASRC			  ${CMAKE_CURRENT_SOURCE_DIR}/lua_common.c
 		 			  ${CMAKE_CURRENT_SOURCE_DIR}/lua_worker.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_kann.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_spf.c
-					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_tensor.c)
+					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_tensor.c
+					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_parsers.c)
 
 SET(RSPAMD_LUA ${LUASRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index b7fcc2034..87474793c 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -980,14 +980,12 @@ rspamd_lua_init (bool wipe_mem)
 	luaopen_kann (L);
 	luaopen_spf (L);
 	luaopen_tensor (L);
+	luaopen_parsers (L);
 #ifndef WITH_LUAJIT
 	rspamd_lua_add_preload (L, "bit", luaopen_bit);
 	lua_settop (L, 0);
 #endif
 
-	rspamd_lua_new_class (L, "rspamd{ev_base}", NULL);
-	lua_pop (L, 1);
-
 	rspamd_lua_new_class (L, "rspamd{session}", NULL);
 	lua_pop (L, 1);
 
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index f1b59ca28..87caa6206 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -385,6 +385,8 @@ void luaopen_spf (lua_State *L);
 
 void luaopen_tensor (lua_State *L);
 
+void luaopen_parsers (lua_State *L);
+
 void rspamd_lua_dostring (const gchar *line);
 
 double rspamd_lua_normalize (struct rspamd_config *cfg,
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
new file mode 100644
index 000000000..01d7fecc7
--- /dev/null
+++ b/src/lua/lua_parsers.c
@@ -0,0 +1,418 @@
+/*-
+ * Copyright 2020 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lua_common.h"
+#include "tokenizers/tokenizers.h"
+#include "contrib/uthash/utlist.h"
+#include "libserver/html.h"
+#include "libmime/email_addr.h"
+#include "libmime/content_type.h"
+#include "libmime/mime_headers.h"
+#include "libmime/smtp_parsers.h"
+#include "lua_parsers.h"
+
+/***
+ * @module rspamd_parsers
+ * This module contains Lua-C interfaces to Rspamd parsers of different kind.
+ */
+
+/***
+ * @function parsers.tokenize_text(input[, exceptions])
+ * Create tokens from a text using optional exceptions list
+ * @param {text/string} input input data
+ * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
+ * @return {table/strings} list of strings representing words in the text
+ */
+
+
+/***
+ * @function parsers.parse_html(input)
+ * Parses HTML and returns the corresponding text
+ * @param {string|text} input input HTML
+ * @return {rspamd_text} processed text with no HTML tags
+ */
+
+/***
+ * @function parsers.parse_mail_address(str, [pool])
+ * Parses email address and returns a table of tables in the following format:
+ *
+ * - `raw` - the original value without any processing
+ * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
+ * - `addr` - address part of the address
+ * - `user` - user part (if present) of the address, e.g. `blah`
+ * - `domain` - domain part (if present), e.g. `foo.com`
+ * - `flags` - table with following keys set to true if given condition fulfilled:
+ *   - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
+ *   - [ip] - domain is IPv4/IPv6 address
+ *   - [braced] - angled `<blah@foo.com>` address
+ *   - [quoted] - quoted user part
+ *   - [empty] - empty address
+ *   - [backslash] - user part contains backslash
+ *   - [8bit] - contains 8bit characters
+ *
+ * @param {string} str input string
+ * @param {rspamd_mempool} pool memory pool to use
+ * @return {table/tables} parsed list of mail addresses
+ */
+
+/***
+ * @function parsers.parse_content_type(ct_string, mempool)
+ * Parses content-type string to a table:
+ * - `type`
+ * - `subtype`
+ * - `charset`
+ * - `boundary`
+ * - other attributes
+ *
+ * @param {string} ct_string content type as string
+ * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
+ * @return table or nil if cannot parse content type
+ */
+
+/***
+ * @function parsers.parse_smtp_date(str[, local_tz])
+ * Converts an SMTP date string to unix timestamp
+ * @param {string} str input string
+ * @param {boolean} local_tz convert to local tz if `true`
+ * @return {number} time as unix timestamp (converted to float)
+ */
+
+static const struct luaL_reg parserslib_f[] = {
+	LUA_INTERFACE_DEF (parsers, tokenize_text),
+	LUA_INTERFACE_DEF (parsers, parse_html),
+	LUA_INTERFACE_DEF (parsers, parse_mail_address),
+	LUA_INTERFACE_DEF (parsers, parse_content_type),
+	LUA_INTERFACE_DEF (parsers, parse_smtp_date),
+
+	{NULL, NULL}
+};
+
+gint
+lua_parsers_tokenize_text (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	const gchar *in = NULL;
+	gsize len = 0, pos, ex_len, i;
+	GList *exceptions = NULL, *cur;
+	struct rspamd_lua_text *t;
+	struct rspamd_process_exception *ex;
+	UText utxt = UTEXT_INITIALIZER;
+	GArray *res;
+	rspamd_stat_token_t *w;
+
+	if (lua_type (L, 1) == LUA_TSTRING) {
+		in = luaL_checklstring (L, 1, &len);
+	}
+	else if (lua_type (L, 1) == LUA_TUSERDATA) {
+		t = lua_check_text (L, 1);
+
+		if (t) {
+			in = t->start;
+			len = t->len;
+		}
+	}
+
+	if (in == NULL) {
+		lua_pushnil (L);
+		return 1;
+	}
+
+	if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
+		lua_pushvalue (L, 2);
+		lua_pushnil (L);
+
+		while (lua_next (L, -2) != 0) {
+			if (lua_type (L, -1) == LUA_TTABLE) {
+				lua_rawgeti (L, -1, 1);
+				pos = luaL_checknumber (L, -1);
+				lua_pop (L, 1);
+				lua_rawgeti (L, -1, 2);
+				ex_len = luaL_checknumber (L, -1);
+				lua_pop (L, 1);
+
+				if (ex_len > 0) {
+					ex = g_malloc0 (sizeof (*ex));
+					ex->pos = pos;
+					ex->len = ex_len;
+					ex->type = RSPAMD_EXCEPTION_GENERIC;
+					exceptions = g_list_prepend (exceptions, ex);
+				}
+			}
+			lua_pop (L, 1);
+		}
+
+		lua_pop (L, 1);
+	}
+
+	if (exceptions) {
+		exceptions = g_list_reverse (exceptions);
+	}
+
+	UErrorCode uc_err = U_ZERO_ERROR;
+	utext_openUTF8 (&utxt,
+			in,
+			len,
+			&uc_err);
+
+	res = rspamd_tokenize_text ((gchar *)in, len,
+			&utxt,
+			RSPAMD_TOKENIZE_UTF, NULL,
+			exceptions,
+			NULL, NULL, NULL);
+
+	if (res == NULL) {
+		lua_pushnil (L);
+	}
+	else {
+		lua_createtable (L, res->len, 0);
+
+		for (i = 0; i < res->len; i ++) {
+			w = &g_array_index (res, rspamd_stat_token_t, i);
+			lua_pushlstring (L, w->original.begin, w->original.len);
+			lua_rawseti (L, -2, i + 1);
+		}
+	}
+
+	cur = exceptions;
+	while (cur) {
+		ex = cur->data;
+		g_free (ex);
+		cur = g_list_next (cur);
+	}
+
+	g_list_free (exceptions);
+	utext_close (&utxt);
+
+	return 1;
+}
+
+gint
+lua_parsers_parse_html (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t;
+	const gchar *start = NULL;
+	gsize len;
+	GByteArray *res, *in;
+	rspamd_mempool_t *pool;
+	struct html_content *hc;
+
+	if (lua_type (L, 1) == LUA_TUSERDATA) {
+		t = lua_check_text (L, 1);
+
+		if (t != NULL) {
+			start = t->start;
+			len = t->len;
+		}
+	}
+	else if (lua_type (L, 1) == LUA_TSTRING) {
+		start = luaL_checklstring (L, 1, &len);
+	}
+
+	if (start != NULL) {
+		pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0);
+		hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
+		in = g_byte_array_sized_new (len);
+		g_byte_array_append (in, start, len);
+
+		res = rspamd_html_process_part (pool, hc, in);
+
+		t = lua_newuserdata (L, sizeof (*t));
+		rspamd_lua_setclass (L, "rspamd{text}", -1);
+		t->start = res->data;
+		t->len = res->len;
+		t->flags = RSPAMD_TEXT_FLAG_OWN;
+
+		g_byte_array_free (res, FALSE);
+		g_byte_array_free (in, TRUE);
+		rspamd_mempool_delete (pool);
+	}
+	else {
+		lua_pushnil (L);
+	}
+
+	return 1;
+}
+
+gint
+lua_parsers_parse_mail_address (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	GPtrArray *addrs;
+	gsize len;
+	const gchar *str = luaL_checklstring (L, 1, &len);
+	rspamd_mempool_t *pool;
+	gboolean own_pool = FALSE;
+
+	if (str) {
+
+		if (lua_type (L, 2) == LUA_TUSERDATA) {
+			pool = rspamd_lua_check_mempool (L, 2);
+
+			if (pool == NULL) {
+				return luaL_error (L, "invalid arguments");
+			}
+		}
+		else {
+			pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
+					"lua util", 0);
+			own_pool = TRUE;
+		}
+
+		addrs = rspamd_email_address_from_mime (pool, str, len, NULL, -1);
+
+		if (addrs == NULL) {
+			lua_pushnil (L);
+		}
+		else {
+			lua_push_emails_address_list (L, addrs, 0);
+		}
+
+		if (own_pool) {
+			rspamd_mempool_delete (pool);
+		}
+	}
+	else {
+		lua_pushnil (L);
+	}
+
+	return 1;
+}
+
+gint
+lua_parsers_parse_content_type (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	gsize len;
+	const gchar *ct_str = luaL_checklstring (L, 1, &len);
+	rspamd_mempool_t *pool = rspamd_lua_check_mempool (L, 2);
+	struct rspamd_content_type *ct;
+
+	if (!ct_str || !pool) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	ct = rspamd_content_type_parse (ct_str, len, pool);
+
+	if (ct == NULL) {
+		lua_pushnil (L);
+	}
+	else {
+		GHashTableIter it;
+		gpointer k, v;
+
+		lua_createtable (L, 0, 4 + (ct->attrs ? g_hash_table_size (ct->attrs) : 0));
+
+		if (ct->type.len > 0) {
+			lua_pushstring (L, "type");
+			lua_pushlstring (L, ct->type.begin, ct->type.len);
+			lua_settable (L, -3);
+		}
+
+		if (ct->subtype.len > 0) {
+			lua_pushstring (L, "subtype");
+			lua_pushlstring (L, ct->subtype.begin, ct->subtype.len);
+			lua_settable (L, -3);
+		}
+
+		if (ct->charset.len > 0) {
+			lua_pushstring (L, "charset");
+			lua_pushlstring (L, ct->charset.begin, ct->charset.len);
+			lua_settable (L, -3);
+		}
+
+		if (ct->orig_boundary.len > 0) {
+			lua_pushstring (L, "boundary");
+			lua_pushlstring (L, ct->orig_boundary.begin, ct->orig_boundary.len);
+			lua_settable (L, -3);
+		}
+
+		if (ct->attrs) {
+			g_hash_table_iter_init (&it, ct->attrs);
+
+			while (g_hash_table_iter_next (&it, &k, &v)) {
+				struct rspamd_content_type_param *param =
+						(struct rspamd_content_type_param *)v, *cur;
+				guint i = 1;
+
+				lua_pushlstring (L, param->name.begin, param->name.len);
+				lua_createtable (L, 1, 0);
+
+				DL_FOREACH (param, cur) {
+					lua_pushlstring (L, cur->value.begin, cur->value.len);
+					lua_rawseti (L, -2, i++);
+				}
+
+				lua_settable (L, -3);
+			}
+		}
+	}
+
+	return 1;
+}
+
+int
+lua_parsers_parse_smtp_date (lua_State *L)
+{
+	gsize slen;
+	const gchar *str = lua_tolstring (L, 1, &slen);
+	GError *err = NULL;
+
+	if (str == NULL) {
+		return luaL_argerror (L, 1, "invalid argument");
+	}
+
+	time_t tt = rspamd_parse_smtp_date (str, slen, &err);
+
+	if (err == NULL) {
+		if (lua_isboolean (L, 2) && !!lua_toboolean (L, 2)) {
+			struct tm t;
+
+			rspamd_localtime (tt, &t);
+#if !defined(__sun)
+			t.tm_gmtoff = 0;
+#endif
+			t.tm_isdst = 0;
+			tt = mktime (&t);
+		}
+
+		lua_pushnumber (L, tt);
+	}
+	else {
+		lua_pushnil (L);
+		lua_pushstring (L, err->message);
+		g_error_free (err);
+
+		return 2;
+	}
+
+	return 1;
+}
+
+static gint
+lua_load_parsers (lua_State * L)
+{
+	lua_newtable (L);
+	luaL_register (L, NULL, parserslib_f);
+
+	return 1;
+}
+
+void
+luaopen_parsers (lua_State * L)
+{
+	rspamd_lua_add_preload (L, "rspamd_parsers", lua_load_parsers);
+}
\ No newline at end of file
diff --git a/src/lua/lua_parsers.h b/src/lua/lua_parsers.h
new file mode 100644
index 000000000..900072a10
--- /dev/null
+++ b/src/lua/lua_parsers.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright 2020 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LUA_PARSERS_H
+#define RSPAMD_LUA_PARSERS_H
+
+#include "lua_common.h"
+
+/***
+ * @function parsers.tokenize_text(input[, exceptions])
+ * Create tokens from a text using optional exceptions list
+ * @param {text/string} input input data
+ * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
+ * @return {table/strings} list of strings representing words in the text
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, tokenize_text);
+
+/***
+ * @function parsers.parse_html(input)
+ * Parses HTML and returns the corresponding text
+ * @param {string|text} input input HTML
+ * @return {rspamd_text} processed text with no HTML tags
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_html);
+
+/***
+ * @function parsers.parse_mail_address(str, [pool])
+ * Parses email address and returns a table of tables in the following format:
+ *
+ * - `raw` - the original value without any processing
+ * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
+ * - `addr` - address part of the address
+ * - `user` - user part (if present) of the address, e.g. `blah`
+ * - `domain` - domain part (if present), e.g. `foo.com`
+ * - `flags` - table with following keys set to true if given condition fulfilled:
+ *   - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
+ *   - [ip] - domain is IPv4/IPv6 address
+ *   - [braced] - angled `<blah@foo.com>` address
+ *   - [quoted] - quoted user part
+ *   - [empty] - empty address
+ *   - [backslash] - user part contains backslash
+ *   - [8bit] - contains 8bit characters
+ *
+ * @param {string} str input string
+ * @param {rspamd_mempool} pool memory pool to use
+ * @return {table/tables} parsed list of mail addresses
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_mail_address);
+
+/***
+ * @function parsers.parse_content_type(ct_string, mempool)
+ * Parses content-type string to a table:
+ * - `type`
+ * - `subtype`
+ * - `charset`
+ * - `boundary`
+ * - other attributes
+ *
+ * @param {string} ct_string content type as string
+ * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
+ * @return table or nil if cannot parse content type
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_content_type);
+
+/***
+ * @function parsers.parse_smtp_date(str[, local_tz])
+ * Converts an SMTP date string to unix timestamp
+ * @param {string} str input string
+ * @param {boolean} local_tz convert to local tz if `true`
+ * @return {number} time as unix timestamp (converted to float)
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_smtp_date);
+
+
+#endif //RSPAMD_LUA_PARSERS_H
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1a2b52f80..e879d37af 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 #include "lua_common.h"
-#include "html.h"
-#include "tokenizers/tokenizers.h"
 #include "unix-std.h"
 #include "contrib/zstd/zstd.h"
-#include "contrib/uthash/utlist.h"
 #include "libmime/email_addr.h"
 #include "libmime/content_type.h"
 #include "libmime/mime_headers.h"
 #include "libutil/hash.h"
 
+#include "lua_parsers.h"
+
 #ifdef WITH_LUA_REPL
 #include "replxx.h"
 #endif
@@ -34,7 +33,6 @@
 
 #include "unicode/uspoof.h"
 #include "unicode/uscript.h"
-#include "libmime/smtp_parsers.h"
 #include "contrib/fastutf8/fastutf8.h"
 
 /***
@@ -1313,100 +1311,7 @@ lua_util_decode_url (lua_State *L)
 static gint
 lua_util_tokenize_text (lua_State *L)
 {
-	LUA_TRACE_POINT;
-	const gchar *in = NULL;
-	gsize len = 0, pos, ex_len, i;
-	GList *exceptions = NULL, *cur;
-	struct rspamd_lua_text *t;
-	struct rspamd_process_exception *ex;
-	UText utxt = UTEXT_INITIALIZER;
-	GArray *res;
-	rspamd_stat_token_t *w;
-
-	if (lua_type (L, 1) == LUA_TSTRING) {
-		in = luaL_checklstring (L, 1, &len);
-	}
-	else if (lua_type (L, 1) == LUA_TUSERDATA) {
-		t = lua_check_text (L, 1);
-
-		if (t) {
-			in = t->start;
-			len = t->len;
-		}
-	}
-
-	if (in == NULL) {
-		lua_pushnil (L);
-		return 1;
-	}
-
-	if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
-		lua_pushvalue (L, 2);
-		lua_pushnil (L);
-
-		while (lua_next (L, -2) != 0) {
-			if (lua_type (L, -1) == LUA_TTABLE) {
-				lua_rawgeti (L, -1, 1);
-				pos = luaL_checknumber (L, -1);
-				lua_pop (L, 1);
-				lua_rawgeti (L, -1, 2);
-				ex_len = luaL_checknumber (L, -1);
-				lua_pop (L, 1);
-
-				if (ex_len > 0) {
-					ex = g_malloc0 (sizeof (*ex));
-					ex->pos = pos;
-					ex->len = ex_len;
-					ex->type = RSPAMD_EXCEPTION_GENERIC;
-					exceptions = g_list_prepend (exceptions, ex);
-				}
-			}
-			lua_pop (L, 1);
-		}
-
-		lua_pop (L, 1);
-	}
-
-	if (exceptions) {
-		exceptions = g_list_reverse (exceptions);
-	}
-
-	UErrorCode uc_err = U_ZERO_ERROR;
-	utext_openUTF8 (&utxt,
-			in,
-			len,
-			&uc_err);
-
-	res = rspamd_tokenize_text ((gchar *)in, len,
-			&utxt,
-			RSPAMD_TOKENIZE_UTF, NULL,
-			exceptions,
-			NULL, NULL, NULL);
-
-	if (res == NULL) {
-		lua_pushnil (L);
-	}
-	else {
-		lua_createtable (L, res->len, 0);
-
-		for (i = 0; i < res->len; i ++) {
-			w = &g_array_index (res, rspamd_stat_token_t, i);
-			lua_pushlstring (L, w->original.begin, w->original.len);
-			lua_rawseti (L, -2, i + 1);
-		}
-	}
-
-	cur = exceptions;
*** OUTPUT TRUNCATED, 248 LINES SKIPPED ***


More information about the Commits mailing list