commit fa15fa2: [Rework] Move parsers to a separate lua library
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Nov 11 13:49:06 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-11-11 13:41:36 +0000
URL: https://github.com/rspamd/rspamd/commit/fa15fa29ca5a89e95d5cf90009668814b0032cf9 (HEAD -> master)
[Rework] Move parsers to a separate lua library
---
src/lua/CMakeLists.txt | 3 +-
src/lua/lua_common.c | 4 +-
src/lua/lua_common.h | 2 +
src/lua/lua_parsers.c | 418 +++++++++++++++++++++++++++++++++++++++++++++++++
src/lua/lua_parsers.h | 88 +++++++++++
src/lua/lua_util.c | 293 ++--------------------------------
6 files changed, 521 insertions(+), 287 deletions(-)
diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt
index 84c819c2d..7f31aac98 100644
--- a/src/lua/CMakeLists.txt
+++ b/src/lua/CMakeLists.txt
@@ -32,6 +32,7 @@ SET(LUASRC ${CMAKE_CURRENT_SOURCE_DIR}/lua_common.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_worker.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_kann.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_spf.c
- ${CMAKE_CURRENT_SOURCE_DIR}/lua_tensor.c)
+ ${CMAKE_CURRENT_SOURCE_DIR}/lua_tensor.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/lua_parsers.c)
SET(RSPAMD_LUA ${LUASRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index b7fcc2034..87474793c 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -980,14 +980,12 @@ rspamd_lua_init (bool wipe_mem)
luaopen_kann (L);
luaopen_spf (L);
luaopen_tensor (L);
+ luaopen_parsers (L);
#ifndef WITH_LUAJIT
rspamd_lua_add_preload (L, "bit", luaopen_bit);
lua_settop (L, 0);
#endif
- rspamd_lua_new_class (L, "rspamd{ev_base}", NULL);
- lua_pop (L, 1);
-
rspamd_lua_new_class (L, "rspamd{session}", NULL);
lua_pop (L, 1);
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index f1b59ca28..87caa6206 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -385,6 +385,8 @@ void luaopen_spf (lua_State *L);
void luaopen_tensor (lua_State *L);
+void luaopen_parsers (lua_State *L);
+
void rspamd_lua_dostring (const gchar *line);
double rspamd_lua_normalize (struct rspamd_config *cfg,
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
new file mode 100644
index 000000000..01d7fecc7
--- /dev/null
+++ b/src/lua/lua_parsers.c
@@ -0,0 +1,418 @@
+/*-
+ * Copyright 2020 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lua_common.h"
+#include "tokenizers/tokenizers.h"
+#include "contrib/uthash/utlist.h"
+#include "libserver/html.h"
+#include "libmime/email_addr.h"
+#include "libmime/content_type.h"
+#include "libmime/mime_headers.h"
+#include "libmime/smtp_parsers.h"
+#include "lua_parsers.h"
+
+/***
+ * @module rspamd_parsers
+ * This module contains Lua-C interfaces to Rspamd parsers of different kind.
+ */
+
+/***
+ * @function parsers.tokenize_text(input[, exceptions])
+ * Create tokens from a text using optional exceptions list
+ * @param {text/string} input input data
+ * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
+ * @return {table/strings} list of strings representing words in the text
+ */
+
+
+/***
+ * @function parsers.parse_html(input)
+ * Parses HTML and returns the according text
+ * @param {string|text} in input HTML
+ * @return {rspamd_text} processed text with no HTML tags
+ */
+
+/***
+ * @function parsers.parse_mail_address(str, [pool])
+ * Parses email address and returns a table of tables in the following format:
+ *
+ * - `raw` - the original value without any processing
+ * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
+ * - `addr` - address part of the address
+ * - `user` - user part (if present) of the address, e.g. `blah`
+ * - `domain` - domain part (if present), e.g. `foo.com`
+ * - `flags` - table with following keys set to true if given condition fulfilled:
+ * - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
+ * - [ip] - domain is IPv4/IPv6 address
+ * - [braced] - angled `<blah@foo.com>` address
+ * - [quoted] - quoted user part
+ * - [empty] - empty address
+ * - [backslash] - user part contains backslash
+ * - [8bit] - contains 8bit characters
+ *
+ * @param {string} str input string
+ * @param {rspamd_mempool} pool memory pool to use
+ * @return {table/tables} parsed list of mail addresses
+ */
+
+/***
+ * @function parsers.parse_content_type(ct_string, mempool)
+ * Parses content-type string to a table:
+ * - `type`
+ * - `subtype`
+ * - `charset`
+ * - `boundary`
+ * - other attributes
+ *
+ * @param {string} ct_string content type as string
+ * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
+ * @return table or nil if cannot parse content type
+ */
+
+/***
+ * @function parsers.parse_smtp_date(str[, local_tz])
+ * Converts an SMTP date string to unix timestamp
+ * @param {string} str input string
+ * @param {boolean} local_tz convert to local tz if `true`
+ * @return {number} time as unix timestamp (converted to float)
+ */
+/*
+ * Function table handed to luaL_register() by lua_load_parsers().
+ * LUA_INTERFACE_DEF is defined outside this file; presumably it pairs each
+ * Lua-visible name with the matching lua_parsers_<name> C function - confirm.
+ */
+static const struct luaL_reg parserslib_f[] = {
+ LUA_INTERFACE_DEF (parsers, tokenize_text),
+ LUA_INTERFACE_DEF (parsers, parse_html),
+ LUA_INTERFACE_DEF (parsers, parse_mail_address),
+ LUA_INTERFACE_DEF (parsers, parse_content_type),
+ LUA_INTERFACE_DEF (parsers, parse_smtp_date),
+
+ {NULL, NULL} /* sentinel entry that ends the list for luaL_register */
+};
+/*
+ * Lua binding for tokenize_text(input[, exceptions]).
+ * Argument 1 is a Lua string or a text userdata; argument 2, when it is a
+ * table, is read as a list of {pos, len} pairs that become tokenizer
+ * exceptions. Pushes a table of token strings, or nil if the input could
+ * not be read or the tokenizer returned NULL. Returns one value.
+ */
+gint
+lua_parsers_tokenize_text (lua_State *L)
+{
+ LUA_TRACE_POINT;
+ const gchar *in = NULL;
+ gsize len = 0, pos, ex_len, i;
+ GList *exceptions = NULL, *cur;
+ struct rspamd_lua_text *t;
+ struct rspamd_process_exception *ex;
+ UText utxt = UTEXT_INITIALIZER;
+ GArray *res;
+ rspamd_stat_token_t *w;
+
+ if (lua_type (L, 1) == LUA_TSTRING) {
+ in = luaL_checklstring (L, 1, &len);
+ }
+ else if (lua_type (L, 1) == LUA_TUSERDATA) {
+ t = lua_check_text (L, 1);
+
+ if (t) {
+ in = t->start;
+ len = t->len;
+ }
+ }
+
+ if (in == NULL) { /* neither a string nor a usable text userdata */
+ lua_pushnil (L);
+ return 1;
+ }
+
+ if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
+ lua_pushvalue (L, 2); /* copy of the table on top, so index -2 addresses it during lua_next */
+ lua_pushnil (L); /* initial key for lua_next */
+
+ while (lua_next (L, -2) != 0) {
+ if (lua_type (L, -1) == LUA_TTABLE) {
+ lua_rawgeti (L, -1, 1); /* pair[1]: start position */
+ pos = luaL_checknumber (L, -1); /* NOTE(review): raises a Lua error on a non-number; exceptions collected so far would then not be freed here */
+ lua_pop (L, 1);
+ lua_rawgeti (L, -1, 2); /* pair[2]: length */
+ ex_len = luaL_checknumber (L, -1);
+ lua_pop (L, 1);
+
+ if (ex_len > 0) { /* zero-length entries are skipped */
+ ex = g_malloc0 (sizeof (*ex));
+ ex->pos = pos;
+ ex->len = ex_len;
+ ex->type = RSPAMD_EXCEPTION_GENERIC;
+ exceptions = g_list_prepend (exceptions, ex); /* input order is restored by g_list_reverse below */
+ }
+ }
+ lua_pop (L, 1); /* drop the value, keep the key for the next lua_next */
+ }
+
+ lua_pop (L, 1); /* drop the table copy */
+ }
+
+ if (exceptions) {
+ exceptions = g_list_reverse (exceptions);
+ }
+
+ UErrorCode uc_err = U_ZERO_ERROR;
+ utext_openUTF8 (&utxt,
+ in,
+ len,
+ &uc_err); /* NOTE(review): uc_err is not inspected afterwards */
+
+ res = rspamd_tokenize_text ((gchar *)in, len,
+ &utxt,
+ RSPAMD_TOKENIZE_UTF, NULL,
+ exceptions,
+ NULL, NULL, NULL);
+
+ if (res == NULL) {
+ lua_pushnil (L);
+ }
+ else {
+ lua_createtable (L, res->len, 0); /* array part presized to the token count */
+
+ for (i = 0; i < res->len; i ++) {
+ w = &g_array_index (res, rspamd_stat_token_t, i);
+ lua_pushlstring (L, w->original.begin, w->original.len);
+ lua_rawseti (L, -2, i + 1); /* Lua arrays are 1-based */
+ }
+ }
+
+ cur = exceptions; /* free each exception payload, then the list cells */
+ while (cur) {
+ ex = cur->data;
+ g_free (ex);
+ cur = g_list_next (cur);
+ }
+
+ g_list_free (exceptions);
+ utext_close (&utxt); /* NOTE(review): res itself is not released in this function - confirm who owns the array returned by rspamd_tokenize_text */
+
+ return 1;
+}
+/*
+ * Lua binding for parse_html(input).
+ * Argument 1 is a text userdata or a Lua string. When it can be read, pushes
+ * a new rspamd{text} userdata pointing at the bytes returned by
+ * rspamd_html_process_part(); otherwise pushes nil. Returns one value.
+ */
+gint
+lua_parsers_parse_html (lua_State *L)
+{
+ LUA_TRACE_POINT;
+ struct rspamd_lua_text *t;
+ const gchar *start = NULL;
+ gsize len; /* only assigned together with start */
+ GByteArray *res, *in;
+ rspamd_mempool_t *pool;
+ struct html_content *hc;
+
+ if (lua_type (L, 1) == LUA_TUSERDATA) {
+ t = lua_check_text (L, 1);
+
+ if (t != NULL) {
+ start = t->start;
+ len = t->len;
+ }
+ }
+ else if (lua_type (L, 1) == LUA_TSTRING) {
+ start = luaL_checklstring (L, 1, &len);
+ }
+
+ if (start != NULL) {
+ pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL, 0); /* scratch pool, deleted before returning */
+ hc = rspamd_mempool_alloc0 (pool, sizeof (*hc));
+ in = g_byte_array_sized_new (len);
+ g_byte_array_append (in, start, len); /* the parser is given a private copy of the input */
+
+ res = rspamd_html_process_part (pool, hc, in); /* NOTE(review): result is dereferenced below without a NULL check */
+
+ t = lua_newuserdata (L, sizeof (*t)); /* t now refers to the result object, not the input */
+ rspamd_lua_setclass (L, "rspamd{text}", -1);
+ t->start = res->data;
+ t->len = res->len;
+ t->flags = RSPAMD_TEXT_FLAG_OWN; /* presumably makes the text responsible for freeing its buffer - confirm */
+
+ g_byte_array_free (res, FALSE); /* release only the wrapper; the bytes stay alive for t->start */
+ g_byte_array_free (in, TRUE);
+ rspamd_mempool_delete (pool);
+ }
+ else {
+ lua_pushnil (L);
+ }
+
+ return 1;
+}
+/*
+ * Lua binding for parse_mail_address(str[, pool]).
+ * Parses argument 1 with rspamd_email_address_from_mime() and pushes the
+ * resulting address list, or nil when the parser returns NULL. A mempool
+ * userdata may be given as argument 2; otherwise a temporary pool is
+ * created and destroyed inside the call. Returns one value.
+ */
+gint
+lua_parsers_parse_mail_address (lua_State *L)
+{
+ LUA_TRACE_POINT;
+ GPtrArray *addrs;
+ gsize len;
+ const gchar *str = luaL_checklstring (L, 1, &len); /* raises a Lua error if argument 1 is not a string or number */
+ rspamd_mempool_t *pool;
+ gboolean own_pool = FALSE; /* TRUE when the pool was created here and must be deleted here */
+
+ if (str) {
+
+ if (lua_type (L, 2) == LUA_TUSERDATA) {
+ pool = rspamd_lua_check_mempool (L, 2);
+
+ if (pool == NULL) {
+ return luaL_error (L, "invalid arguments");
+ }
+ }
+ else {
+ pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
+ "lua util", 0);
+ own_pool = TRUE;
+ }
+
+ addrs = rspamd_email_address_from_mime (pool, str, len, NULL, -1);
+
+ if (addrs == NULL) {
+ lua_pushnil (L);
+ }
+ else {
+ lua_push_emails_address_list (L, addrs, 0); /* helper defined elsewhere; meaning of the last argument is not visible here */
+ }
+
+ if (own_pool) { /* the result has already been pushed above */
+ rspamd_mempool_delete (pool);
+ }
+ }
+ else { /* NOTE(review): likely unreachable, luaL_checklstring errors out rather than returning NULL */
+ lua_pushnil (L);
+ }
+
+ return 1;
+}
+/*
+ * Lua binding for parse_content_type(ct_string, mempool).
+ * Raises "invalid arguments" when the string or the pool is missing.
+ * Pushes nil if rspamd_content_type_parse() returns NULL; otherwise pushes
+ * a table with the non-empty fields `type`, `subtype`, `charset` and
+ * `boundary`, plus one entry per attribute whose value is an array of
+ * strings. Returns one value.
+ */
+gint
+lua_parsers_parse_content_type (lua_State *L)
+{
+ LUA_TRACE_POINT;
+ gsize len;
+ const gchar *ct_str = luaL_checklstring (L, 1, &len);
+ rspamd_mempool_t *pool = rspamd_lua_check_mempool (L, 2);
+ struct rspamd_content_type *ct;
+
+ if (!ct_str || !pool) {
+ return luaL_error (L, "invalid arguments");
+ }
+
+ ct = rspamd_content_type_parse (ct_str, len, pool);
+
+ if (ct == NULL) {
+ lua_pushnil (L);
+ }
+ else {
+ GHashTableIter it;
+ gpointer k, v; /* k is required by the iterator API but not used; names come from the param itself */
+
+ lua_createtable (L, 0, 4 + (ct->attrs ? g_hash_table_size (ct->attrs) : 0)); /* hash part presized: 4 fixed keys plus the attributes */
+
+ if (ct->type.len > 0) { /* each fixed key is only set when its value is non-empty */
+ lua_pushstring (L, "type");
+ lua_pushlstring (L, ct->type.begin, ct->type.len);
+ lua_settable (L, -3);
+ }
+
+ if (ct->subtype.len > 0) {
+ lua_pushstring (L, "subtype");
+ lua_pushlstring (L, ct->subtype.begin, ct->subtype.len);
+ lua_settable (L, -3);
+ }
+
+ if (ct->charset.len > 0) {
+ lua_pushstring (L, "charset");
+ lua_pushlstring (L, ct->charset.begin, ct->charset.len);
+ lua_settable (L, -3);
+ }
+
+ if (ct->orig_boundary.len > 0) { /* exported under the key "boundary" */
+ lua_pushstring (L, "boundary");
+ lua_pushlstring (L, ct->orig_boundary.begin, ct->orig_boundary.len);
+ lua_settable (L, -3);
+ }
+
+ if (ct->attrs) {
+ g_hash_table_iter_init (&it, ct->attrs);
+
+ while (g_hash_table_iter_next (&it, &k, &v)) {
+ struct rspamd_content_type_param *param =
+ (struct rspamd_content_type_param *)v, *cur;
+ guint i = 1; /* next 1-based index in the values array */
+
+ lua_pushlstring (L, param->name.begin, param->name.len); /* key: attribute name */
+ lua_createtable (L, 1, 0); /* value: array of strings */
+
+ DL_FOREACH (param, cur) { /* walk every element linked from this param */
+ lua_pushlstring (L, cur->value.begin, cur->value.len);
+ lua_rawseti (L, -2, i++);
+ }
+
+ lua_settable (L, -3); /* result[name] = values */
+ }
+ }
+ }
+
+ return 1;
+}
+/*
+ * Lua binding for parse_smtp_date(str[, local_tz]).
+ * On success pushes the parsed time as a number and returns one value.
+ * If rspamd_parse_smtp_date() sets an error, pushes nil followed by the
+ * error message and returns two values. Raises an argument error when
+ * argument 1 cannot be read as a string.
+ */
+int
+lua_parsers_parse_smtp_date (lua_State *L)
+{
+ gsize slen;
+ const gchar *str = lua_tolstring (L, 1, &slen);
+ GError *err = NULL;
+
+ if (str == NULL) {
+ return luaL_argerror (L, 1, "invalid argument");
+ }
+
+ time_t tt = rspamd_parse_smtp_date (str, slen, &err);
+
+ if (err == NULL) {
+ if (lua_isboolean (L, 2) && !!lua_toboolean (L, 2)) { /* only a boolean `true` selects the conversion */
+ struct tm t;
+
+ rspamd_localtime (tt, &t); /* break tt down, then rebuild it through mktime below */
+#if !defined(__sun)
+ t.tm_gmtoff = 0; /* skipped on __sun, presumably because struct tm has no such field there - confirm */
+#endif
+ t.tm_isdst = 0;
+ tt = mktime (&t);
+ }
+
+ lua_pushnumber (L, tt);
+ }
+ else {
+ lua_pushnil (L);
+ lua_pushstring (L, err->message);
+ g_error_free (err); /* the message was copied into Lua by lua_pushstring above */
+
+ return 2;
+ }
+
+ return 1;
+}
+/*
+ * Loader installed by luaopen_parsers(): builds a fresh table, fills it
+ * with the functions from parserslib_f and returns it as the single result.
+ */
+static gint
+lua_load_parsers (lua_State * L)
+{
+ lua_newtable (L);
+ luaL_register (L, NULL, parserslib_f); /* NULL name: register into the table on top of the stack */
+
+ return 1;
+}
+/*
+ * Registers lua_load_parsers as the preload entry named "rspamd_parsers".
+ */
+void
+luaopen_parsers (lua_State * L)
+{
+ rspamd_lua_add_preload (L, "rspamd_parsers", lua_load_parsers);
+}
\ No newline at end of file
diff --git a/src/lua/lua_parsers.h b/src/lua/lua_parsers.h
new file mode 100644
index 000000000..900072a10
--- /dev/null
+++ b/src/lua/lua_parsers.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright 2020 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LUA_PARSERS_H
+#define RSPAMD_LUA_PARSERS_H
+
+#include "lua_common.h"
+
+/***
+ * @function parsers.tokenize_text(input[, exceptions])
+ * Create tokens from a text using optional exceptions list
+ * @param {text/string} input input data
+ * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
+ * @return {table/strings} list of strings representing words in the text
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, tokenize_text);
+
+/***
+ * @function parsers.parse_html(input)
+ * Parses HTML and returns the according text
+ * @param {string|text} in input HTML
+ * @return {rspamd_text} processed text with no HTML tags
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_html);
+
+/***
+ * @function parsers.parse_mail_address(str, [pool])
+ * Parses email address and returns a table of tables in the following format:
+ *
+ * - `raw` - the original value without any processing
+ * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
+ * - `addr` - address part of the address
+ * - `user` - user part (if present) of the address, e.g. `blah`
+ * - `domain` - domain part (if present), e.g. `foo.com`
+ * - `flags` - table with following keys set to true if given condition fulfilled:
+ * - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
+ * - [ip] - domain is IPv4/IPv6 address
+ * - [braced] - angled `<blah@foo.com>` address
+ * - [quoted] - quoted user part
+ * - [empty] - empty address
+ * - [backslash] - user part contains backslash
+ * - [8bit] - contains 8bit characters
+ *
+ * @param {string} str input string
+ * @param {rspamd_mempool} pool memory pool to use
+ * @return {table/tables} parsed list of mail addresses
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_mail_address);
+
+/***
+ * @function parsers.parse_content_type(ct_string, mempool)
+ * Parses content-type string to a table:
+ * - `type`
+ * - `subtype`
+ * - `charset`
+ * - `boundary`
+ * - other attributes
+ *
+ * @param {string} ct_string content type as string
+ * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
+ * @return table or nil if cannot parse content type
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_content_type);
+
+/***
+ * @function parsers.parse_smtp_date(str[, local_tz])
+ * Converts an SMTP date string to unix timestamp
+ * @param {string} str input string
+ * @param {boolean} local_tz convert to local tz if `true`
+ * @return {number} time as unix timestamp (converted to float)
+ */
+LUA_PUBLIC_FUNCTION_DEF (parsers, parse_smtp_date);
+
+
+#endif //RSPAMD_LUA_PARSERS_H
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1a2b52f80..e879d37af 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -14,16 +14,15 @@
* limitations under the License.
*/
#include "lua_common.h"
-#include "html.h"
-#include "tokenizers/tokenizers.h"
#include "unix-std.h"
#include "contrib/zstd/zstd.h"
-#include "contrib/uthash/utlist.h"
#include "libmime/email_addr.h"
#include "libmime/content_type.h"
#include "libmime/mime_headers.h"
#include "libutil/hash.h"
+#include "lua_parsers.h"
+
#ifdef WITH_LUA_REPL
#include "replxx.h"
#endif
@@ -34,7 +33,6 @@
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
-#include "libmime/smtp_parsers.h"
#include "contrib/fastutf8/fastutf8.h"
/***
@@ -1313,100 +1311,7 @@ lua_util_decode_url (lua_State *L)
static gint
lua_util_tokenize_text (lua_State *L)
{
- LUA_TRACE_POINT;
- const gchar *in = NULL;
- gsize len = 0, pos, ex_len, i;
- GList *exceptions = NULL, *cur;
- struct rspamd_lua_text *t;
- struct rspamd_process_exception *ex;
- UText utxt = UTEXT_INITIALIZER;
- GArray *res;
- rspamd_stat_token_t *w;
-
- if (lua_type (L, 1) == LUA_TSTRING) {
- in = luaL_checklstring (L, 1, &len);
- }
- else if (lua_type (L, 1) == LUA_TUSERDATA) {
- t = lua_check_text (L, 1);
-
- if (t) {
- in = t->start;
- len = t->len;
- }
- }
-
- if (in == NULL) {
- lua_pushnil (L);
- return 1;
- }
-
- if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) {
- lua_pushvalue (L, 2);
- lua_pushnil (L);
-
- while (lua_next (L, -2) != 0) {
- if (lua_type (L, -1) == LUA_TTABLE) {
- lua_rawgeti (L, -1, 1);
- pos = luaL_checknumber (L, -1);
- lua_pop (L, 1);
- lua_rawgeti (L, -1, 2);
- ex_len = luaL_checknumber (L, -1);
- lua_pop (L, 1);
-
- if (ex_len > 0) {
- ex = g_malloc0 (sizeof (*ex));
- ex->pos = pos;
- ex->len = ex_len;
- ex->type = RSPAMD_EXCEPTION_GENERIC;
- exceptions = g_list_prepend (exceptions, ex);
- }
- }
- lua_pop (L, 1);
- }
-
- lua_pop (L, 1);
- }
-
- if (exceptions) {
- exceptions = g_list_reverse (exceptions);
- }
-
- UErrorCode uc_err = U_ZERO_ERROR;
- utext_openUTF8 (&utxt,
- in,
- len,
- &uc_err);
-
- res = rspamd_tokenize_text ((gchar *)in, len,
- &utxt,
- RSPAMD_TOKENIZE_UTF, NULL,
- exceptions,
- NULL, NULL, NULL);
-
- if (res == NULL) {
- lua_pushnil (L);
- }
- else {
- lua_createtable (L, res->len, 0);
-
- for (i = 0; i < res->len; i ++) {
- w = &g_array_index (res, rspamd_stat_token_t, i);
- lua_pushlstring (L, w->original.begin, w->original.len);
- lua_rawseti (L, -2, i + 1);
- }
- }
-
- cur = exceptions;
*** OUTPUT TRUNCATED, 248 LINES SKIPPED ***
More information about the Commits
mailing list