commit 4385e17: [Rework] Further rework of lua urls extraction API
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Apr 21 15:21:16 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-04-21 16:07:40 +0100
URL: https://github.com/rspamd/rspamd/commit/4385e1701570617eda31009299817e0b38a90be5 (HEAD -> master)
[Rework] Further rework of lua urls extraction API
---
src/lua/lua_task.c | 32 +++++++-----
src/lua/lua_url.c | 148 +++++++++++++++++++++++++++++++++++++++++++----------
src/lua/lua_url.h | 14 +++--
3 files changed, 150 insertions(+), 44 deletions(-)
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 5c7a8b0a4..2ceb1c3c2 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -2256,6 +2256,8 @@ lua_task_get_urls (lua_State * L)
struct rspamd_task *task = lua_check_task (L, 1);
struct lua_tree_cb_data cb;
struct rspamd_url *u;
+ static const gint default_protocols_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS|
+ PROTOCOL_FILE|PROTOCOL_FTP;
gsize sz, max_urls = 0;
if (task) {
@@ -2269,15 +2271,15 @@ lua_task_get_urls (lua_State * L)
return 1;
}
- if (!lua_url_cbdata_fill (L, 2, &cb)) {
+ /* Exclude RSPAMD_URL_FLAG_CONTENT to preserve backward compatibility */
+ if (!lua_url_cbdata_fill (L, 2, &cb, default_protocols_mask,
+ (~RSPAMD_URL_FLAG_CONTENT), max_urls)) {
return luaL_error (L, "invalid arguments");
}
- memset (&cb, 0, sizeof (cb));
-
sz = kh_size (MESSAGE_FIELD (task, urls));
sz = lua_url_adjust_skip_prob (task->task_timestamp,
- MESSAGE_FIELD (task, digest), &cb, sz, max_urls);
+ MESSAGE_FIELD (task, digest), &cb, sz);
lua_createtable (L, sz, 0);
@@ -2425,20 +2427,26 @@ lua_task_get_emails (lua_State * L)
struct rspamd_task *task = lua_check_task (L, 1);
struct lua_tree_cb_data cb;
struct rspamd_url *u;
+ gsize max_urls = 0, sz;
if (task) {
if (task->message) {
- lua_createtable (L, kh_size (MESSAGE_FIELD (task, urls)), 0);
- memset (&cb, 0, sizeof (cb));
- cb.i = 1;
- cb.L = L;
- cb.mask = PROTOCOL_MAILTO;
+ if (!lua_url_cbdata_fill (L, 2, &cb, PROTOCOL_MAILTO,
+ (~RSPAMD_URL_FLAG_CONTENT), max_urls)) {
+ return luaL_error (L, "invalid arguments");
+ }
+
+ sz = kh_size (MESSAGE_FIELD (task, urls));
+ sz = lua_url_adjust_skip_prob (task->task_timestamp,
+ MESSAGE_FIELD (task, digest), &cb, sz);
+
+ lua_createtable (L, sz, 0);
kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
- if ((u->protocol & PROTOCOL_MAILTO)) {
- lua_tree_url_callback (u, u, &cb);
- }
+ lua_tree_url_callback (u, u, &cb);
});
+
+ lua_url_cbdata_dtor (&cb);
}
else {
lua_newtable (L);
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
index 65f0569a5..45f9ab683 100644
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -933,10 +933,7 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
struct rspamd_url *url = (struct rspamd_url *)value;
struct lua_tree_cb_data *cb = ud;
- if (url->protocol & cb->mask) {
- if (!cb->need_images && (url->flags & RSPAMD_URL_FLAG_IMAGE)) {
- return;
- }
+ if ((url->protocol & cb->protocols_mask) && (url->flags & cb->flags_mask)) {
if (cb->skip_prob > 0) {
gdouble coin = rspamd_random_double_fast_seed (cb->xoroshiro_state);
@@ -955,35 +952,126 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
}
gboolean
-lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd)
+lua_url_cbdata_fill (lua_State *L,
+ gint pos,
+ struct lua_tree_cb_data *cbd,
+ guint default_protocols,
+ guint default_flags,
+ gsize max_urls)
{
- gboolean need_images = FALSE;
gint protocols_mask = 0;
- static const gint default_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS|
- PROTOCOL_FILE|PROTOCOL_FTP;
+
gint pos_arg_type = lua_type (L, pos);
+ guint flags_mask = default_flags;
if (pos_arg_type == LUA_TBOOLEAN) {
- protocols_mask = default_mask;
+ protocols_mask = default_protocols;
if (lua_toboolean (L, 2)) {
protocols_mask |= PROTOCOL_MAILTO;
}
}
else if (pos_arg_type == LUA_TTABLE) {
- for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
- int nmask;
- const gchar *pname = lua_tostring (L, -1);
+ if (rspamd_lua_geti (L, 1, pos) == LUA_TNIL) {
+ /* New method: indexed table */
+
+ lua_getfield (L, pos, "flags");
+ if (lua_istable (L, -1)) {
+ for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+ int nmask = 0;
+ const gchar *fname = lua_tostring (L, -1);
+
+
+ if (rspamd_url_flag_from_string (fname, &nmask)) {
+ flags_mask |= nmask;
+ }
+ else {
+ msg_info ("bad url flag: %s", fname);
+ return FALSE;
+ }
+ }
+ }
+ else {
+ flags_mask |= default_flags;
+ }
+ lua_pop (L, 1);
+
+ lua_getfield (L, pos, "protocols");
+ if (lua_istable (L, -1)) {
+ for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+ int nmask;
+ const gchar *pname = lua_tostring (L, -1);
+
+ nmask = rspamd_url_protocol_from_string (pname);
+
+ if (nmask != PROTOCOL_UNKNOWN) {
+ protocols_mask |= nmask;
+ }
+ else {
+ msg_info ("bad url protocol: %s", pname);
+ return FALSE;
+ }
+ }
+ }
+ else {
+ protocols_mask = default_protocols;
+ }
+ lua_pop (L, 1);
- nmask = rspamd_url_protocol_from_string (pname);
+ lua_getfield (L, pos, "emails");
+ if (lua_isboolean (L, -1)) {
+ if (lua_toboolean (L, -1)) {
+ protocols_mask |= PROTOCOL_MAILTO;
+ }
+ }
+ lua_pop (L, 1);
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
+ lua_getfield (L, pos, "images");
+ if (lua_isboolean (L, -1)) {
+ if (lua_toboolean (L, -1)) {
+ flags_mask |= RSPAMD_URL_FLAG_IMAGE;
+ }
+ else {
+ flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+ }
}
- else {
- msg_info ("bad url protocol: %s", pname);
- return FALSE;
+ lua_pop (L, 1);
+
+ lua_getfield (L, pos, "content");
+ if (lua_isboolean (L, -1)) {
+ if (lua_toboolean (L, -1)) {
+ flags_mask |= RSPAMD_URL_FLAG_CONTENT;
+ }
+ else {
+ flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
+ }
}
+ lua_pop (L, 1);
+
+ lua_getfield (L, pos, "max_urls");
+ if (lua_isnumber (L, -1)) {
+ max_urls = lua_tonumber (L, -1);
+ }
+ lua_pop (L, 1);
}
+ else {
+ /* Plain table of the protocols */
+ for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+ int nmask;
+ const gchar *pname = lua_tostring (L, -1);
+
+ nmask = rspamd_url_protocol_from_string (pname);
+
+ if (nmask != PROTOCOL_UNKNOWN) {
+ protocols_mask |= nmask;
+ }
+ else {
+ msg_info ("bad url protocol: %s", pname);
+ return FALSE;
+ }
+ }
+ }
+
+ lua_pop (L, 1); /* After rspamd_lua_geti */
}
else if (pos_arg_type == LUA_TSTRING) {
const gchar *plist = lua_tostring (L, pos);
@@ -1012,22 +1100,29 @@ lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd)
g_strfreev (strvec);
}
else if (pos_arg_type == LUA_TNONE || pos_arg_type == LUA_TNIL) {
- protocols_mask = default_mask;
+ protocols_mask = default_protocols;
+ flags_mask = default_flags;
}
else {
return FALSE;
}
if (lua_type (L, pos + 1) == LUA_TBOOLEAN) {
- need_images = lua_toboolean (L, pos + 1);
+ if (lua_toboolean (L, pos + 1)) {
+ flags_mask |= RSPAMD_URL_FLAG_IMAGE;
+ }
+ else {
+ flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+ }
}
memset (cbd, 0, sizeof (*cbd));
cbd->i = 1;
cbd->L = L;
- cbd->mask = protocols_mask;
- cbd->need_images = need_images;
+ cbd->max_urls = max_urls;
+ cbd->protocols_mask = protocols_mask;
+ cbd->flags_mask = flags_mask;
/* This needs to be removed from the stack */
rspamd_lua_class_metatable (L, "rspamd{url}");
@@ -1049,11 +1144,10 @@ gsize
lua_url_adjust_skip_prob (gdouble timestamp,
guchar *digest,
struct lua_tree_cb_data *cb,
- gsize sz,
- gsize max_urls)
+ gsize sz)
{
- if (max_urls > 0 && sz > max_urls) {
- cb->skip_prob = 1.0 - ((gdouble)max_urls) / (gdouble)sz;
+ if (cb->max_urls > 0 && sz > cb->max_urls) {
+ cb->skip_prob = 1.0 - ((gdouble)cb->max_urls) / (gdouble)sz;
/*
* Use task dependent probabilistic seed to ensure that
* consequent task:get_urls return the same list of urls
@@ -1062,7 +1156,7 @@ lua_url_adjust_skip_prob (gdouble timestamp,
MIN (sizeof (cb->xoroshiro_state[0]), sizeof (timestamp)));
memcpy (&cb->xoroshiro_state[1], digest,
sizeof (cb->xoroshiro_state[1]) * 3);
- sz = max_urls;
+ sz = cb->max_urls;
}
return sz;
diff --git a/src/lua/lua_url.h b/src/lua/lua_url.h
index 57d20f920..0ea2186d8 100644
--- a/src/lua/lua_url.h
+++ b/src/lua/lua_url.h
@@ -26,8 +26,9 @@ struct lua_tree_cb_data {
lua_State *L;
int i;
int metatable_pos;
- gint mask;
- gint need_images;
+ guint flags_mask;
+ guint protocols_mask;
+ gsize max_urls;
gdouble skip_prob;
guint64 xoroshiro_state[4];
};
@@ -41,7 +42,11 @@ void lua_tree_url_callback (gpointer key, gpointer value, gpointer ud);
* @param cbd
* @return
*/
-gboolean lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd);
+gboolean lua_url_cbdata_fill (lua_State *L, gint pos,
+ struct lua_tree_cb_data *cbd,
+ guint default_protocols,
+ guint default_flags,
+ gsize max_urls);
/**
* Cleanup url cbdata
@@ -61,8 +66,7 @@ void lua_url_cbdata_dtor (struct lua_tree_cb_data *cbd);
gsize lua_url_adjust_skip_prob (gdouble timestamp,
guchar *digest,
struct lua_tree_cb_data *cb,
- gsize sz,
- gsize max_urls);
+ gsize sz);
#ifdef __cplusplus
}
More information about the Commits mailing list