commit d4bd976: [Rework] Rework urls extraction
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Apr 21 15:21:10 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-04-21 13:09:16 +0100
URL: https://github.com/rspamd/rspamd/commit/d4bd976fd9b084d845829fc659fc56b736a6d0af
[Rework] Rework urls extraction
---
src/lua/lua_common.c | 12 ++++
src/lua/lua_common.h | 15 ++--
src/lua/lua_task.c | 192 ++++-----------------------------------------------
src/lua/lua_url.c | 145 +++++++++++++++++++++++++++++++++++++-
src/lua/lua_url.h | 71 +++++++++++++++++++
5 files changed, 250 insertions(+), 185 deletions(-)
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index 2be91140a..9c4a5d8d1 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -197,6 +197,18 @@ rspamd_lua_setclass (lua_State * L, const gchar *classname, gint objidx)
lua_setmetatable (L, objidx);
}
+void
+rspamd_lua_class_metatable (lua_State *L, const gchar *classname)
+{
+ khiter_t k;
+
+ k = kh_get (lua_class_set, lua_classes, classname);
+
+ g_assert (k != kh_end (lua_classes));
+ lua_rawgetp (L, LUA_REGISTRYINDEX,
+ RSPAMD_LIGHTUSERDATA_MASK (kh_key (lua_classes, k)));
+}
+
void
rspamd_lua_add_metamethod (lua_State *L, const gchar *classname,
luaL_Reg *meth)
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 5edec663b..296b8f326 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -63,11 +63,9 @@ static inline void lua_rawsetp (lua_State *L, int i, const void *p) {
#endif
/* Interface definitions */
-#define LUA_FUNCTION_DEF(class, name) static int lua_ ## class ## _ ## name ( \
- lua_State * L)
-#define LUA_PUBLIC_FUNCTION_DEF(class, name) int lua_ ## class ## _ ## name ( \
- lua_State * L)
-#define LUA_INTERFACE_DEF(class, name) { # name, lua_ ## class ## _ ## name }
+#define LUA_FUNCTION_DEF(class, name) static int lua_##class##_##name (lua_State * L)
+#define LUA_PUBLIC_FUNCTION_DEF(class, name) int lua_##class##_##name (lua_State * L)
+#define LUA_INTERFACE_DEF(class, name) { #name, lua_##class##_##name }
#ifdef __cplusplus
extern "C" {
@@ -161,6 +159,13 @@ void rspamd_lua_new_class (lua_State *L,
*/
void rspamd_lua_setclass (lua_State *L, const gchar *classname, gint objidx);
+/**
+ * Pushes the metatable for specific class on top of the stack
+ * @param L
+ * @param classname
+ */
+void rspamd_lua_class_metatable (lua_State *L, const gchar *classname);
+
/**
* Adds a new field to the class (metatable) identified by `classname`
* @param L
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index b891d7d99..5c7a8b0a4 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -14,6 +14,8 @@
* limitations under the License.
*/
#include "lua_common.h"
+#include "lua_url.h"
+
#include "message.h"
#include "images.h"
#include "archives.h"
@@ -2245,61 +2247,7 @@ lua_task_append_message (lua_State * L)
return 0;
}
-struct lua_tree_cb_data {
- lua_State *L;
- int i;
- gint mask;
- gint need_images;
- gdouble skip_prob;
- guint64 xoroshiro_state[4];
-};
-
-static void
-lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
-{
- struct rspamd_lua_url *lua_url;
- struct rspamd_url *url = (struct rspamd_url *)value;
- struct lua_tree_cb_data *cb = ud;
-
- if (url->protocol & cb->mask) {
- if (!cb->need_images && (url->flags & RSPAMD_URL_FLAG_IMAGE)) {
- return;
- }
-
- if (cb->skip_prob > 0) {
- gdouble coin = rspamd_random_double_fast_seed (cb->xoroshiro_state);
-
- if (coin < cb->skip_prob) {
- return;
- }
- }
-
- lua_url = lua_newuserdata (cb->L, sizeof (struct rspamd_lua_url));
- rspamd_lua_setclass (cb->L, "rspamd{url}", -1);
- lua_url->url = url;
- lua_rawseti (cb->L, -2, cb->i++);
- }
-}
-
-static inline gsize
-lua_task_urls_adjust_skip_prob (struct rspamd_task *task,
- struct lua_tree_cb_data *cb, gsize sz, gsize max_urls)
-{
- if (max_urls > 0 && sz > max_urls) {
- cb->skip_prob = 1.0 - ((gdouble)max_urls) / (gdouble)sz;
- /*
- * Use task dependent probabilistic seed to ensure that
- * consequent task:get_urls return the same list of urls
- */
- memcpy (&cb->xoroshiro_state[0], &task->task_timestamp,
- MIN (sizeof (cb->xoroshiro_state[0]), sizeof (task->task_timestamp)));
- memcpy (&cb->xoroshiro_state[1], MESSAGE_FIELD (task, digest),
- sizeof (cb->xoroshiro_state[1]) * 3);
- sz = max_urls;
- }
- return sz;
-}
static gint
lua_task_get_urls (lua_State * L)
@@ -2307,12 +2255,7 @@ lua_task_get_urls (lua_State * L)
LUA_TRACE_POINT;
struct rspamd_task *task = lua_check_task (L, 1);
struct lua_tree_cb_data cb;
- gint protocols_mask = 0;
- static const gint default_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS|
- PROTOCOL_FILE|PROTOCOL_FTP;
- const gchar *cache_name = "emails+urls";
struct rspamd_url *u;
- gboolean need_images = FALSE;
gsize sz, max_urls = 0;
if (task) {
@@ -2326,135 +2269,26 @@ lua_task_get_urls (lua_State * L)
return 1;
}
- if (lua_gettop (L) >= 2) {
- if (lua_type (L, 2) == LUA_TBOOLEAN) {
- protocols_mask = default_mask;
- if (lua_toboolean (L, 2)) {
- protocols_mask |= PROTOCOL_MAILTO;
- }
- }
- else if (lua_type (L, 2) == LUA_TTABLE) {
- for (lua_pushnil (L); lua_next (L, 2); lua_pop (L, 1)) {
- int nmask;
- const gchar *pname = lua_tostring (L, -1);
-
- nmask = rspamd_url_protocol_from_string (pname);
-
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
- }
- else {
- msg_info ("bad url protocol: %s", pname);
- }
- }
- }
- else if (lua_type (L, 2) == LUA_TSTRING) {
- const gchar *plist = lua_tostring (L, 2);
- gchar **strvec;
- gchar * const *cvec;
-
- strvec = g_strsplit_set (plist, ",;", -1);
- cvec = strvec;
-
- while (*cvec) {
- int nmask;
-
- nmask = rspamd_url_protocol_from_string (*cvec);
-
- if (nmask != PROTOCOL_UNKNOWN) {
- protocols_mask |= nmask;
- }
- else {
- msg_info ("bad url protocol: %s", *cvec);
- }
-
- cvec ++;
- }
-
- g_strfreev (strvec);
- }
- else {
- protocols_mask = default_mask;
- }
-
- if (lua_type (L, 3) == LUA_TBOOLEAN) {
- need_images = lua_toboolean (L, 3);
- }
- }
- else {
- protocols_mask = default_mask;
+ if (!lua_url_cbdata_fill (L, 2, &cb)) {
+ return luaL_error (L, "invalid arguments");
}
memset (&cb, 0, sizeof (cb));
- cb.i = 1;
- cb.L = L;
- cb.mask = protocols_mask;
- cb.need_images = need_images;
-
- if (protocols_mask & PROTOCOL_MAILTO) {
- if (need_images) {
- cache_name = "emails+urls+img";
- }
- else {
- cache_name = "emails+urls";
- }
- sz = kh_size (MESSAGE_FIELD (task, urls));
+ sz = kh_size (MESSAGE_FIELD (task, urls));
+ sz = lua_url_adjust_skip_prob (task->task_timestamp,
+ MESSAGE_FIELD (task, digest), &cb, sz, max_urls);
- sz = lua_task_urls_adjust_skip_prob (task, &cb, sz, max_urls);
+ lua_createtable (L, sz, 0);
- if (protocols_mask == (default_mask|PROTOCOL_MAILTO)) {
- /* Can use cached version */
- if (!lua_task_get_cached (L, task, cache_name)) {
- lua_createtable (L, sz, 0);
- kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
- lua_tree_url_callback (u, u, &cb);
- });
- lua_task_set_cached (L, task, cache_name, -1);
- }
- }
- else {
- lua_createtable (L, sz, 0);
- kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
- lua_tree_url_callback (u, u, &cb);
- });
- }
-
- }
- else {
- if (need_images) {
- cache_name = "urls+img";
- }
- else {
- cache_name = "urls";
- }
-
- sz = kh_size (MESSAGE_FIELD (task, urls));
- sz = lua_task_urls_adjust_skip_prob (task, &cb, sz, max_urls);
+ kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
+ lua_tree_url_callback (u, u, &cb);
+ });
- if (protocols_mask == (default_mask)) {
- if (!lua_task_get_cached (L, task, cache_name)) {
- lua_createtable (L, sz, 0);
- kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
- if (!(u->protocol & PROTOCOL_MAILTO)) {
- lua_tree_url_callback (u, u, &cb);
- }
- });
- lua_task_set_cached (L, task, cache_name, -1);
- }
- }
- else {
- lua_createtable (L, sz, 0);
- kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
- if (!(u->protocol & PROTOCOL_MAILTO)) {
- lua_tree_url_callback (u, u, &cb);
- }
- });
- }
- }
+ lua_url_cbdata_dtor (&cb);
}
else {
- return luaL_error (L, "invalid arguments");
+ return luaL_error (L, "invalid arguments, no task");
}
return 1;
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
index efd34dc6c..6540919ea 100644
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -14,7 +14,8 @@
* limitations under the License.
*/
#include "lua_common.h"
-#include "contrib/uthash/utlist.h"
+#include "lua_url.h"
+
/***
* @module rspamd_url
@@ -903,6 +904,148 @@ lua_url_get_flags (lua_State *L)
#undef PUSH_FLAG
+void
+lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
+{
+ struct rspamd_lua_url *lua_url;
+ struct rspamd_url *url = (struct rspamd_url *)value;
+ struct lua_tree_cb_data *cb = ud;
+
+ if (url->protocol & cb->mask) {
+ if (!cb->need_images && (url->flags & RSPAMD_URL_FLAG_IMAGE)) {
+ return;
+ }
+
+ if (cb->skip_prob > 0) {
+ gdouble coin = rspamd_random_double_fast_seed (cb->xoroshiro_state);
+
+ if (coin < cb->skip_prob) {
+ return;
+ }
+ }
+
+ lua_url = lua_newuserdata (cb->L, sizeof (struct rspamd_lua_url));
+ lua_pushvalue (cb->L, cb->metatable_pos);
+ lua_setmetatable (cb->L, -2);
+ lua_url->url = url;
+ lua_rawseti (cb->L, -2, cb->i++);
+ }
+}
+
+gboolean
+lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd)
+{
+ gboolean need_images = FALSE;
+ gint protocols_mask = 0;
+ static const gint default_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS|
+ PROTOCOL_FILE|PROTOCOL_FTP;
+ gint pos_arg_type = lua_type (L, pos);
+
+ if (pos_arg_type == LUA_TBOOLEAN) {
+ protocols_mask = default_mask;
+ if (lua_toboolean (L, 2)) {
+ protocols_mask |= PROTOCOL_MAILTO;
+ }
+ }
+ else if (pos_arg_type == LUA_TTABLE) {
+ for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+ int nmask;
+ const gchar *pname = lua_tostring (L, -1);
+
+ nmask = rspamd_url_protocol_from_string (pname);
+
+ if (nmask != PROTOCOL_UNKNOWN) {
+ protocols_mask |= nmask;
+ }
+ else {
+ msg_info ("bad url protocol: %s", pname);
+ return FALSE;
+ }
+ }
+ }
+ else if (pos_arg_type == LUA_TSTRING) {
+ const gchar *plist = lua_tostring (L, pos);
+ gchar **strvec;
+ gchar * const *cvec;
+
+ strvec = g_strsplit_set (plist, ",;", -1);
+ cvec = strvec;
+
+ while (*cvec) {
+ int nmask;
+
+ nmask = rspamd_url_protocol_from_string (*cvec);
+
+ if (nmask != PROTOCOL_UNKNOWN) {
+ protocols_mask |= nmask;
+ }
+ else {
+ msg_info ("bad url protocol: %s", *cvec);
+ return FALSE;
+ }
+
+ cvec ++;
+ }
+
+ g_strfreev (strvec);
+ }
+ else if (pos_arg_type == LUA_TNONE || pos_arg_type == LUA_TNIL) {
+ protocols_mask = default_mask;
+ }
+ else {
+ return FALSE;
+ }
+
+ if (lua_type (L, pos + 1) == LUA_TBOOLEAN) {
+ need_images = lua_toboolean (L, pos + 1);
+ }
+
+ memset (cbd, 0, sizeof (*cbd));
+
+ cbd->i = 1;
+ cbd->L = L;
+ cbd->mask = protocols_mask;
+ cbd->need_images = need_images;
+
+ /* This needs to be removed from the stack */
+ rspamd_lua_class_metatable (L, "rspamd{url}");
+ cbd->metatable_pos = lua_gettop (L);
+ (void)lua_checkstack (L, cbd->metatable_pos + 4);
+
+ return TRUE;
+}
+
+void
+lua_url_cbdata_dtor (struct lua_tree_cb_data *cbd)
+{
+ if (cbd->metatable_pos != -1) {
+ lua_remove (cbd->L, cbd->metatable_pos);
+ }
+}
+
+gsize
+lua_url_adjust_skip_prob (gdouble timestamp,
+ guchar *digest,
+ struct lua_tree_cb_data *cb,
+ gsize sz,
+ gsize max_urls)
+{
+ if (max_urls > 0 && sz > max_urls) {
+ cb->skip_prob = 1.0 - ((gdouble)max_urls) / (gdouble)sz;
+ /*
+ * Use task dependent probabilistic seed to ensure that
+ * consequent task:get_urls return the same list of urls
+ */
+ memcpy (&cb->xoroshiro_state[0], ×tamp,
+ MIN (sizeof (cb->xoroshiro_state[0]), sizeof (timestamp)));
+ memcpy (&cb->xoroshiro_state[1], digest,
+ sizeof (cb->xoroshiro_state[1]) * 3);
+ sz = max_urls;
+ }
+
+ return sz;
+}
+
static gint
lua_load_url (lua_State * L)
{
diff --git a/src/lua/lua_url.h b/src/lua/lua_url.h
new file mode 100644
index 000000000..57d20f920
--- /dev/null
+++ b/src/lua/lua_url.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright 2020 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LUA_URL_H
+#define RSPAMD_LUA_URL_H
+
+#include "lua_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct lua_tree_cb_data {
+ lua_State *L;
+ int i;
+ int metatable_pos;
+ gint mask;
+ gint need_images;
+ gdouble skip_prob;
+ guint64 xoroshiro_state[4];
+};
+
+void lua_tree_url_callback (gpointer key, gpointer value, gpointer ud);
+
+/**
+ * Fills a cbdata table based on the parameter at position pos
+ * @param L
+ * @param pos
+ * @param cbd
+ * @return
+ */
+gboolean lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd);
+
+/**
+ * Cleanup url cbdata
+ * @param cbd
+ */
+void lua_url_cbdata_dtor (struct lua_tree_cb_data *cbd);
+
+/**
+ * Adjust probabilistic skip of the urls
+ * @param timestamp
+ * @param digest
+ * @param cb
+ * @param sz
+ * @param max_urls
+ * @return
+ */
+gsize lua_url_adjust_skip_prob (gdouble timestamp,
+ guchar *digest,
+ struct lua_tree_cb_data *cb,
+ gsize sz,
+ gsize max_urls);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
More information about the Commits
mailing list