commit 50a043a: [Rework] Urls: more rework of the urls sets
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Mar 9 10:49:13 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-03-07 12:15:51 +0000
URL: https://github.com/rspamd/rspamd/commit/50a043a7cbce8142d81b7887d263a9573ff568eb
[Rework] Urls: more rework of the urls sets
---
src/libmime/message.c | 8 +-
src/libmime/message.h | 4 +-
src/libserver/html.h | 3 +-
src/libserver/protocol.c | 46 +++++++-----
src/libserver/re_cache.c | 24 +++---
src/libserver/url.c | 191 +++++++++++++++++++++--------------------------
src/libserver/url.h | 24 +++---
7 files changed, 137 insertions(+), 163 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index a43e109b5..40b7fe8bc 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1048,8 +1048,7 @@ rspamd_message_dtor (struct rspamd_message *msg)
g_ptr_array_unref (msg->text_parts);
g_ptr_array_unref (msg->parts);
- g_hash_table_unref (msg->urls);
- g_hash_table_unref (msg->emails);
+ kh_destroy (rspamd_url_hash, msg->urls);
}
struct rspamd_message*
@@ -1060,10 +1059,7 @@ rspamd_message_new (struct rspamd_task *task)
msg = rspamd_mempool_alloc0 (task->task_pool, sizeof (*msg));
msg->raw_headers = rspamd_message_headers_new ();
-
- msg->emails = g_hash_table_new (rspamd_email_hash, rspamd_emails_cmp);
- msg->urls = g_hash_table_new (rspamd_url_hash, rspamd_urls_cmp);
-
+ msg->urls = kh_init (rspamd_url_hash);
msg->parts = g_ptr_array_sized_new (4);
msg->text_parts = g_ptr_array_sized_new (2);
msg->task = task;
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 91d6e13d4..96ed9d5d4 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -13,6 +13,7 @@
#include "libcryptobox/cryptobox.h"
#include "libmime/mime_headers.h"
#include "libmime/content_type.h"
+#include "libserver/url.h"
#include "libutil/ref.h"
#include "libutil/str_util.h"
@@ -175,8 +176,7 @@ struct rspamd_message {
GPtrArray *text_parts; /**< list of text parts */
struct rspamd_message_raw_headers_content raw_headers_content;
struct rspamd_received_header *received; /**< list of received headers */
- GHashTable *urls; /**< list of parsed urls */
- GHashTable *emails; /**< list of parsed emails */
+ khash_t (rspamd_url_hash) *urls;
struct rspamd_mime_headers_table *raw_headers; /**< list of raw headers */
struct rspamd_mime_header *headers_order; /**< order of raw headers */
struct rspamd_task *task;
diff --git a/src/libserver/html.h b/src/libserver/html.h
index b369bd890..ee5c242cb 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -141,7 +141,8 @@ GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
struct html_content *hc,
- GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails);
+ GByteArray *in, GList **exceptions,
+ GHashTable *urls, GHashTable *emails);
/*
* Returns true if a specified tag has been seen in a part
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 739d3b950..35d50b909 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -861,7 +861,7 @@ rspamd_protocol_handle_request (struct rspamd_task *task,
/* Structure for writing tree data */
struct tree_cb_data {
ucl_object_t *top;
- GHashTable *seen;
+ khash_t (rspamd_url_host_hash) *seen;
struct rspamd_task *task;
};
@@ -908,10 +908,8 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
* Callback for writing urls
*/
static void
-urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
+urls_protocol_cb (struct rspamd_url *url, struct tree_cb_data *cb)
{
- struct tree_cb_data *cb = ud;
- struct rspamd_url *url = value;
ucl_object_t *obj;
struct rspamd_task *task = cb->task;
const gchar *user_field = "unknown", *encoded = NULL;
@@ -921,7 +919,7 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
if (!(task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS)) {
if (url->hostlen > 0) {
- if (g_hash_table_lookup (cb->seen, url)) {
+ if (rspamd_url_host_set_has (cb->seen, url)) {
return;
}
@@ -941,7 +939,7 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
return;
}
- g_hash_table_insert (cb->seen, url, url);
+ rspamd_url_host_set_add (cb->seen, url);
}
else {
encoded = rspamd_url_encode (url, &enclen, task->task_pool);
@@ -975,28 +973,32 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
}
static ucl_object_t *
-rspamd_urls_tree_ucl (GHashTable *input, struct rspamd_task *task)
+rspamd_urls_tree_ucl (khash_t (rspamd_url_hash) *set,
+ struct rspamd_task *task)
{
struct tree_cb_data cb;
ucl_object_t *obj;
+ struct rspamd_url *u;
obj = ucl_object_typed_new (UCL_ARRAY);
cb.top = obj;
cb.task = task;
- cb.seen = g_hash_table_new (rspamd_url_host_hash, rspamd_urls_host_cmp);
+ cb.seen = kh_init (rspamd_url_host_hash);
- g_hash_table_foreach (input, urls_protocol_cb, &cb);
+ kh_foreach_key (set, u, {
+ if (!(u->protocol & PROTOCOL_MAILTO)) {
+ urls_protocol_cb (u, &cb);
+ }
+ });
- g_hash_table_unref (cb.seen);
+ kh_destroy (rspamd_url_host_hash, cb.seen);
return obj;
}
static void
-emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
+emails_protocol_cb (struct rspamd_url *url, struct tree_cb_data *cb)
{
- struct tree_cb_data *cb = ud;
- struct rspamd_url *url = value;
ucl_object_t *obj;
if (url->userlen > 0 && url->hostlen > 0) {
@@ -1007,16 +1009,23 @@ emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
}
static ucl_object_t *
-rspamd_emails_tree_ucl (GHashTable *input, struct rspamd_task *task)
+rspamd_emails_tree_ucl (khash_t (rspamd_url_hash) *set,
+ struct rspamd_task *task)
{
struct tree_cb_data cb;
ucl_object_t *obj;
+ struct rspamd_url *u;
obj = ucl_object_typed_new (UCL_ARRAY);
cb.top = obj;
cb.task = task;
- g_hash_table_foreach (input, emails_protocol_cb, &cb);
+ kh_foreach_key (set, u, {
+ if ((u->protocol & PROTOCOL_MAILTO)) {
+ emails_protocol_cb (u, &cb);
+ }
+ });
+
return obj;
}
@@ -1446,15 +1455,12 @@ rspamd_protocol_write_ucl (struct rspamd_task *task,
}
if (flags & RSPAMD_PROTOCOL_URLS && task->message) {
- if (g_hash_table_size (MESSAGE_FIELD (task, urls)) > 0) {
+ if (kh_size (MESSAGE_FIELD (task, urls)) > 0) {
ucl_object_insert_key (top,
rspamd_urls_tree_ucl (MESSAGE_FIELD (task, urls), task),
"urls", 0, false);
- }
-
- if (g_hash_table_size (MESSAGE_FIELD (task, emails)) > 0) {
ucl_object_insert_key (top,
- rspamd_emails_tree_ucl (MESSAGE_FIELD (task, emails), task),
+ rspamd_emails_tree_ucl (MESSAGE_FIELD (task, urls), task),
"emails", 0, false);
}
}
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 995af8ddf..257428720 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -1053,7 +1053,6 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
gboolean is_strong)
{
guint ret = 0, i, re_id;
- GHashTableIter it;
struct rspamd_mime_header *rh;
const gchar *in;
const guchar **scvec;
@@ -1062,7 +1061,6 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part;
struct rspamd_mime_part *mime_part;
struct rspamd_url *url;
- gpointer k, v;
guint len, cnt;
const gchar *class_name;
@@ -1164,17 +1162,18 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_URL:
- cnt = g_hash_table_size (MESSAGE_FIELD (task, urls));
+ cnt = kh_size (MESSAGE_FIELD (task, urls));
if (cnt > 0) {
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
- g_hash_table_iter_init (&it, MESSAGE_FIELD (task, urls));
i = 0;
raw = FALSE;
- while (g_hash_table_iter_next (&it, &k, &v)) {
- url = v;
+ kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
+ if ((url->protocol & PROTOCOL_MAILTO)) {
+ continue;
+ }
in = url->string;
len = url->urllen;
@@ -1182,7 +1181,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
scvec[i] = (guchar *) in;
lenvec[i++] = len;
}
- }
+ });
#if 0
g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
@@ -1207,18 +1206,19 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_EMAIL:
- cnt = g_hash_table_size (MESSAGE_FIELD (task, emails));
+ cnt = kh_size (MESSAGE_FIELD (task, urls));
if (cnt > 0) {
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
- g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
i = 0;
raw = FALSE;
- while (g_hash_table_iter_next (&it, &k, &v)) {
- url = v;
+ kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
+ if (!(url->protocol & PROTOCOL_MAILTO)) {
+ continue;
+ }
if (url->userlen == 0 || url->hostlen == 0) {
continue;
}
@@ -1227,7 +1227,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
len = url->userlen + 1 + url->hostlen;
scvec[i] = (guchar *) in;
lenvec[i++] = len;
- }
+ });
ret = rspamd_re_cache_process_regexp_data (rt, re,
task, scvec, lenvec, i, raw, &processed_hyperscan);
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 3449310b2..505d1d150 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -214,6 +214,13 @@ struct url_matcher static_matchers[] = {
URL_FLAG_NOHTML}
};
+
+static inline khint_t rspamd_url_hash (struct rspamd_url *u);
+
+static inline khint_t rspamd_url_host_hash (struct rspamd_url * u);
+static inline bool rspamd_urls_cmp (struct rspamd_url *a, struct rspamd_url *b);
+static inline bool rspamd_urls_host_cmp (struct rspamd_url *a, struct rspamd_url *b);
+
/* Hash table implementation */
__KHASH_IMPL (rspamd_url_hash, kh_inline,struct rspamd_url *, char, false,
rspamd_url_hash, rspamd_urls_cmp);
@@ -3116,7 +3123,6 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
struct rspamd_task *task;
gchar *url_str = NULL;
struct rspamd_url *query_url, *existing;
- GHashTable *target_tbl = NULL;
gint rc;
gboolean prefix_added;
@@ -3141,36 +3147,23 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
}
if (url->protocol == PROTOCOL_MAILTO) {
- if (url->userlen > 0) {
- target_tbl = MESSAGE_FIELD (task, emails);
+ if (url->userlen == 0) {
+ return FALSE;
}
}
- else {
- target_tbl = MESSAGE_FIELD (task, urls);
- }
-
- if (target_tbl) {
- /* Also check max urls */
- if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) {
- if (g_hash_table_size (target_tbl) > cbd->task->cfg->max_urls) {
- msg_err_task ("part has too many URLs, we cannot process more: "
- "%d urls extracted ",
- (guint)g_hash_table_size (target_tbl));
-
- return FALSE;
- }
- }
+ /* Also check max urls */
+ if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) {
+ if (kh_size (MESSAGE_FIELD (task, urls)) > cbd->task->cfg->max_urls) {
+ msg_err_task ("part has too many URLs, we cannot process more: "
+ "%d urls extracted ",
+ (guint)kh_size (MESSAGE_FIELD (task, urls)));
- if ((existing = g_hash_table_lookup (target_tbl, url)) == NULL) {
- url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
- g_hash_table_insert (target_tbl, url, url);
- }
- else {
- existing->count++;
+ return FALSE;
}
}
- target_tbl = NULL;
+ url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+ rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);
cbd->part->exceptions = g_list_prepend (
cbd->part->exceptions,
@@ -3178,7 +3171,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
/* We also search the query for additional url inside */
if (url->querylen > 0) {
- if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen,
+ if (rspamd_url_find (task->task_pool,
+ rspamd_url_query_unsafe (url), url->querylen,
&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
@@ -3198,23 +3192,13 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
}
if (query_url->protocol == PROTOCOL_MAILTO) {
- if (query_url->userlen > 0) {
- target_tbl = MESSAGE_FIELD (task, emails);
+ if (query_url->userlen == 0) {
+ return TRUE;
}
}
- else {
- target_tbl = MESSAGE_FIELD (task, urls);
- }
- if (target_tbl) {
- if ((existing = g_hash_table_lookup (target_tbl, query_url)) == NULL) {
- url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
- g_hash_table_insert (target_tbl, query_url, query_url);
- }
- else {
- existing->count++;
- }
- }
+ query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+ rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), query_url);
}
}
}
@@ -3321,27 +3305,13 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED|RSPAMD_URL_FLAG_SUBJECT;
if (url->protocol == PROTOCOL_MAILTO) {
- if (url->userlen > 0 && url->hostlen > 0) {
- if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, emails),
- url)) == NULL) {
- g_hash_table_insert (MESSAGE_FIELD (task, emails), url,
- url);
- }
- else {
- existing->count ++;
- }
- }
- }
- else {
- if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls),
- url)) == NULL) {
- g_hash_table_insert (MESSAGE_FIELD (task, urls), url, url);
- }
- else {
- existing->count ++;
+ if (url->userlen == 0) {
+ return FALSE;
}
}
+ rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);
+
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen,
@@ -3364,15 +3334,14 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
}
- if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls),
- query_url)) == NULL) {
- g_hash_table_insert (MESSAGE_FIELD (task, urls),
- query_url,
- query_url);
- }
- else {
- existing->count ++;
+ if (query_url->protocol == PROTOCOL_MAILTO) {
+ if (query_url->userlen == 0) {
+ return TRUE;
+ }
}
+
+ rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls),
+ query_url);
}
}
}
@@ -3380,26 +3349,22 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
return TRUE;
}
-inline guint
-rspamd_url_hash (gconstpointer u)
+static inline khint_t
+rspamd_url_hash (struct rspamd_url *url)
{
- const struct rspamd_url *url = u;
-
if (url->urllen > 0) {
- return (guint)rspamd_cryptobox_fast_hash (url->string, url->urllen,
+ return (khint_t)rspamd_cryptobox_fast_hash (url->string, url->urllen,
rspamd_hash_seed ());
}
return 0;
}
-inline guint
-rspamd_url_host_hash (gconstpointer u)
+static inline khint_t
+rspamd_url_host_hash (struct rspamd_url *url)
{
- const struct rspamd_url *url = u;
-
if (url->hostlen > 0) {
- return (guint)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
+ return (khint_t)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
url->hostlen,
rspamd_hash_seed ());
}
@@ -3407,30 +3372,10 @@ rspamd_url_host_hash (gconstpointer u)
return 0;
}
-inline guint
-rspamd_email_hash (gconstpointer u)
-{
- const struct rspamd_url *url = u;
- rspamd_cryptobox_fast_hash_state_t st;
-
- rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
-
- if (url->hostlen > 0) {
- rspamd_cryptobox_fast_hash_update (&st, rspamd_url_host_unsafe (url), url->hostlen);
- }
-
- if (url->userlen > 0) {
- rspamd_cryptobox_fast_hash_update (&st, rspamd_url_user_unsafe(url), url->userlen);
- }
-
- return (guint)rspamd_cryptobox_fast_hash_final (&st);
-}
-
/* Compare two emails for building emails tree */
-inline gboolean
-rspamd_emails_cmp (gconstpointer a, gconstpointer b)
+static inline bool
+rspamd_emails_cmp (struct rspamd_url *u1, struct rspamd_url *u2)
{
- const struct rspamd_url *u1 = a, *u2 = b;
gint r;
if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
@@ -3456,30 +3401,32 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
return FALSE;
}
-inline gboolean
-rspamd_urls_cmp (gconstpointer a, gconstpointer b)
+static inline bool
+rspamd_urls_cmp (struct rspamd_url *u1, struct rspamd_url *u2)
{
- const struct rspamd_url *u1 = a, *u2 = b;
int r = 0;
- if (u1->urllen != u2->urllen) {
- return FALSE;
+ if (u1->protocol != u2->protocol || u1->urllen != u2->urllen) {
+ return false;
}
else {
+ if (u1->protocol & PROTOCOL_MAILTO) {
+ return rspamd_emails_cmp (u1, u2);
+ }
+
r = memcmp (u1->string, u2->string, u1->urllen);
}
return r == 0;
}
-inline gboolean
-rspamd_urls_host_cmp (gconstpointer a, gconstpointer b)
+static inline bool
+rspamd_urls_host_cmp (struct rspamd_url *u1, struct rspamd_url *u2)
{
- const struct rspamd_url *u1 = a, *u2 = b;
int r = 0;
if (u1->hostlen != u2->hostlen) {
- return FALSE;
+ return false;
}
else {
r = memcmp (rspamd_url_host_unsafe (u1), rspamd_url_host_unsafe (u2),
@@ -3834,6 +3781,22 @@ rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
return true;
}
+/**
+ * Inserts a url into a set declared with the rspamd_url_host_hash khash
+ * instance.
+ * @param set set to insert into
+ * @param u url used as the key
+ * @return false when kh_put reports status 0 (khash uses 0 for a key that is
+ * already present), true for any other status
+ */
+bool
+rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
+ struct rspamd_url *u)
+{
+ gint r;
+
+ /* Only the insertion status matters here, the returned iterator is unused */
+ (void)kh_put (rspamd_url_host_hash, set, u, &r);
+
+ return r != 0;
+}
+
bool
rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u)
{
@@ -3845,5 +3808,19 @@ rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u)
return false;
}
+ return true;
+}
+
+/**
+ * Checks if a url is present in a set declared with the rspamd_url_host_hash
+ * khash instance.
+ * @param set set to search
+ * @param u url used as the lookup key
+ * @return true if kh_get finds the key, false if it returns kh_end (set)
+ */
+bool
+rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u)
+{
+ khiter_t k;
+
+ /*
+ * The instance name must match the type of `set`: using rspamd_url_hash
+ * here would call the accessor generated for the other table type.
+ */
+ k = kh_get (rspamd_url_host_hash, set, u);
+
+ if (k == kh_end (set)) {
+ return false;
+ }
+
return true;
}
\ No newline at end of file
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 358c61e16..aff7ccf5f 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -225,21 +225,6 @@ gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
gsize start_offset,
gsize end_offset, gpointer ud);
-guint rspamd_url_hash (gconstpointer u);
-
-guint rspamd_email_hash (gconstpointer u);
-
-guint rspamd_url_host_hash (gconstpointer u);
-
-
-/* Compare two emails for building emails hash */
-gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);
-
-/* Compare two urls for building emails hash */
-gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);
-
-gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b);
-
/**
* Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
* @param dst
@@ -295,6 +280,14 @@ KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char);
*/
bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
struct rspamd_url *u);
+/**
+ * Helper for url host set
+ * @param set
+ * @param u
+ * @return
+ */
+bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
+ struct rspamd_url *u);
/**
* Checks if a url is in set
* @param set
@@ -302,6 +295,7 @@ bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
* @return
*/
bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
+bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
#ifdef __cplusplus
}
More information about the Commits
mailing list