commit ffe51f2: [Project] Libserver migration process
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Jul 12 16:42:14 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-07-12 10:01:44 +0100
URL: https://github.com/rspamd/rspamd/commit/ffe51f25f39e94de2d4155d033f2c0946a2d7180
[Project] Libserver migration process
---
src/libmime/mime_headers.h | 5 +-
src/libserver/re_cache.c | 246 ++++++++++++++++++++++-----------------------
src/libserver/url.c | 24 +++--
3 files changed, 137 insertions(+), 138 deletions(-)
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h
index 81948d11b..9b85e758f 100644
--- a/src/libmime/mime_headers.h
+++ b/src/libmime/mime_headers.h
@@ -51,12 +51,13 @@ enum rspamd_mime_header_flags {
};
struct rspamd_mime_header {
- gchar *name; /* Also used for key */
- gchar *value;
const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */
gsize raw_len;
guint order;
int flags; /* see enum rspamd_mime_header_flags */
+ /* These are zero terminated (historically) */
+ gchar *name; /* Also used for key */
+ gchar *value;
gchar *separator;
gchar *decoded;
struct rspamd_mime_header *prev, *next; /* Headers with the same name */
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 5517ad875..592cc31d4 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -24,6 +24,7 @@
#include "libutil/regexp.h"
#include "lua/lua_common.h"
#include "libstat/stat_api.h"
+#include "contrib/uthash/utlist.h"
#include "khash.h"
@@ -950,6 +951,72 @@ rspamd_process_words_vector (GArray *words,
return cnt;
}
+/*
+ * Runs `re` against every header in the doubly-linked list that starts at
+ * `rh` and returns whatever rspamd_re_cache_process_regexp_data () reports
+ * for the collected values (0 when no header was selected).
+ *
+ * - task, rt, re: forwarded unchanged to rspamd_re_cache_process_regexp_data ()
+ * - re_class: `type` selects raw (`value`) vs decoded (`decoded`) header
+ *   text; `type_data` is the header name used for the strong comparison and
+ *   for the debug message
+ * - rh: head of the header list; the caller is expected to pass a non-NULL
+ *   list
+ * - is_strong: when TRUE, only headers whose name is byte-for-byte equal to
+ *   `re_class->type_data` are checked
+ *
+ * The temporary vectors are sized for the whole list and released before
+ * returning.
+ */
+static guint
+rspamd_re_cache_process_headers_list (struct rspamd_task *task,
+		struct rspamd_re_runtime *rt,
+		rspamd_regexp_t *re,
+		struct rspamd_re_class *re_class,
+		struct rspamd_mime_header *rh,
+		gboolean is_strong)
+{
+	const guchar **scvec, *in;
+	gboolean raw = FALSE;
+	guint *lenvec;
+	struct rspamd_mime_header *cur;
+	guint cnt = 0, i = 0, ret = 0;
+
+	/* Upper bound for the vectors: one slot per list element */
+	DL_COUNT (rh, cur, cnt);
+
+	scvec = g_malloc (sizeof (*scvec) * cnt);
+	lenvec = g_malloc (sizeof (*lenvec) * cnt);
+
+	DL_FOREACH (rh, cur) {
+
+		if (is_strong && strcmp (cur->name, re_class->type_data) != 0) {
+			/* Skip a different case */
+			continue;
+		}
+
+		if (re_class->type == RSPAMD_RE_RAWHEADER) {
+			in = (const guchar *)cur->value;
+			lenvec[i] = strlen (cur->value);
+
+			/* A single non-UTF-8 value switches the whole batch to raw mode */
+			if (!g_utf8_validate (cur->value, lenvec[i], NULL)) {
+				raw = TRUE;
+			}
+		}
+		else if (cur->decoded == NULL) {
+			/*
+			 * No decoded value: keep the slot as an empty string, as the
+			 * index-based loop did before it was moved into this helper.
+			 * Skipping the increment here would let the next header
+			 * overwrite the placeholder (or drop it if it is the last one).
+			 */
+			in = (const guchar *)"";
+			lenvec[i] = 0;
+		}
+		else {
+			/* No need to validate: decoded text is already valid */
+			in = (const guchar *)cur->decoded;
+			lenvec[i] = strlen (cur->decoded);
+		}
+
+		scvec[i] = in;
+		i ++;
+	}
+
+	if (i > 0) {
+		ret = rspamd_re_cache_process_regexp_data (rt, re,
+				task, scvec, lenvec, i, raw);
+		/* Only the first collected value is shown in the debug line */
+		msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d",
+				re_class->type_data,
+				rspamd_regexp_get_pattern (re),
+				(int) lenvec[0], scvec[0], ret);
+	}
+
+	g_free (scvec);
+	g_free (lenvec);
+
+	return ret;
+}
+
+
/*
* Calculates the specified regexp for the specified class if it's not calculated
*/
@@ -961,14 +1028,14 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
gboolean is_strong)
{
guint ret = 0, i, re_id;
- GPtrArray *headerlist;
GHashTableIter it;
struct rspamd_mime_header *rh;
- const gchar *in, *end;
+ const gchar *in;
const guchar **scvec;
guint *lenvec;
gboolean raw = FALSE;
- struct rspamd_mime_text_part *part;
+ struct rspamd_mime_text_part *text_part;
+ struct rspamd_mime_part *mime_part;
struct rspamd_url *url;
gpointer k, v;
guint len, cnt;
@@ -982,140 +1049,70 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
case RSPAMD_RE_HEADER:
case RSPAMD_RE_RAWHEADER:
/* Get list of specified headers */
- headerlist = rspamd_message_get_header_array (task,
- re_class->type_data,
- is_strong);
+ rh = rspamd_message_get_header_array (task,
+ re_class->type_data);
- if (headerlist && headerlist->len > 0) {
- scvec = g_malloc (sizeof (*scvec) * headerlist->len);
- lenvec = g_malloc (sizeof (*lenvec) * headerlist->len);
-
- for (i = 0; i < headerlist->len; i ++) {
- rh = g_ptr_array_index (headerlist, i);
-
- if (re_class->type == RSPAMD_RE_RAWHEADER) {
- in = rh->value;
- lenvec[i] = strlen (rh->value);
-
- if (!g_utf8_validate (in, lenvec[i], NULL)) {
- raw = TRUE;
- }
- }
- else {
- in = rh->decoded;
- /* Validate input */
- if (!in || !g_utf8_validate (in, -1, &end)) {
- lenvec[i] = 0;
- scvec[i] = (guchar *)"";
- continue;
- }
- lenvec[i] = end - in;
- }
-
- scvec[i] = (guchar *)in;
- }
-
- ret = rspamd_re_cache_process_regexp_data (rt, re,
- task, scvec, lenvec, headerlist->len, raw);
- msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d",
- re_class->type_data,
- rspamd_regexp_get_pattern (re),
- (int)lenvec[0], scvec[0], ret);
- g_free (scvec);
- g_free (lenvec);
+ if (rh) {
+ ret = rspamd_re_cache_process_headers_list (task, rt, re,
+ re_class, rh, is_strong);
}
break;
case RSPAMD_RE_ALLHEADER:
raw = TRUE;
- in = task->raw_headers_content.begin;
- len = task->raw_headers_content.len;
+ in = MESSAGE_FIELD (task, raw_headers_content).begin;
+ len = MESSAGE_FIELD (task, raw_headers_content).len;
ret = rspamd_re_cache_process_regexp_data (rt, re,
task, (const guchar **)&in, &len, 1, raw);
msg_debug_re_task ("checking allheader regexp: %s -> %d",
rspamd_regexp_get_pattern (re), ret);
break;
case RSPAMD_RE_MIMEHEADER:
- headerlist = rspamd_message_get_mime_header_array (task,
- re_class->type_data,
- is_strong);
-
- if (headerlist && headerlist->len > 0) {
- scvec = g_malloc (sizeof (*scvec) * headerlist->len);
- lenvec = g_malloc (sizeof (*lenvec) * headerlist->len);
-
- for (i = 0; i < headerlist->len; i ++) {
- rh = g_ptr_array_index (headerlist, i);
-
- if (re_class->type == RSPAMD_RE_RAWHEADER) {
- in = rh->value;
- lenvec[i] = strlen (rh->value);
+ PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, mime_part) {
+ rh = rspamd_message_get_header_from_hash (mime_part->raw_headers,
+ re_class->type_data);
- if (!g_utf8_validate (in, lenvec[i], NULL)) {
- raw = TRUE;
- }
- }
- else {
- in = rh->decoded;
- /* Validate input */
- if (!in || !g_utf8_validate (in, -1, &end)) {
- lenvec[i] = 0;
- scvec[i] = (guchar *)"";
- continue;
- }
-
- lenvec[i] = end - in;
- }
-
- scvec[i] = (guchar *)in;
+ if (rh) {
+ ret += rspamd_re_cache_process_headers_list (task, rt, re,
+ re_class, rh, is_strong);
}
-
- ret = rspamd_re_cache_process_regexp_data (rt, re,
- task, scvec, lenvec, headerlist->len, raw);
- msg_debug_re_task ("checking mime header %s regexp: %s -> %d",
- re_class->type_data,
- rspamd_regexp_get_pattern (re), ret);
- g_free (scvec);
- g_free (lenvec);
}
break;
case RSPAMD_RE_MIME:
case RSPAMD_RE_RAWMIME:
/* Iterate through text parts */
- if (task->text_parts->len > 0) {
- cnt = task->text_parts->len;
+ if (MESSAGE_FIELD (task, text_parts)->len > 0) {
+ cnt = MESSAGE_FIELD (task, text_parts)->len;
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
- for (i = 0; i < task->text_parts->len; i++) {
- part = g_ptr_array_index (task->text_parts, i);
-
+ PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
/* Select data for regexp */
if (re_class->type == RSPAMD_RE_RAWMIME) {
- if (part->raw.len == 0) {
+ if (text_part->raw.len == 0) {
len = 0;
in = "";
}
else {
- in = part->raw.begin;
- len = part->raw.len;
+ in = text_part->raw.begin;
+ len = text_part->raw.len;
}
raw = TRUE;
}
else {
/* Skip empty parts */
- if (IS_PART_EMPTY (part)) {
+ if (IS_PART_EMPTY (text_part)) {
len = 0;
in = "";
}
else {
/* Check raw flags */
- if (!IS_PART_UTF (part)) {
+ if (!IS_PART_UTF (text_part)) {
raw = TRUE;
}
- in = part->utf_content->data;
- len = part->utf_content->len;
+ in = text_part->utf_content->data;
+ len = text_part->utf_content->len;
}
}
@@ -1132,12 +1129,13 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_URL:
- cnt = g_hash_table_size (task->urls) + g_hash_table_size (task->emails);
+ cnt = g_hash_table_size (MESSAGE_FIELD (task, urls)) +
+ g_hash_table_size (MESSAGE_FIELD (task, emails));
if (cnt > 0) {
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
- g_hash_table_iter_init (&it, task->urls);
+ g_hash_table_iter_init (&it, MESSAGE_FIELD (task, urls));
i = 0;
while (g_hash_table_iter_next (&it, &k, &v)) {
@@ -1150,7 +1148,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
lenvec[i++] = len;
}
- g_hash_table_iter_init (&it, task->emails);
+ g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
while (g_hash_table_iter_next (&it, &k, &v)) {
url = v;
@@ -1191,7 +1189,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
* paragraph when running the rules. All HTML tags and line breaks will
* be removed before matching.
*/
- cnt = task->text_parts->len + 1;
+ cnt = MESSAGE_FIELD (task, text_parts)->len + 1;
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
@@ -1200,11 +1198,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
* of the body content.
*/
- headerlist = rspamd_message_get_header_array (task, "Subject", FALSE);
-
- if (headerlist && headerlist->len > 0) {
- rh = g_ptr_array_index (headerlist, 0);
+ rh = rspamd_message_get_header_array (task, "Subject");
+ if (rh) {
scvec[0] = (guchar *)rh->decoded;
lenvec[0] = strlen (rh->decoded);
}
@@ -1212,14 +1208,13 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
scvec[0] = (guchar *)"";
lenvec[0] = 0;
}
- for (i = 0; i < task->text_parts->len; i++) {
- part = g_ptr_array_index (task->text_parts, i);
- if (part->utf_stripped_content) {
- scvec[i + 1] = (guchar *)part->utf_stripped_content->data;
- lenvec[i + 1] = part->utf_stripped_content->len;
+ PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
+ if (text_part->utf_stripped_content) {
+ scvec[i + 1] = (guchar *)text_part->utf_stripped_content->data;
+ lenvec[i + 1] = text_part->utf_stripped_content->len;
- if (!IS_PART_UTF (part)) {
+ if (!IS_PART_UTF (text_part)) {
raw = TRUE;
}
}
@@ -1244,19 +1239,19 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
* Multiline expressions will need to be used to match strings that are
* broken by line breaks.
*/
- if (task->text_parts->len > 0) {
- cnt = task->text_parts->len;
+ if (MESSAGE_FIELD (task, text_parts)->len > 0) {
+ cnt = MESSAGE_FIELD (task, text_parts)->len;
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
- for (i = 0; i < task->text_parts->len; i++) {
- part = g_ptr_array_index (task->text_parts, i);
+ for (i = 0; i < cnt; i++) {
+ text_part = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), i);
- if (part->parsed.len > 0) {
- scvec[i] = (guchar *)part->parsed.begin;
- lenvec[i] = part->parsed.len;
+ if (text_part->parsed.len > 0) {
+ scvec[i] = (guchar *)text_part->parsed.begin;
+ lenvec[i] = text_part->parsed.len;
- if (!IS_PART_UTF (part)) {
+ if (!IS_PART_UTF (text_part)) {
raw = TRUE;
}
}
@@ -1277,13 +1272,13 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
case RSPAMD_RE_WORDS:
case RSPAMD_RE_STEMWORDS:
case RSPAMD_RE_RAWWORDS:
- if (task->text_parts->len > 0) {
+ if (MESSAGE_FIELD (task, text_parts)->len > 0) {
cnt = 0;
raw = FALSE;
- PTR_ARRAY_FOREACH (task->text_parts, i, part) {
- if (part->utf_words) {
- cnt += part->utf_words->len;
+ PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
+ if (text_part->utf_words) {
+ cnt += text_part->utf_words->len;
}
}
@@ -1297,9 +1292,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
cnt = 0;
- PTR_ARRAY_FOREACH (task->text_parts, i, part) {
- if (part->utf_words) {
- cnt = rspamd_process_words_vector (part->utf_words,
+ PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
+ if (text_part->utf_words) {
+ cnt = rspamd_process_words_vector (text_part->utf_words,
scvec, lenvec, re_class, cnt, &raw);
}
}
@@ -1522,6 +1517,7 @@ rspamd_re_cache_type_to_string (enum rspamd_re_type type)
ret = "stem_words";
break;
case RSPAMD_RE_MAX:
+ default:
ret = "invalid class";
break;
}
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 0b31007bb..26e328a6d 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2949,11 +2949,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
if (url->protocol == PROTOCOL_MAILTO) {
if (url->userlen > 0) {
- target_tbl = task->emails;
+ target_tbl = MESSAGE_FIELD (task, emails);
}
}
else {
- target_tbl = task->urls;
+ target_tbl = MESSAGE_FIELD (task, urls);
}
if (target_tbl) {
@@ -2996,11 +2996,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
if (query_url->protocol == PROTOCOL_MAILTO) {
if (query_url->userlen > 0) {
- target_tbl = task->emails;
+ target_tbl = MESSAGE_FIELD (task, emails);
}
}
else {
- target_tbl = task->urls;
+ target_tbl = MESSAGE_FIELD (task, urls);
}
if (target_tbl) {
@@ -3115,9 +3115,10 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED|RSPAMD_URL_FLAG_SUBJECT;
if (url->protocol == PROTOCOL_MAILTO) {
- if (url->userlen > 0) {
- if ((existing = g_hash_table_lookup (task->emails, url)) == NULL) {
- g_hash_table_insert (task->emails, url,
+ if (url->userlen > 0 && url->hostlen > 0) {
+ if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, emails),
+ url)) == NULL) {
+ g_hash_table_insert (MESSAGE_FIELD (task, emails), url,
url);
}
else {
@@ -3126,8 +3127,9 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
}
}
else {
- if ((existing = g_hash_table_lookup (task->urls, url)) == NULL) {
- g_hash_table_insert (task->urls, url, url);
+ if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls),
+ url)) == NULL) {
+ g_hash_table_insert (MESSAGE_FIELD (task, urls), url, url);
}
else {
existing->count ++;
@@ -3156,9 +3158,9 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
}
- if ((existing = g_hash_table_lookup (task->urls,
+ if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls),
query_url)) == NULL) {
- g_hash_table_insert (task->urls,
+ g_hash_table_insert (MESSAGE_FIELD (task, urls),
query_url,
query_url);
}
More information about the Commits
mailing list