commit e040d66: [Project] Rework received headers parsing to C++
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Oct 5 11:14:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-10-03 12:52:45 +0100
URL: https://github.com/rspamd/rspamd/commit/e040d66c354b135e1281cd438958ecb3e7a8983e
[Project] Rework received headers parsing to C++
---
src/libmime/CMakeLists.txt | 4 +-
src/libmime/email_addr.h | 1 -
src/libmime/message.h | 2 +-
src/libmime/mime_headers.c | 793 +--------------------------------------------
src/libmime/mime_headers.h | 46 ---
src/libmime/received.cxx | 745 ++++++++++++++++++++++++++++++++++++++++++
src/libmime/received.h | 69 ++++
src/libmime/smtp_parsers.h | 4 -
8 files changed, 819 insertions(+), 845 deletions(-)
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt
index 878ac8149..4a64aac58 100644
--- a/src/libmime/CMakeLists.txt
+++ b/src/libmime/CMakeLists.txt
@@ -1,5 +1,6 @@
# Librspamd mime
SET(LIBRSPAMDMIMESRC
+ ${CMAKE_CURRENT_SOURCE_DIR}/received.cxx
${CMAKE_CURRENT_SOURCE_DIR}/email_addr.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_expressions.c
${CMAKE_CURRENT_SOURCE_DIR}/scan_result.c
@@ -11,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
- ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx)
+ ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
+ )
SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/libmime/email_addr.h b/src/libmime/email_addr.h
index fe9fd9e9d..7e150f80d 100644
--- a/src/libmime/email_addr.h
+++ b/src/libmime/email_addr.h
@@ -58,7 +58,6 @@ struct rspamd_email_address {
guint flags;
};
-struct rspamd_received_header;
struct rspamd_task;
/**
diff --git a/src/libmime/message.h b/src/libmime/message.h
index a391daf0d..d5329efa7 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -174,7 +174,7 @@ struct rspamd_message {
GPtrArray *parts; /**< list of parsed parts */
GPtrArray *text_parts; /**< list of text parts */
struct rspamd_message_raw_headers_content raw_headers_content;
- struct rspamd_received_header *received; /**< list of received headers */
+ void *received_headers; /**< list of received headers */
khash_t (rspamd_url_hash) *urls;
struct rspamd_mime_headers_table *raw_headers; /**< list of raw headers */
struct rspamd_mime_header *headers_order; /**< order of raw headers */
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
index 7b5011be4..7afb0e7a6 100644
--- a/src/libmime/mime_headers.c
+++ b/src/libmime/mime_headers.c
@@ -17,9 +17,9 @@
#include "mime_headers.h"
#include "smtp_parsers.h"
#include "mime_encoding.h"
+#include "received.h"
#include "contrib/uthash/utlist.h"
#include "libserver/mempool_vars_internal.h"
-#include "libserver/url.h"
#include "libserver/cfg_file.h"
#include "libutil/util.h"
#include <unicode/utf8.h>
@@ -33,9 +33,6 @@ struct rspamd_mime_headers_table {
ref_entry_t ref;
};
-#define RSPAMD_INET_ADDRESS_PARSE_RECEIVED \
- (RSPAMD_INET_ADDRESS_PARSE_REMOTE|RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)
-
static void
rspamd_mime_header_check_special (struct rspamd_task *task,
struct rspamd_mime_header *rh)
@@ -913,794 +910,6 @@ rspamd_mime_message_id_generate (const gchar *fqdn)
return g_string_free (out, FALSE);
}
-enum rspamd_received_part_type {
- RSPAMD_RECEIVED_PART_FROM,
- RSPAMD_RECEIVED_PART_BY,
- RSPAMD_RECEIVED_PART_FOR,
- RSPAMD_RECEIVED_PART_WITH,
- RSPAMD_RECEIVED_PART_ID,
- RSPAMD_RECEIVED_PART_UNKNOWN,
-};
-
-struct rspamd_received_comment {
- gchar *data;
- gsize dlen;
- struct rspamd_received_comment *prev;
-};
-
-struct rspamd_received_part {
- enum rspamd_received_part_type type;
- gchar *data;
- gsize dlen;
- struct rspamd_received_comment *tail_comment;
- struct rspamd_received_comment *head_comment;
- struct rspamd_received_part *prev, *next;
-};
-
-static void
-rspamd_smtp_received_part_set_or_append (struct rspamd_task *task,
- const gchar *begin,
- gsize len,
- gchar **dest,
- gsize *destlen)
-{
- if (len == 0) {
- return;
- }
-
- if (*dest) {
- /* Append */
- gsize total_len = *destlen + len;
- gchar *new_dest;
-
- new_dest = rspamd_mempool_alloc (task->task_pool, total_len);
- memcpy (new_dest, *dest, *destlen);
- memcpy (new_dest + *destlen, begin, len);
- rspamd_str_lc (new_dest + *destlen, len);
- *dest = new_dest;
- *destlen = total_len;
- }
- else {
- /* Set */
- *dest = rspamd_mempool_alloc (task->task_pool, len);
- memcpy (*dest, begin, len);
- rspamd_str_lc (*dest, len);
- *dest = (gchar *)rspamd_string_len_strip (*dest, &len, " \t");
- *destlen = len;
- }
-}
-
-static struct rspamd_received_part *
-rspamd_smtp_received_process_part (struct rspamd_task *task,
- const char *data,
- size_t len,
- enum rspamd_received_part_type type,
- goffset *last)
-{
- struct rspamd_received_part *npart;
- const guchar *p, *c, *end;
- guint obraces = 0, ebraces = 0;
- gboolean seen_tcpinfo = FALSE;
- enum _parse_state {
- skip_spaces,
- in_comment,
- read_data,
- read_tcpinfo,
- all_done
- } state, next_state;
-
- npart = rspamd_mempool_alloc0 (task->task_pool, sizeof (*npart));
- npart->type = type;
-
- /* In this function, we just process comments and data separately */
- p = data;
- end = data + len;
- c = data;
- state = skip_spaces;
- next_state = read_data;
-
- while (p < end) {
- switch (state) {
- case skip_spaces:
- if (!g_ascii_isspace (*p)) {
- c = p;
- state = next_state;
- }
- else {
- p ++;
- }
- break;
- case in_comment:
- if (*p == '(') {
- obraces ++;
- }
- else if (*p == ')') {
- ebraces ++;
-
- if (ebraces >= obraces) {
- if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
- if (p > c) {
- struct rspamd_received_comment *comment;
-
-
- comment = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (*comment));
- rspamd_smtp_received_part_set_or_append (task,
- c, p - c,
- &comment->data, &comment->dlen);
-
- if (!npart->head_comment) {
- comment->prev = NULL;
- npart->head_comment = comment;
- npart->tail_comment = comment;
- }
- else {
- comment->prev = npart->tail_comment;
- npart->tail_comment = comment;
- }
- }
- }
-
- p ++;
- c = p;
- state = skip_spaces;
- next_state = read_data;
-
- continue;
- }
- }
-
- p ++;
- break;
- case read_data:
- if (*p == '(') {
- if (p > c) {
- if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
- rspamd_smtp_received_part_set_or_append (task,
- c, p - c,
- &npart->data, &npart->dlen);
- }
- }
-
- state = in_comment;
- obraces = 1;
- ebraces = 0;
- p ++;
- c = p;
- }
- else if (g_ascii_isspace (*p)) {
- if (p > c) {
- if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
- rspamd_smtp_received_part_set_or_append (task,
- c, p - c,
- &npart->data, &npart->dlen);
- }
- }
-
- state = skip_spaces;
- next_state = read_data;
- c = p;
- }
- else if (*p == ';') {
- /* It is actually delimiter of date part if not in the comments */
- if (p > c) {
- if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
- rspamd_smtp_received_part_set_or_append (task,
- c, p - c,
- &npart->data, &npart->dlen);
- }
- }
-
- state = all_done;
- continue;
- }
- else if (npart->dlen > 0) {
- /* We have already received data and find something with no ( */
- if (!seen_tcpinfo && type == RSPAMD_RECEIVED_PART_FROM) {
- /* Check if we have something special here, such as TCPinfo */
- if (*c == '[') {
- state = read_tcpinfo;
- p ++;
- }
- else {
- state = all_done;
- continue;
- }
- }
- else {
- state = all_done;
- continue;
- }
- }
- else {
- p ++;
- }
- break;
- case read_tcpinfo:
- if (*p == ']') {
- rspamd_smtp_received_part_set_or_append (task,
- c, p - c + 1,
- &npart->data, &npart->dlen);
- seen_tcpinfo = TRUE;
- state = skip_spaces;
- next_state = read_data;
- c = p;
- }
- p ++;
- break;
- case all_done:
- if (p > (const guchar *)data) {
- *last = p - (const guchar *) data;
- return npart;
- }
- else {
- /* Empty element */
- return NULL;
- }
- break;
- }
- }
-
- /* Leftover */
- switch (state) {
- case read_data:
- if (p > c) {
- if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
- rspamd_smtp_received_part_set_or_append (task,
- c, p - c,
- &npart->data, &npart->dlen);
- }
-
- *last = p - (const guchar *)data;
-
- return npart;
- }
- break;
- case skip_spaces:
- if (p > (const guchar *)data) {
- *last = p - (const guchar *) data;
-
- return npart;
- }
- default:
- break;
- }
-
- return NULL;
-}
-
-static struct rspamd_received_part *
-rspamd_smtp_received_spill (struct rspamd_task *task,
- const char *data,
- size_t len,
- goffset *date_pos)
-{
- const guchar *p, *end;
- struct rspamd_received_part *cur_part, *head = NULL;
- goffset pos = 0;
-
- p = data;
- end = data + len;
-
- while (p < end && g_ascii_isspace (*p)) {
- p ++;
- }
-
- len = end - p;
-
- /* Ignore all received but those started from from part */
- if (len <= 4 || (lc_map[p[0]] != 'f' &&
- lc_map[p[1]] != 'r' &&
- lc_map[p[2]] != 'o' &&
- lc_map[p[3]] != 'm')) {
- return NULL;
- }
-
- p += sizeof ("from") - 1;
-
- /* We can now store from part */
- cur_part = rspamd_smtp_received_process_part (task, p, end - p,
- RSPAMD_RECEIVED_PART_FROM, &pos);
-
- if (!cur_part) {
- return NULL;
- }
-
- g_assert (pos != 0);
- p += pos;
- len = end > p ? end - p : 0;
- DL_APPEND (head, cur_part);
-
- if (len > 2 && (lc_map[p[0]] == 'b' &&
- lc_map[p[1]] == 'y')) {
- p += sizeof ("by") - 1;
-
- cur_part = rspamd_smtp_received_process_part (task, p, end - p,
- RSPAMD_RECEIVED_PART_BY, &pos);
-
- if (!cur_part) {
- return NULL;
- }
-
- g_assert (pos != 0);
- p += pos;
- len = end > p ? end - p : 0;
- DL_APPEND (head, cur_part);
- }
-
- while (p < end) {
- if (*p == ';') {
- /* We are at the date separator, stop here */
- *date_pos = p - (const guchar *)data + 1;
- break;
- }
- else {
- if (len > sizeof ("with") && (lc_map[p[0]] == 'w' &&
- lc_map[p[1]] == 'i' &&
- lc_map[p[2]] == 't' &&
- lc_map[p[3]] == 'h')) {
- p += sizeof ("with") - 1;
-
- cur_part = rspamd_smtp_received_process_part (task, p, end - p,
- RSPAMD_RECEIVED_PART_WITH, &pos);
- }
- else if (len > sizeof ("for") && (lc_map[p[0]] == 'f' &&
- lc_map[p[1]] == 'o' &&
- lc_map[p[2]] == 'r')) {
- p += sizeof ("for") - 1;
- cur_part = rspamd_smtp_received_process_part (task, p, end - p,
- RSPAMD_RECEIVED_PART_FOR, &pos);
- }
- else if (len > sizeof ("id") && (lc_map[p[0]] == 'i' &&
- lc_map[p[1]] == 'd')) {
- p += sizeof ("id") - 1;
- cur_part = rspamd_smtp_received_process_part (task, p, end - p,
- RSPAMD_RECEIVED_PART_ID, &pos);
- }
- else {
- while (p < end) {
- if (!(g_ascii_isspace (*p) || *p == '(' || *p == ';')) {
- p ++;
- }
- else {
- break;
- }
- }
-
- if (p == end) {
- return NULL;
- }
- else if (*p == ';') {
- *date_pos = p - (const guchar *)data + 1;
- break;
- }
- else {
- cur_part = rspamd_smtp_received_process_part (task, p, end - p,
- RSPAMD_RECEIVED_PART_UNKNOWN, &pos);
- }
- }
-
- if (!cur_part) {
- p ++;
- len = end > p ? end - p : 0;
- }
- else {
- g_assert (pos != 0);
- p += pos;
- len = end > p ? end - p : 0;
- DL_APPEND (head, cur_part);
- }
- }
- }
-
- return head;
-}
-
-static gboolean
-rspamd_smtp_received_process_rdns (struct rspamd_task *task,
- const gchar *begin,
- gsize len,
- const gchar **pdest)
-{
- const gchar *p, *end;
- gsize hlen = 0;
- gboolean seen_dot = FALSE;
-
- p = begin;
- end = begin + len;
-
- if (len == 0) {
- return FALSE;
- }
-
- if (*p == '[' && *(end - 1) == ']' && len > 2) {
- /* We have enclosed ip address */
- rspamd_inet_addr_t *addr = rspamd_parse_inet_address_pool (p + 1,
- (end - p) - 2,
- task->task_pool,
- RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
- if (addr) {
- const gchar *addr_str;
- gchar *dest;
-
- if (rspamd_inet_address_get_port (addr) != 0) {
- addr_str = rspamd_inet_address_to_string_pretty (addr);
- }
- else {
- addr_str = rspamd_inet_address_to_string (addr);
- }
- dest = rspamd_mempool_strdup (task->task_pool, addr_str);
- *pdest = dest;
-
- return TRUE;
- }
- }
-
- while (p < end) {
- if (!g_ascii_isspace (*p) && rspamd_url_is_domain (*p)) {
- if (*p == '.') {
- seen_dot = TRUE;
- }
-
- hlen ++;
- }
- else {
- break;
- }
-
- p ++;
- }
-
- if (hlen > 0) {
- if (p == end) {
- /* All data looks like a hostname */
- gchar *dest;
-
- dest = rspamd_mempool_alloc (task->task_pool,
- hlen + 1);
- rspamd_strlcpy (dest, begin, hlen + 1);
- *pdest = dest;
-
- return TRUE;
- }
- else if (seen_dot && (g_ascii_isspace (*p) || *p == '[' || *p == '(')) {
- gchar *dest;
-
- dest = rspamd_mempool_alloc (task->task_pool,
- hlen + 1);
- rspamd_strlcpy (dest, begin, hlen + 1);
- *pdest = dest;
-
- return TRUE;
- }
- }
-
- return FALSE;
-}
-
-static gboolean
-rspamd_smtp_received_process_host_tcpinfo (struct rspamd_task *task,
- struct rspamd_received_header *rh,
- const gchar *data,
- gsize len)
-{
- rspamd_inet_addr_t *addr = NULL;
- gboolean ret = FALSE;
-
- if (data[0] == '[') {
- /* Likely Exim version */
-
- const gchar *brace_pos = memchr (data, ']', len);
-
- if (brace_pos) {
- addr = rspamd_parse_inet_address_pool (data + 1,
- brace_pos - data - 1,
- task->task_pool,
- RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
- if (addr) {
- rh->addr = addr;
- rh->real_ip = rspamd_mempool_strdup (task->task_pool,
- rspamd_inet_address_to_string (addr));
- rh->from_ip = rh->real_ip;
- }
- }
- }
- else {
- if (g_ascii_isxdigit (data[0])) {
- /* Try to parse IP address */
- addr = rspamd_parse_inet_address_pool (data,
- len, task->task_pool, RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
- if (addr) {
- rh->addr = addr;
- rh->real_ip = rspamd_mempool_strdup (task->task_pool,
- rspamd_inet_address_to_string (addr));
- rh->from_ip = rh->real_ip;
- }
- }
-
- if (!addr) {
- /* Try canonical Postfix version: rdns [ip] */
- const gchar *obrace_pos = memchr (data, '[', len),
- *ebrace_pos, *dend;
-
- if (obrace_pos) {
- dend = data + len;
- ebrace_pos = memchr (obrace_pos, ']', dend - obrace_pos);
-
- if (ebrace_pos) {
- addr = rspamd_parse_inet_address_pool (obrace_pos + 1,
- ebrace_pos - obrace_pos - 1,
- task->task_pool,
- RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
- if (addr) {
- rh->addr = addr;
- rh->real_ip = rspamd_mempool_strdup (task->task_pool,
- rspamd_inet_address_to_string (addr));
- rh->from_ip = rh->real_ip;
-
- /* Process with rDNS */
- if (rspamd_smtp_received_process_rdns (task,
- data,
- obrace_pos - data,
- &rh->real_hostname)) {
- ret = TRUE;
- }
- }
- }
- }
- else {
- /* Hostname or some crap, sigh... */
- if (rspamd_smtp_received_process_rdns (task,
- data,
- len,
- &rh->real_hostname)) {
- ret = TRUE;
- }
- }
- }
- }
-
- return ret;
-}
-
-static void
-rspamd_smtp_received_process_from (struct rspamd_task *task,
- struct rspamd_received_part *rpart,
- struct rspamd_received_header *rh)
-{
- if (rpart->dlen > 0) {
- /* We have seen multiple cases:
- * - [ip] (hostname/unknown [real_ip])
- * - helo (hostname/unknown [real_ip])
- * - [ip]
- * - hostname
- * - hostname ([ip]:port helo=xxx)
- * Maybe more...
- */
- gboolean seen_ip_in_data = FALSE;
-
- if (rpart->head_comment && rpart->head_comment->dlen > 0) {
- /* We can have info within comment as part of RFC */
- rspamd_smtp_received_process_host_tcpinfo (
- task, rh,
- rpart->head_comment->data, rpart->head_comment->dlen);
- }
-
- if (!rh->real_ip) {
- if (rpart->data[0] == '[') {
- /* No comment, just something that looks like SMTP IP */
- const gchar *brace_pos = memchr (rpart->data, ']', rpart->dlen);
- rspamd_inet_addr_t *addr;
-
- if (brace_pos) {
- addr = rspamd_parse_inet_address_pool (rpart->data + 1,
- brace_pos - rpart->data - 1,
- task->task_pool,
- RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
- if (addr) {
- seen_ip_in_data = TRUE;
- rh->addr = addr;
- rh->real_ip = rspamd_mempool_strdup (task->task_pool,
- rspamd_inet_address_to_string (addr));
- rh->from_ip = rh->real_ip;
- }
- }
- }
- else if (g_ascii_isxdigit (rpart->data[0])) {
- /* Try to parse IP address */
- rspamd_inet_addr_t *addr;
- addr = rspamd_parse_inet_address_pool (rpart->data,
- rpart->dlen, task->task_pool,
- RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
- if (addr) {
- seen_ip_in_data = TRUE;
- rh->addr = addr;
- rh->real_ip = rspamd_mempool_strdup (task->task_pool,
- rspamd_inet_address_to_string (addr));
*** OUTPUT TRUNCATED, 1081 LINES SKIPPED ***
More information about the Commits mailing list