commit e040d66: [Project] Rework received headers parsing to C++

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Oct 5 11:14:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-10-03 12:52:45 +0100
URL: https://github.com/rspamd/rspamd/commit/e040d66c354b135e1281cd438958ecb3e7a8983e

[Project] Rework received headers parsing to C++

---
 src/libmime/CMakeLists.txt |   4 +-
 src/libmime/email_addr.h   |   1 -
 src/libmime/message.h      |   2 +-
 src/libmime/mime_headers.c | 793 +--------------------------------------------
 src/libmime/mime_headers.h |  46 ---
 src/libmime/received.cxx   | 745 ++++++++++++++++++++++++++++++++++++++++++
 src/libmime/received.h     |  69 ++++
 src/libmime/smtp_parsers.h |   4 -
 8 files changed, 819 insertions(+), 845 deletions(-)

diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt
index 878ac8149..4a64aac58 100644
--- a/src/libmime/CMakeLists.txt
+++ b/src/libmime/CMakeLists.txt
@@ -1,5 +1,6 @@
 # Librspamd mime
 SET(LIBRSPAMDMIMESRC
+		${CMAKE_CURRENT_SOURCE_DIR}/received.cxx
 				${CMAKE_CURRENT_SOURCE_DIR}/email_addr.c
 				${CMAKE_CURRENT_SOURCE_DIR}/mime_expressions.c
         ${CMAKE_CURRENT_SOURCE_DIR}/scan_result.c
@@ -11,6 +12,7 @@ SET(LIBRSPAMDMIMESRC
 				${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
 				${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
 				${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
-		${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx)
+		${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
+		)
 
 SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/libmime/email_addr.h b/src/libmime/email_addr.h
index fe9fd9e9d..7e150f80d 100644
--- a/src/libmime/email_addr.h
+++ b/src/libmime/email_addr.h
@@ -58,7 +58,6 @@ struct rspamd_email_address {
 	guint flags;
 };
 
-struct rspamd_received_header;
 struct rspamd_task;
 
 /**
diff --git a/src/libmime/message.h b/src/libmime/message.h
index a391daf0d..d5329efa7 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -174,7 +174,7 @@ struct rspamd_message {
 	GPtrArray *parts;				/**< list of parsed parts							*/
 	GPtrArray *text_parts;			/**< list of text parts								*/
 	struct rspamd_message_raw_headers_content raw_headers_content;
-	struct rspamd_received_header *received;	/**< list of received headers						*/
+	void *received_headers;			/**< list of received headers						*/
 	khash_t (rspamd_url_hash) *urls;
 	struct rspamd_mime_headers_table *raw_headers;	/**< list of raw headers						*/
 	struct rspamd_mime_header *headers_order;	/**< order of raw headers							*/
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
index 7b5011be4..7afb0e7a6 100644
--- a/src/libmime/mime_headers.c
+++ b/src/libmime/mime_headers.c
@@ -17,9 +17,9 @@
 #include "mime_headers.h"
 #include "smtp_parsers.h"
 #include "mime_encoding.h"
+#include "received.h"
 #include "contrib/uthash/utlist.h"
 #include "libserver/mempool_vars_internal.h"
-#include "libserver/url.h"
 #include "libserver/cfg_file.h"
 #include "libutil/util.h"
 #include <unicode/utf8.h>
@@ -33,9 +33,6 @@ struct rspamd_mime_headers_table {
 	ref_entry_t ref;
 };
 
-#define RSPAMD_INET_ADDRESS_PARSE_RECEIVED \
-	(RSPAMD_INET_ADDRESS_PARSE_REMOTE|RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)
-
 static void
 rspamd_mime_header_check_special (struct rspamd_task *task,
 		struct rspamd_mime_header *rh)
@@ -913,794 +910,6 @@ rspamd_mime_message_id_generate (const gchar *fqdn)
 	return g_string_free (out, FALSE);
 }
 
-enum rspamd_received_part_type {
-	RSPAMD_RECEIVED_PART_FROM,
-	RSPAMD_RECEIVED_PART_BY,
-	RSPAMD_RECEIVED_PART_FOR,
-	RSPAMD_RECEIVED_PART_WITH,
-	RSPAMD_RECEIVED_PART_ID,
-	RSPAMD_RECEIVED_PART_UNKNOWN,
-};
-
-struct rspamd_received_comment {
-	gchar *data;
-	gsize dlen;
-	struct rspamd_received_comment *prev;
-};
-
-struct rspamd_received_part {
-	enum rspamd_received_part_type type;
-	gchar *data;
-	gsize dlen;
-	struct rspamd_received_comment *tail_comment;
-	struct rspamd_received_comment *head_comment;
-	struct rspamd_received_part *prev, *next;
-};
-
-static void
-rspamd_smtp_received_part_set_or_append (struct rspamd_task *task,
-										 const gchar *begin,
-										 gsize len,
-										 gchar **dest,
-										 gsize *destlen)
-{
-	if (len == 0) {
-		return;
-	}
-
-	if (*dest) {
-		/* Append */
-		gsize total_len = *destlen + len;
-		gchar *new_dest;
-
-		new_dest = rspamd_mempool_alloc (task->task_pool, total_len);
-		memcpy (new_dest, *dest, *destlen);
-		memcpy (new_dest + *destlen, begin, len);
-		rspamd_str_lc (new_dest + *destlen, len);
-		*dest = new_dest;
-		*destlen = total_len;
-	}
-	else {
-		/* Set */
-		*dest = rspamd_mempool_alloc (task->task_pool, len);
-		memcpy (*dest, begin, len);
-		rspamd_str_lc (*dest, len);
-		*dest = (gchar *)rspamd_string_len_strip (*dest, &len, " \t");
-		*destlen = len;
-	}
-}
-
-static struct rspamd_received_part *
-rspamd_smtp_received_process_part (struct rspamd_task *task,
-								   const char *data,
-								   size_t len,
-								   enum rspamd_received_part_type type,
-								   goffset *last)
-{
-	struct rspamd_received_part *npart;
-	const guchar *p, *c, *end;
-	guint obraces = 0, ebraces = 0;
-	gboolean seen_tcpinfo = FALSE;
-	enum _parse_state {
-		skip_spaces,
-		in_comment,
-		read_data,
-		read_tcpinfo,
-		all_done
-	} state, next_state;
-
-	npart = rspamd_mempool_alloc0 (task->task_pool, sizeof (*npart));
-	npart->type = type;
-
-	/* In this function, we just process comments and data separately */
-	p = data;
-	end = data + len;
-	c = data;
-	state = skip_spaces;
-	next_state = read_data;
-
-	while (p < end) {
-		switch (state) {
-		case skip_spaces:
-			if (!g_ascii_isspace (*p)) {
-				c = p;
-				state = next_state;
-			}
-			else {
-				p ++;
-			}
-			break;
-		case in_comment:
-			if (*p == '(') {
-				obraces ++;
-			}
-			else if (*p == ')') {
-				ebraces ++;
-
-				if (ebraces >= obraces) {
-					if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-						if (p > c) {
-							struct rspamd_received_comment *comment;
-
-
-							comment = rspamd_mempool_alloc0 (task->task_pool,
-									sizeof (*comment));
-							rspamd_smtp_received_part_set_or_append (task,
-									c, p - c,
-									&comment->data, &comment->dlen);
-
-							if (!npart->head_comment) {
-								comment->prev = NULL;
-								npart->head_comment = comment;
-								npart->tail_comment = comment;
-							}
-							else {
-								comment->prev = npart->tail_comment;
-								npart->tail_comment = comment;
-							}
-						}
-					}
-
-					p ++;
-					c = p;
-					state = skip_spaces;
-					next_state = read_data;
-
-					continue;
-				}
-			}
-
-			p ++;
-			break;
-		case read_data:
-			if (*p == '(') {
-				if (p > c) {
-					if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-						rspamd_smtp_received_part_set_or_append (task,
-								c, p - c,
-								&npart->data, &npart->dlen);
-					}
-				}
-
-				state = in_comment;
-				obraces = 1;
-				ebraces = 0;
-				p ++;
-				c = p;
-			}
-			else if (g_ascii_isspace (*p)) {
-				if (p > c) {
-					if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-						rspamd_smtp_received_part_set_or_append (task,
-								c, p - c,
-								&npart->data, &npart->dlen);
-					}
-				}
-
-				state = skip_spaces;
-				next_state = read_data;
-				c = p;
-			}
-			else if (*p == ';') {
-				/* It is actually delimiter of date part if not in the comments */
-				if (p > c) {
-					if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-						rspamd_smtp_received_part_set_or_append (task,
-								c, p - c,
-								&npart->data, &npart->dlen);
-					}
-				}
-
-				state = all_done;
-				continue;
-			}
-			else if (npart->dlen > 0) {
-				/* We have already received data and find something with no ( */
-				if (!seen_tcpinfo && type == RSPAMD_RECEIVED_PART_FROM) {
-					/* Check if we have something special here, such as TCPinfo */
-					if (*c == '[') {
-						state = read_tcpinfo;
-						p ++;
-					}
-					else {
-						state = all_done;
-						continue;
-					}
-				}
-				else {
-					state = all_done;
-					continue;
-				}
-			}
-			else {
-				p ++;
-			}
-			break;
-		case read_tcpinfo:
-			if (*p == ']') {
-				rspamd_smtp_received_part_set_or_append (task,
-						c, p - c + 1,
-						&npart->data, &npart->dlen);
-				seen_tcpinfo = TRUE;
-				state = skip_spaces;
-				next_state = read_data;
-				c = p;
-			}
-			p ++;
-			break;
-		case all_done:
-			if (p > (const guchar *)data) {
-				*last = p - (const guchar *) data;
-				return npart;
-			}
-			else {
-				/* Empty element */
-				return NULL;
-			}
-			break;
-		}
-	}
-
-	/* Leftover */
-	switch (state) {
-	case read_data:
-		if (p > c) {
-			if (type != RSPAMD_RECEIVED_PART_UNKNOWN) {
-				rspamd_smtp_received_part_set_or_append (task,
-						c, p - c,
-						&npart->data, &npart->dlen);
-			}
-
-			*last = p - (const guchar *)data;
-
-			return npart;
-		}
-		break;
-	case skip_spaces:
-		if (p > (const guchar *)data) {
-			*last = p - (const guchar *) data;
-
-			return npart;
-		}
-	default:
-		break;
-	}
-
-	return NULL;
-}
-
-static struct rspamd_received_part *
-rspamd_smtp_received_spill (struct rspamd_task *task,
-							const char *data,
-							size_t len,
-							goffset *date_pos)
-{
-	const guchar *p, *end;
-	struct rspamd_received_part *cur_part, *head = NULL;
-	goffset pos = 0;
-
-	p = data;
-	end = data + len;
-
-	while (p < end && g_ascii_isspace (*p)) {
-		p ++;
-	}
-
-	len = end - p;
-
-	/* Ignore all received but those started from from part */
-	if (len <= 4 || (lc_map[p[0]] != 'f' &&
-					 lc_map[p[1]] != 'r' &&
-					 lc_map[p[2]] != 'o' &&
-					 lc_map[p[3]] != 'm')) {
-		return NULL;
-	}
-
-	p += sizeof ("from") - 1;
-
-	/* We can now store from part */
-	cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-			RSPAMD_RECEIVED_PART_FROM, &pos);
-
-	if (!cur_part) {
-		return NULL;
-	}
-
-	g_assert (pos != 0);
-	p += pos;
-	len = end > p ? end - p : 0;
-	DL_APPEND (head, cur_part);
-
-	if (len > 2 && (lc_map[p[0]] == 'b' &&
-					lc_map[p[1]] == 'y')) {
-		p += sizeof ("by") - 1;
-
-		cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-				RSPAMD_RECEIVED_PART_BY, &pos);
-
-		if (!cur_part) {
-			return NULL;
-		}
-
-		g_assert (pos != 0);
-		p += pos;
-		len = end > p ? end - p : 0;
-		DL_APPEND (head, cur_part);
-	}
-
-	while (p < end) {
-		if (*p == ';') {
-			/* We are at the date separator, stop here */
-			*date_pos = p - (const guchar *)data + 1;
-			break;
-		}
-		else {
-			if (len > sizeof ("with") && (lc_map[p[0]] == 'w' &&
-										  lc_map[p[1]] == 'i' &&
-										  lc_map[p[2]] == 't' &&
-										  lc_map[p[3]] == 'h')) {
-				p += sizeof ("with") - 1;
-
-				cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-						RSPAMD_RECEIVED_PART_WITH, &pos);
-			}
-			else if (len > sizeof ("for") && (lc_map[p[0]] == 'f' &&
-											  lc_map[p[1]] == 'o' &&
-											  lc_map[p[2]] == 'r')) {
-				p += sizeof ("for") - 1;
-				cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-						RSPAMD_RECEIVED_PART_FOR, &pos);
-			}
-			else if (len > sizeof ("id") && (lc_map[p[0]] == 'i' &&
-											  lc_map[p[1]] == 'd')) {
-				p += sizeof ("id") - 1;
-				cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-						RSPAMD_RECEIVED_PART_ID, &pos);
-			}
-			else {
-				while (p < end) {
-					if (!(g_ascii_isspace (*p) || *p == '(' || *p == ';')) {
-						p ++;
-					}
-					else {
-						break;
-					}
-				}
-
-				if (p == end) {
-					return NULL;
-				}
-				else if (*p == ';') {
-					*date_pos = p - (const guchar *)data + 1;
-					break;
-				}
-				else {
-					cur_part = rspamd_smtp_received_process_part (task, p, end - p,
-							RSPAMD_RECEIVED_PART_UNKNOWN, &pos);
-				}
-			}
-
-			if (!cur_part) {
-				p ++;
-				len = end > p ? end - p : 0;
-			}
-			else {
-				g_assert (pos != 0);
-				p += pos;
-				len = end > p ? end - p : 0;
-				DL_APPEND (head, cur_part);
-			}
-		}
-	}
-
-	return head;
-}
-
-static gboolean
-rspamd_smtp_received_process_rdns (struct rspamd_task *task,
-								   const gchar *begin,
-								   gsize len,
-								   const gchar **pdest)
-{
-	const gchar *p, *end;
-	gsize hlen = 0;
-	gboolean seen_dot = FALSE;
-
-	p = begin;
-	end = begin + len;
-
-	if (len == 0) {
-		return FALSE;
-	}
-
-	if (*p == '[' && *(end - 1) == ']' && len > 2) {
-		/* We have enclosed ip address */
-		rspamd_inet_addr_t  *addr = rspamd_parse_inet_address_pool (p + 1,
-				(end - p) - 2,
-				task->task_pool,
-				RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-		if (addr) {
-			const gchar *addr_str;
-			gchar *dest;
-
-			if (rspamd_inet_address_get_port (addr) != 0) {
-				addr_str = rspamd_inet_address_to_string_pretty (addr);
-			}
-			else {
-				addr_str = rspamd_inet_address_to_string (addr);
-			}
-			dest = rspamd_mempool_strdup (task->task_pool, addr_str);
-			*pdest = dest;
-
-			return TRUE;
-		}
-	}
-
-	while (p < end) {
-		if (!g_ascii_isspace (*p) && rspamd_url_is_domain (*p)) {
-			if (*p == '.') {
-				seen_dot = TRUE;
-			}
-
-			hlen ++;
-		}
-		else {
-			break;
-		}
-
-		p ++;
-	}
-
-	if (hlen > 0) {
-		if (p == end) {
-			/* All data looks like a hostname */
-			gchar *dest;
-
-			dest = rspamd_mempool_alloc (task->task_pool,
-					hlen + 1);
-			rspamd_strlcpy (dest, begin, hlen + 1);
-			*pdest = dest;
-
-			return TRUE;
-		}
-		else if (seen_dot && (g_ascii_isspace (*p) || *p == '[' || *p == '(')) {
-			gchar *dest;
-
-			dest = rspamd_mempool_alloc (task->task_pool,
-					hlen + 1);
-			rspamd_strlcpy (dest, begin, hlen + 1);
-			*pdest = dest;
-
-			return TRUE;
-		}
-	}
-
-	return FALSE;
-}
-
-static gboolean
-rspamd_smtp_received_process_host_tcpinfo (struct rspamd_task *task,
-										   struct rspamd_received_header *rh,
-										   const gchar *data,
-										   gsize len)
-{
-	rspamd_inet_addr_t *addr = NULL;
-	gboolean ret = FALSE;
-
-	if (data[0] == '[') {
-		/* Likely Exim version */
-
-		const gchar *brace_pos = memchr (data, ']', len);
-
-		if (brace_pos) {
-			addr = rspamd_parse_inet_address_pool (data + 1,
-					brace_pos - data - 1,
-					task->task_pool,
-					RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-			if (addr) {
-				rh->addr = addr;
-				rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-						rspamd_inet_address_to_string (addr));
-				rh->from_ip = rh->real_ip;
-			}
-		}
-	}
-	else {
-		if (g_ascii_isxdigit (data[0])) {
-			/* Try to parse IP address */
-			addr = rspamd_parse_inet_address_pool (data,
-					len, task->task_pool, RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-			if (addr) {
-				rh->addr = addr;
-				rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-						rspamd_inet_address_to_string (addr));
-				rh->from_ip = rh->real_ip;
-			}
-		}
-
-		if (!addr) {
-			/* Try canonical Postfix version: rdns [ip] */
-			const gchar *obrace_pos = memchr (data, '[', len),
-					*ebrace_pos, *dend;
-
-			if (obrace_pos) {
-				dend = data + len;
-				ebrace_pos = memchr (obrace_pos, ']', dend - obrace_pos);
-
-				if (ebrace_pos) {
-					addr = rspamd_parse_inet_address_pool (obrace_pos + 1,
-							ebrace_pos - obrace_pos - 1,
-							task->task_pool,
-							RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-					if (addr) {
-						rh->addr = addr;
-						rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-								rspamd_inet_address_to_string (addr));
-						rh->from_ip = rh->real_ip;
-
-						/* Process with rDNS */
-						if (rspamd_smtp_received_process_rdns (task,
-								data,
-								obrace_pos - data,
-								&rh->real_hostname)) {
-							ret = TRUE;
-						}
-					}
-				}
-			}
-			else {
-				/* Hostname or some crap, sigh... */
-				if (rspamd_smtp_received_process_rdns (task,
-						data,
-						len,
-						&rh->real_hostname)) {
-					ret = TRUE;
-				}
-			}
-		}
-	}
-
-	return ret;
-}
-
-static void
-rspamd_smtp_received_process_from (struct rspamd_task *task,
-								   struct rspamd_received_part *rpart,
-								   struct rspamd_received_header *rh)
-{
-	if (rpart->dlen > 0) {
-		/* We have seen multiple cases:
-		 * - [ip] (hostname/unknown [real_ip])
-		 * - helo (hostname/unknown [real_ip])
-		 * - [ip]
-		 * - hostname
-		 * - hostname ([ip]:port helo=xxx)
-		 * Maybe more...
-		 */
-		gboolean seen_ip_in_data = FALSE;
-
-		if (rpart->head_comment && rpart->head_comment->dlen > 0) {
-			/* We can have info within comment as part of RFC */
-			rspamd_smtp_received_process_host_tcpinfo (
-					task, rh,
-					rpart->head_comment->data, rpart->head_comment->dlen);
-		}
-
-		if (!rh->real_ip) {
-			if (rpart->data[0] == '[') {
-				/* No comment, just something that looks like SMTP IP */
-				const gchar *brace_pos = memchr (rpart->data, ']', rpart->dlen);
-				rspamd_inet_addr_t *addr;
-
-				if (brace_pos) {
-					addr = rspamd_parse_inet_address_pool (rpart->data + 1,
-							brace_pos - rpart->data - 1,
-							task->task_pool,
-							RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-
-					if (addr) {
-						seen_ip_in_data = TRUE;
-						rh->addr = addr;
-						rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-								rspamd_inet_address_to_string (addr));
-						rh->from_ip = rh->real_ip;
-					}
-				}
-			}
-			else if (g_ascii_isxdigit (rpart->data[0])) {
-				/* Try to parse IP address */
-				rspamd_inet_addr_t *addr;
-				addr = rspamd_parse_inet_address_pool (rpart->data,
-						rpart->dlen, task->task_pool,
-						RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
-				if (addr) {
-					seen_ip_in_data = TRUE;
-					rh->addr = addr;
-					rh->real_ip = rspamd_mempool_strdup (task->task_pool,
-							rspamd_inet_address_to_string (addr));
*** OUTPUT TRUNCATED, 1081 LINES SKIPPED ***


More information about the Commits mailing list