commit d525194: [Project] Add spilling machine for received headers

Thu Feb 7 15:14:07 UTC 2019

Author: Vsevolod Stakhov
Date: 2019-02-06 12:36:10 +0000
URL: https://github.com/rspamd/rspamd/commit/d525194397181456bba6edea4680a10403c3415c

[Project] Add spilling machine for received headers

---
 src/libmime/message.h      |  30 ----
 src/libmime/mime_headers.c | 331 +++++++++++++++++++++++++++++++++++++++++++++
 src/libmime/mime_headers.h |  31 +++++
 src/libmime/smtp_parsers.h |   3 +
 4 files changed, 365 insertions(+), 30 deletions(-)

diff --git a/src/libmime/message.h b/src/libmime/message.h
index 19e8b40b5..eb260cd77 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -125,36 +125,6 @@ struct rspamd_mime_text_part {
 	guint unicode_scripts;
 };
 
-enum rspamd_received_type {
-	RSPAMD_RECEIVED_SMTP = 0,
-	RSPAMD_RECEIVED_ESMTP,
-	RSPAMD_RECEIVED_ESMTPA,
-	RSPAMD_RECEIVED_ESMTPS,
-	RSPAMD_RECEIVED_ESMTPSA,
-	RSPAMD_RECEIVED_LMTP,
-	RSPAMD_RECEIVED_IMAP,
-	RSPAMD_RECEIVED_UNKNOWN
-};
-
-#define RSPAMD_RECEIVED_FLAG_ARTIFICIAL (1 << 0)
-#define RSPAMD_RECEIVED_FLAG_SSL (1 << 1)
-#define RSPAMD_RECEIVED_FLAG_AUTHENTICATED (1 << 2)
-
-struct received_header {
-	gchar *from_hostname;
-	gchar *from_ip;
-	gchar *real_hostname;
-	gchar *real_ip;
-	gchar *by_hostname;
-	gchar *for_mbox;
-	gchar *comment_ip;
-	rspamd_inet_addr_t *addr;
-	struct rspamd_mime_header *hdr;
-	time_t timestamp;
-	enum rspamd_received_type type;
-	gint flags;
-};
-
 /**
  * Parse and pre-process mime message
  * @param task worker_task object
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
index 2769ae633..19ad3262e 100644
--- a/src/libmime/mime_headers.c
+++ b/src/libmime/mime_headers.c
@@ -17,6 +17,7 @@
 #include "mime_headers.h"
 #include "smtp_parsers.h"
 #include "mime_encoding.h"
+#include "contrib/uthash/utlist.h"
 #include "libserver/mempool_vars_internal.h"
 #include <unicode/utf8.h>
 
@@ -848,3 +849,333 @@ rspamd_mime_message_id_generate (const gchar *fqdn)
 
 	return g_string_free (out, FALSE);
 }
+
+enum rspamd_received_part_type { /* kinds of parts a Received header is split into */
+	RSPAMD_RECEIVED_PART_FROM, /* part introduced by the leading `from` keyword */
+	RSPAMD_RECEIVED_PART_BY, /* part introduced by `by` */
+	RSPAMD_RECEIVED_PART_FOR, /* part introduced by `for` */
+	RSPAMD_RECEIVED_PART_WITH, /* part introduced by `with` */
+	RSPAMD_RECEIVED_PART_UNKNOWN, /* any other token: neither its data nor its comments are copied */
+};
+
+struct rspamd_received_comment { /* one parenthesised comment attached to a part */
+	gchar *data; /* pool copy of the text between the outer parentheses, passed through rspamd_str_lc; not NUL-terminated */
+	gsize dlen; /* number of bytes in data */
+	struct rspamd_received_comment *prev; /* comment stored before this one, NULL for the first */
+};
+
+struct rspamd_received_part { /* one keyword part of a Received header */
+	enum rspamd_received_part_type type; /* kind of the part, set by the caller of the part parser */
+	gchar *data; /* pool copy of the part token, passed through rspamd_str_lc; not NUL-terminated, NULL if nothing was stored */
+	gsize dlen; /* number of bytes in data, 0 if nothing was stored */
+	struct rspamd_received_comment *tail_comment; /* last stored comment; follow ->prev to reach earlier ones */
+	struct rspamd_received_comment *head_comment; /* first stored comment, NULL if there are none */
+	struct rspamd_received_part *prev, *next; /* list links maintained by the utlist DL_APPEND macro */
+};
+
+/**
+ * Stores a copy of the bytes in [start, finish) as the data of `part`; the
+ * copy is passed through rspamd_str_lc and is not NUL-terminated. Nothing is
+ * stored for an empty range or for a part of the unknown type
+ */
+static void
+rspamd_smtp_received_part_set_data (struct rspamd_task *task,
+		struct rspamd_received_part *part,
+		const guchar *start,
+		const guchar *finish)
+{
+	if (finish > start && part->type != RSPAMD_RECEIVED_PART_UNKNOWN) {
+		part->dlen = finish - start;
+		part->data = rspamd_mempool_alloc (task->task_pool, part->dlen);
+		memcpy (part->data, start, part->dlen);
+		rspamd_str_lc (part->data, part->dlen);
+	}
+}
+
+/**
+ * Processes a single part of a Received header: the data that follows a
+ * keyword (e.g. `from`) and the parenthesised comments attached to it.
+ * Nested parentheses are kept inside the enclosing comment
+ * @param task task whose memory pool is used for all allocations
+ * @param data points right after the keyword, need not be NUL-terminated
+ * @param len number of bytes available in `data`
+ * @param type part kind; nothing is copied for RSPAMD_RECEIVED_PART_UNKNOWN
+ * @param last set to the offset in `data` where parsing has stopped
+ * @return new part or NULL if the input ends inside an unclosed comment
+ */
+static struct rspamd_received_part *
+rspamd_smtp_received_process_part (struct rspamd_task *task,
+								   const char *data,
+								   size_t len,
+								   enum rspamd_received_part_type type,
+								   goffset *last)
+{
+	struct rspamd_received_part *npart;
+	const guchar *p, *c, *end;
+	guint obraces = 0, ebraces = 0;
+	enum _parse_state {
+		skip_spaces,
+		in_comment,
+		read_data,
+		all_done
+	} state, next_state;
+
+	npart = rspamd_mempool_alloc0 (task->task_pool, sizeof (*npart));
+	npart->type = type;
+
+	/* In this function, we just process comments and data separately */
+	p = (const guchar *)data;
+	end = p + len;
+	c = p;
+	state = skip_spaces;
+	next_state = read_data;
+
+	while (p < end) {
+		switch (state) {
+		case skip_spaces:
+			if (!g_ascii_isspace (*p)) {
+				c = p;
+				state = next_state;
+			}
+			else {
+				p ++;
+			}
+			break;
+		case in_comment:
+			if (*p == '(') {
+				obraces ++;
+			}
+			else if (*p == ')') {
+				ebraces ++;
+
+				if (ebraces >= obraces) {
+					/* The outermost comment is closed, c points after its ( */
+					if (type != RSPAMD_RECEIVED_PART_UNKNOWN && p > c) {
+						struct rspamd_received_comment *comment;
+
+						comment = rspamd_mempool_alloc (task->task_pool,
+								sizeof (*comment));
+						comment->dlen = p - c;
+						comment->data = rspamd_mempool_alloc (task->task_pool,
+								comment->dlen);
+						memcpy (comment->data, c, comment->dlen);
+						rspamd_str_lc (comment->data, comment->dlen);
+						/* Comments are chained backwards from the tail */
+						comment->prev = npart->tail_comment;
+						npart->tail_comment = comment;
+
+						if (!npart->head_comment) {
+							npart->head_comment = comment;
+						}
+					}
+
+					p ++;
+					c = p;
+					state = skip_spaces;
+					next_state = read_data;
+
+					continue;
+				}
+			}
+
+			p ++;
+			break;
+		case read_data:
+			if (*p == '(') {
+				rspamd_smtp_received_part_set_data (task, npart, c, p);
+
+				state = in_comment;
+				obraces = 1;
+				ebraces = 0;
+				p ++;
+				c = p;
+			}
+			else if (g_ascii_isspace (*p)) {
+				rspamd_smtp_received_part_set_data (task, npart, c, p);
+
+				state = skip_spaces;
+				next_state = read_data;
+				c = p;
+			}
+			else if (*p == ';') {
+				/* It is actually delimiter of date part if not in the comments */
+				rspamd_smtp_received_part_set_data (task, npart, c, p);
+
+				state = all_done;
+				continue;
+			}
+			else if (npart->dlen > 0) {
+				/* We have already received data and find something with no ( */
+				state = all_done;
+				continue;
+			}
+			else {
+				p ++;
+			}
+			break;
+		case all_done:
+			/* p is left on the first byte that does not belong to this part */
+			*last = p - (const guchar *)data;
+			return npart;
+		}
+	}
+
+	/* Leftover: the input is exhausted, so tell the caller that all of it has
+	 * been consumed instead of leaving *last untouched */
+	switch (state) {
+	case read_data:
+		if (p > c) {
+			rspamd_smtp_received_part_set_data (task, npart, c, p);
+			*last = p - (const guchar *)data;
+
+			return npart;
+		}
+		break;
+	case skip_spaces:
+		*last = p - (const guchar *)data;
+
+		return npart;
+	default:
+		/* Input has ended inside a comment */
+		break;
+	}
+
+	return NULL;
+}
+
+/**
+ * Splits a Received header into the list of its parts (`from`, `by`, `with`,
+ * `for` and unrecognised ones), stopping at the `;` that precedes the date
+ * @param task task whose memory pool is used for the parts
+ * @param data header value, need not be NUL-terminated
+ * @param len length of `data`
+ * @param date_pos set to the offset right after `;`, untouched if none is met
+ * @return head of the parts list; NULL if the header does not start with
+ * `from`, a part cannot be processed or an unknown token ends the input
+ */
+static struct rspamd_received_part *
+rspamd_smtp_received_spill (struct rspamd_task *task,
+							const char *data,
+							size_t len,
+							goffset *date_pos)
+{
+	const guchar *p, *end;
+	struct rspamd_received_part *cur_part, *head = NULL;
+	enum rspamd_received_part_type type;
+	goffset pos;
+
+	p = (const guchar *)data;
+	end = p + len;
+
+	while (p < end && g_ascii_isspace (*p)) {
+		p ++;
+	}
+
+	len = end - p;
+
+	/* Ignore all Received headers but those starting with the `from` part */
+	if (len <= 4 || lc_map[p[0]] != 'f' || lc_map[p[1]] != 'r' ||
+			lc_map[p[2]] != 'o' || lc_map[p[3]] != 'm') {
+		return NULL;
+	}
+
+	p += sizeof ("from") - 1;
+
+	/* We can now store from part */
+	pos = end - p; /* everything is consumed unless the callee says otherwise */
+	cur_part = rspamd_smtp_received_process_part (task, p, end - p,
+			RSPAMD_RECEIVED_PART_FROM, &pos);
+
+	if (!cur_part) {
+		return NULL;
+	}
+
+	p += pos;
+	len = end > p ? end - p : 0;
+	DL_APPEND (head, cur_part);
+
+	if (len > 2 && lc_map[p[0]] == 'b' && lc_map[p[1]] == 'y') {
+		p += sizeof ("by") - 1;
+		pos = end - p;
+		cur_part = rspamd_smtp_received_process_part (task, p, end - p,
+				RSPAMD_RECEIVED_PART_BY, &pos);
+
+		if (!cur_part) {
+			return NULL;
+		}
+
+		p += pos;
+		len = end > p ? end - p : 0;
+		DL_APPEND (head, cur_part);
+	}
+
+	while (p < end) {
+		if (*p == ';') {
+			/* We are at the date separator, stop here */
+			*date_pos = p - (const guchar *)data + 1;
+			break;
+		}
+
+		if (len > sizeof ("with") && lc_map[p[0]] == 'w' &&
+				lc_map[p[1]] == 'i' && lc_map[p[2]] == 't' &&
+				lc_map[p[3]] == 'h') {
+			p += sizeof ("with") - 1;
+			type = RSPAMD_RECEIVED_PART_WITH;
+		}
+		else if (len > sizeof ("for") && lc_map[p[0]] == 'f' &&
+				lc_map[p[1]] == 'o' && lc_map[p[2]] == 'r') {
+			p += sizeof ("for") - 1;
+			type = RSPAMD_RECEIVED_PART_FOR;
+		}
+		else {
+			/* Skip an unrecognised token up to the next delimiter */
+			while (p < end &&
+					!(g_ascii_isspace (*p) || *p == '(' || *p == ';')) {
+				p ++;
+			}
+
+			if (p == end) {
+				return NULL;
+			}
+			else if (*p == ';') {
+				*date_pos = p - (const guchar *)data + 1;
+				break;
+			}
+
+			type = RSPAMD_RECEIVED_PART_UNKNOWN;
+		}
+
+		pos = end - p;
+		cur_part = rspamd_smtp_received_process_part (task, p, end - p,
+				type, &pos);
+
+		if (!cur_part) {
+			return NULL;
+		}
+
+		p += pos;
+		len = end > p ? end - p : 0;
+		DL_APPEND (head, cur_part);
+	}
+
+	return head;
+}
+
+/**
+ * Parses a Received header; so far it only splits the header into parts
+ * @param task task whose memory pool is used for the parts
+ * @param data header value
+ * @param len length of `data`
+ * @param rh target structure (not filled by this function yet)
+ * @return 0 if the header has been split into parts, -1 otherwise
+ */
+int
+rspamd_smtp_received_parse (struct rspamd_task *task,
+							const char *data,
+							size_t len,
+							struct received_header *rh)
+{
+	goffset date_pos = 0;
+
+	return rspamd_smtp_received_spill (task, data, len, &date_pos) ? 0 : -1;
+}
\ No newline at end of file
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h
index 3c0c23a36..ceed5ab06 100644
--- a/src/libmime/mime_headers.h
+++ b/src/libmime/mime_headers.h
@@ -18,6 +18,7 @@
 
 #include "config.h"
 #include "libutil/mem_pool.h"
+#include "libutil/addr.h"
 
 struct rspamd_task;
 
@@ -55,6 +56,36 @@ struct rspamd_mime_header {
 	gchar *decoded;
 };
 
+enum rspamd_received_type { /* type of struct received_header.type; NOTE(review): presumably the protocol a hop was received with - confirm */
+	RSPAMD_RECEIVED_SMTP = 0, /* explicitly the zero value */
+	RSPAMD_RECEIVED_ESMTP,
+	RSPAMD_RECEIVED_ESMTPA,
+	RSPAMD_RECEIVED_ESMTPS,
+	RSPAMD_RECEIVED_ESMTPSA,
+	RSPAMD_RECEIVED_LMTP,
+	RSPAMD_RECEIVED_IMAP,
+	RSPAMD_RECEIVED_UNKNOWN /* last enumerator */
+};
+
+#define RSPAMD_RECEIVED_FLAG_ARTIFICIAL (1 << 0) /* bit 0; NOTE(review): presumably stored in received_header.flags - confirm */
+#define RSPAMD_RECEIVED_FLAG_SSL (1 << 1) /* bit 1 */
+#define RSPAMD_RECEIVED_FLAG_AUTHENTICATED (1 << 2) /* bit 2 */
+
+struct received_header { /* passed as `rh` to rspamd_smtp_received_parse(); NOTE(review): field meanings are not visible in this diff - confirm against users */
+	gchar *from_hostname;
+	gchar *from_ip;
+	gchar *real_hostname;
+	gchar *real_ip;
+	gchar *by_hostname;
+	gchar *for_mbox;
+	gchar *comment_ip;
+	rspamd_inet_addr_t *addr; /* type comes from libutil/addr.h */
+	struct rspamd_mime_header *hdr; /* NOTE(review): presumably the mime header this record was built from - confirm */
+	time_t timestamp;
+	enum rspamd_received_type type;
+	gint flags; /* NOTE(review): presumably a mask of RSPAMD_RECEIVED_FLAG_* bits - confirm */
+};
+
 /**
  * Process headers and store them in `target`
  * @param task
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h
index fdd390f22..6904bece0 100644
--- a/src/libmime/smtp_parsers.h
+++ b/src/libmime/smtp_parsers.h
@@ -34,6 +34,9 @@ rspamd_rfc2047_parser (const gchar *in, gsize len, gint *pencoding,
 		const gchar **charset, gsize *charset_len,
 		const gchar **encoded, gsize *encoded_len);
 
+rspamd_inet_addr_t* rspamd_parse_smtp_ip (const char *data, size_t len,
+		rspamd_mempool_t *pool); /* NOTE(review): body is not visible here; presumably parses an IP address from data/len using pool - confirm */
+
 guint64 rspamd_parse_smtp_date (const char *data, size_t len);
 
 #endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */