commit 2f78615: [Feature] Core: Support RFC2231 encoding in headers

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jan 28 12:14:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-01-28 12:01:21 +0000
URL: https://github.com/rspamd/rspamd/commit/2f78615bf95f30484af69abd64db0e29765d1de1 (HEAD -> master)

[Feature] Core: Support RFC2231 encoding in headers

---
 src/libmime/content_type.c | 341 ++++++++++++++++++++++++++++++++++++++-------
 src/libmime/content_type.h |   9 ++
 src/libmime/mime_parser.c  |   6 +-
 3 files changed, 299 insertions(+), 57 deletions(-)

diff --git a/src/libmime/content_type.c b/src/libmime/content_type.c
index 6b99953f2..ca371ce30 100644
--- a/src/libmime/content_type.c
+++ b/src/libmime/content_type.c
@@ -17,72 +17,312 @@
 #include "libmime/content_type.h"
 #include "smtp_parsers.h"
 #include "utlist.h"
+#include "libserver/url.h"
+#include "libmime/mime_encoding.h"
 
-void
-rspamd_content_type_add_param (rspamd_mempool_t *pool,
-		struct rspamd_content_type *ct,
-		gchar *name_start, gchar *name_end,
+static gboolean
+rspamd_rfc2231_decode (rspamd_mempool_t *pool,
+		struct rspamd_content_type_param *param,
 		gchar *value_start, gchar *value_end)
 {
-	rspamd_ftok_t srch;
-	struct rspamd_content_type_param *found = NULL, *nparam;
+	gchar *quote_pos;
 
-	g_assert (ct != NULL);
+	quote_pos = memchr (value_start, '\'', value_end - value_start);
+
+	if (quote_pos == NULL) {
+		/* Plain percent encoding */
+		gsize r = rspamd_url_decode (value_start, value_start,
+				value_end - value_start);
+		param->value.begin = value_start;
+		param->value.len = r;
+	}
+	else {
+		/*
+		 * We can have encoding'language'data, or
+		 * encoding'data (in theory).
+		 * Try to handle both...
+		 */
+		const gchar *charset = NULL;
+		rspamd_ftok_t ctok;
 
+		ctok.begin = value_start;
+		ctok.len = quote_pos - value_start;
 
-	nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
-	nparam->name.begin = name_start;
-	nparam->name.len = name_end - name_start;
-	rspamd_str_lc (name_start, name_end - name_start);
+		charset = rspamd_mime_detect_charset (&ctok, pool);
+
+		if (charset == NULL) {
+			msg_warn_pool ("cannot convert parameter from charset %T", &ctok);
+
+			return FALSE;
+		}
+
+		/* Now, we can check for either next quote sign or, eh, ignore that */
+		value_start = quote_pos + 1;
+
+		quote_pos = memchr (value_start, '\'', value_end - value_start);
+
+		if (quote_pos) {
+			/* Ignore language */
+			value_start = quote_pos + 1;
+		}
+
+		/* Perform percent decoding */
+		gsize r = rspamd_url_decode (value_start, value_start,
+				value_end - value_start);
+		GError *err = NULL;
+
+		param->value.begin = rspamd_mime_text_to_utf8 (pool,
+				value_start, r,
+				charset, &param->value.len, &err);
+
+		if (param->value.begin == NULL) {
+			msg_warn_pool ("cannot convert parameter from charset %s: %e",
+					charset, err);
+
+			if (err) {
+				g_error_free (err);
+			}
+
+			return FALSE;
+		}
+	}
+
+	param->flags |= RSPAMD_CONTENT_PARAM_RFC2231;
+
+	return TRUE;
+}
+
+static gboolean
+rspamd_param_maybe_rfc2231_process (rspamd_mempool_t *pool,
+									struct rspamd_content_type_param *param,
+									gchar *name_start, gchar *name_end,
+									gchar *value_start, gchar *value_end)
+{
+	const gchar *star_pos;
+
+	star_pos = memchr (name_start, '*', name_end - name_start);
+
+	if (star_pos == NULL) {
+		return FALSE;
+	}
+
+	/* We have three possibilities here:
+	 * 1. name* (just name + 2231 encoding)
+	 * 2. name*(\d+) (piecewise stuff but no rfc2231 encoding)
+	 * 3. name*(\d+)* (piecewise stuff and rfc2231 encoding)
+	 */
+
+	if (star_pos == name_end - 1) {
+		/* First */
+		if (rspamd_rfc2231_decode (pool, param, value_start, value_end)) {
+			param->name.begin = name_start;
+			param->name.len = name_end - name_start - 1;
+		}
+	}
+	else if (*(name_end - 1) == '*') {
+		/* Third */
+		/* Check number */
+		gulong tmp;
+
+		if (!rspamd_strtoul (star_pos + 1, name_end - star_pos - 2, &tmp)) {
+			return FALSE;
+		}
+
+		param->flags |= RSPAMD_CONTENT_PARAM_PIECEWISE|RSPAMD_CONTENT_PARAM_RFC2231;
+		param->rfc2231_id = tmp;
+		param->name.begin = name_start;
+		param->name.len = star_pos - name_start;
+		param->value.begin = value_start;
+		param->value.len = value_end - value_start;
+
+		/* Deal with that later... */
+	}
+	else {
+		/* Second case */
+		gulong tmp;
+
+		if (!rspamd_strtoul (star_pos + 1, name_end - star_pos - 1, &tmp)) {
+			return FALSE;
+		}
+
+		param->flags |= RSPAMD_CONTENT_PARAM_PIECEWISE;
+		param->rfc2231_id = tmp;
+		param->name.begin = name_start;
+		param->name.len = star_pos - name_start;
+		param->value.begin = value_start;
+		param->value.len = value_end - value_start;
+	}
+
+	return TRUE;
+}
+
+static gint32
+rspamd_cmp_pieces (struct rspamd_content_type_param *p1, struct rspamd_content_type_param *p2)
+{
+	return p1->rfc2231_id - p2->rfc2231_id;
+}
+
+static void
+rspamd_postprocess_ct_attributes (rspamd_mempool_t *pool,
+								  GHashTable *htb,
+								  void (*proc)(rspamd_mempool_t *, struct rspamd_content_type_param *, gpointer ud),
+								  gpointer procd)
+{
+	GHashTableIter it;
+	gpointer k, v;
+	struct rspamd_content_type_param *param, *sorted, *cur;
+
+	if (htb == NULL) {
+		return;
+	}
+
+	g_hash_table_iter_init (&it, htb);
+
+	while (g_hash_table_iter_next (&it, &k, &v)) {
+		param = (struct rspamd_content_type_param *)v;
+
+		if (param->flags & RSPAMD_CONTENT_PARAM_PIECEWISE) {
+			/* Reconstruct param */
+			gsize tlen = 0;
+			gchar *ndata, *pos;
+
+			sorted = param;
+			DL_SORT (sorted, rspamd_cmp_pieces);
+
+			DL_FOREACH (sorted, cur) {
+				tlen += cur->value.len;
+			}
+
+			ndata = rspamd_mempool_alloc (pool, tlen);
+			pos = ndata;
+
+			DL_FOREACH (sorted, cur) {
+				memcpy (pos, cur->value.begin, cur->value.len);
+				pos += cur->value.len;
+			}
+
+			if (param->flags & RSPAMD_CONTENT_PARAM_RFC2231) {
+				if (!rspamd_rfc2231_decode (pool, param,
+						ndata, pos)) {
+					param->flags |= RSPAMD_CONTENT_PARAM_BROKEN;
+					param->value.begin = ndata;
+					param->value.len = tlen;
+				}
+			}
+			else {
+				param->value.begin = ndata;
+				param->value.len = tlen;
+			}
+
+			/* Detach from list */
+			param->next = NULL;
+			param->prev = param;
+		}
+
+		proc (pool, param, procd);
+	}
+}
+
+static void
+rspamd_content_type_postprocess (rspamd_mempool_t *pool,
+								 struct rspamd_content_type_param *param,
+								 gpointer ud)
+{
+	rspamd_ftok_t srch;
+	struct rspamd_content_type_param *found = NULL;
 
-	nparam->value.begin = value_start;
-	nparam->value.len = value_end - value_start;
+	struct rspamd_content_type *ct = (struct rspamd_content_type *)ud;
 
 	RSPAMD_FTOK_ASSIGN (&srch, "charset");
 
-	if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
+	if (rspamd_ftok_cmp (&param->name, &srch) == 0) {
 		/* Adjust charset */
-		found = nparam;
-		ct->charset.begin = nparam->value.begin;
-		ct->charset.len = nparam->value.len;
+		found = param;
+		ct->charset.begin = param->value.begin;
+		ct->charset.len = param->value.len;
 	}
 
 	RSPAMD_FTOK_ASSIGN (&srch, "boundary");
 
-	if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
-		found = nparam;
+	if (rspamd_ftok_cmp (&param->name, &srch) == 0) {
+		found = param;
 		gchar *lc_boundary;
 		/* Adjust boundary */
-		lc_boundary = rspamd_mempool_alloc (pool, nparam->value.len);
-		memcpy (lc_boundary, nparam->value.begin, nparam->value.len);
-		rspamd_str_lc (lc_boundary, nparam->value.len);
+		lc_boundary = rspamd_mempool_alloc (pool, param->value.len);
+		memcpy (lc_boundary, param->value.begin, param->value.len);
+		rspamd_str_lc (lc_boundary, param->value.len);
 		ct->boundary.begin = lc_boundary;
-		ct->boundary.len = nparam->value.len;
+		ct->boundary.len = param->value.len;
 		/* Preserve original (case sensitive) boundary */
-		ct->orig_boundary.begin = nparam->value.begin;
-		ct->orig_boundary.len = nparam->value.len;
+		ct->orig_boundary.begin = param->value.begin;
+		ct->orig_boundary.len = param->value.len;
 	}
 
 	if (!found) {
-		srch.begin = nparam->name.begin;
-		srch.len = nparam->name.len;
+		/* Just lowercase */
+		rspamd_str_lc ((gchar *)param->value.begin, param->value.len);
+	}
+}
 
-		rspamd_str_lc (value_start, value_end - value_start);
+static void
+rspamd_content_disposition_postprocess (rspamd_mempool_t *pool,
+										struct rspamd_content_type_param *param,
+										gpointer ud)
+{
+	rspamd_ftok_t srch;
+	struct rspamd_content_disposition *cd = (struct rspamd_content_disposition *)ud;
 
-		if (ct->attrs) {
-			found = g_hash_table_lookup (ct->attrs, &srch);
-		} else {
-			ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
-					rspamd_ftok_icase_equal);
-		}
+	srch.begin = "filename";
+	srch.len = 8;
 
-		if (!found) {
-			DL_APPEND (found, nparam);
-			g_hash_table_insert (ct->attrs, &nparam->name, nparam);
-		}
-		else {
-			DL_APPEND (found, nparam);
-		}
+	if (rspamd_ftok_cmp (&param->name, &srch) == 0) {
+		/* Adjust filename */
+		cd->filename.begin = param->value.begin;
+		cd->filename.len = param->value.len;
+	}
+}
+
+void
+rspamd_content_type_add_param (rspamd_mempool_t *pool,
+		struct rspamd_content_type *ct,
+		gchar *name_start, gchar *name_end,
+		gchar *value_start, gchar *value_end)
+{
+	struct rspamd_content_type_param *nparam;
+	rspamd_ftok_t srch;
+	struct rspamd_content_type_param *found = NULL;
+
+	g_assert (ct != NULL);
+
+	nparam = rspamd_mempool_alloc0 (pool, sizeof (*nparam));
+	rspamd_str_lc (name_start, name_end - name_start);
+
+	if (!rspamd_param_maybe_rfc2231_process (pool, nparam, name_start,
+			name_end, value_start, value_end)) {
+		nparam->name.begin = name_start;
+		nparam->name.len = name_end - name_start;
+		nparam->value.begin = value_start;
+		nparam->value.len = value_end - value_start;
+	}
+
+	RSPAMD_FTOK_ASSIGN (&srch, "charset");
+
+	srch.begin = nparam->name.begin;
+	srch.len = nparam->name.len;
+
+	if (ct->attrs) {
+		found = g_hash_table_lookup (ct->attrs, &srch);
+	} else {
+		ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
+				rspamd_ftok_icase_equal);
+	}
+
+	if (!found) {
+		DL_APPEND (found, nparam);
+		g_hash_table_insert (ct->attrs, &nparam->name, nparam);
+	}
+	else {
+		DL_APPEND (found, nparam);
 	}
 }
 
@@ -361,9 +601,6 @@ rspamd_content_type_parser (gchar *in, gsize len, rspamd_mempool_t *pool)
 	if (val.type.len > 0) {
 		res = rspamd_mempool_alloc (pool, sizeof (val));
 		memcpy (res, &val, sizeof (val));
-
-		/* Lowercase common thingies */
-
 	}
 
 	return res;
@@ -384,6 +621,9 @@ rspamd_content_type_parse (const gchar *in,
 		if (res->attrs) {
 			rspamd_mempool_add_destructor (pool,
 					(rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
+
+			rspamd_postprocess_ct_attributes (pool, res->attrs,
+					rspamd_content_type_postprocess, res);
 		}
 
 		/* Now do some hacks to work with broken content types */
@@ -491,7 +731,7 @@ rspamd_content_disposition_add_param (rspamd_mempool_t *pool,
 				(rspamd_mempool_destruct_t)g_hash_table_unref, cd->attrs);
 	}
 
-	nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
+	nparam = rspamd_mempool_alloc0 (pool, sizeof (*nparam));
 	nparam->name.begin = name_start;
 	nparam->name.len = name_end - name_start;
 	decoded = rspamd_mime_header_decode (pool, value_start,
@@ -503,15 +743,6 @@ rspamd_content_disposition_add_param (rspamd_mempool_t *pool,
 	}
 
 	DL_APPEND (found, nparam);
-
-	srch.begin = "filename";
-	srch.len = 8;
-
-	if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
-		/* Adjust filename */
-		cd->filename.begin = nparam->value.begin;
-		cd->filename.len = nparam->value.len;
-	}
 }
 
 struct rspamd_content_disposition *
@@ -526,6 +757,8 @@ rspamd_content_disposition_parse (const gchar *in,
 		res->lc_data = rspamd_mempool_alloc (pool, len + 1);
 		rspamd_strlcpy (res->lc_data, in, len + 1);
 		rspamd_str_lc (res->lc_data, len);
+		rspamd_postprocess_ct_attributes (pool, res->attrs,
+				rspamd_content_disposition_postprocess, res);
 	}
 	else {
 		msg_warn_pool ("cannot parse content disposition: %*s",
diff --git a/src/libmime/content_type.h b/src/libmime/content_type.h
index 554aad6a1..68b1b8107 100644
--- a/src/libmime/content_type.h
+++ b/src/libmime/content_type.h
@@ -34,9 +34,18 @@ enum rspamd_content_type_flags {
 #define IS_CT_TEXT(ct) ((ct) && ((ct)->flags & RSPAMD_CONTENT_TYPE_TEXT))
 #define IS_CT_MESSAGE(ct) ((ct) &&((ct)->flags & RSPAMD_CONTENT_TYPE_MESSAGE))
 
+enum rspamd_content_param_flags {
+	RSPAMD_CONTENT_PARAM_NORMAL = 0,
+	RSPAMD_CONTENT_PARAM_RFC2231 = (1 << 0),
+	RSPAMD_CONTENT_PARAM_PIECEWISE = (1 << 1),
+	RSPAMD_CONTENT_PARAM_BROKEN = (1 << 2),
+};
+
 struct rspamd_content_type_param {
 	rspamd_ftok_t name;
 	rspamd_ftok_t value;
+	guint rfc2231_id;
+	enum rspamd_content_param_flags flags;
 	struct rspamd_content_type_param *prev, *next;
 };
 
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
index 82a0cbafb..538e55d42 100644
--- a/src/libmime/mime_parser.c
+++ b/src/libmime/mime_parser.c
@@ -376,9 +376,6 @@ rspamd_mime_part_get_cd (struct rspamd_task *task, struct rspamd_mime_part *part
 					task->task_pool);
 
 			if (cd) {
-				msg_debug_mime ("processed content disposition: %s",
-						cd->lc_data);
-
 				/* We still need to check filename */
 				if (cd->filename.len == 0) {
 					if (part->ct && part->ct->attrs) {
@@ -397,6 +394,9 @@ rspamd_mime_part_get_cd (struct rspamd_task *task, struct rspamd_mime_part *part
 						}
 					}
 				}
+
+				msg_debug_mime ("processed content disposition: %s, file: \"%T\"",
+						cd->lc_data, &cd->filename);
 				break;
 			}
 		}


More information about the Commits mailing list