commit 1912eac: [Feature] Core: Add libmagic detection for all parts

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Dec 27 18:28:06 UTC 2018


Author: Vsevolod Stakhov
Date: 2018-12-11 12:01:52 +0000
URL: https://github.com/rspamd/rspamd/commit/1912eac2d678b2993b4ef1fa41e36ca7a38e8239

[Feature] Core: Add libmagic detection for all parts

---
 src/libmime/archives.c |  4 +--
 src/libmime/message.c  | 51 ++++++++++++++++++++++-------
 src/libmime/message.h  |  1 +
 src/lua/lua_mimepart.c | 87 ++++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 113 insertions(+), 30 deletions(-)

diff --git a/src/libmime/archives.c b/src/libmime/archives.c
index 9cfce6968..1f9a5c634 100644
--- a/src/libmime/archives.c
+++ b/src/libmime/archives.c
@@ -1509,8 +1509,8 @@ rspamd_archive_cheat_detect (struct rspamd_mime_part *part, const gchar *str,
 		}
 
 		if (magic_start != NULL) {
-			if (part->parsed_data.len > magic_len && memcmp (part->parsed_data.begin,
-					magic_start, magic_len) == 0) {
+			if (part->parsed_data.len > magic_len &&
+				memcmp (part->parsed_data.begin, magic_start, magic_len) == 0) {
 				return TRUE;
 			}
 		}
diff --git a/src/libmime/message.c b/src/libmime/message.c
index a5faaf017..bbae5e426 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -703,7 +703,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
 	return TRUE;
 }
 
-static void
+static gboolean
 rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 										struct rspamd_mime_part *mime_part)
 {
@@ -812,11 +812,11 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 			mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT &&
 			(task->cfg && !task->cfg->check_text_attachements)) {
 		debug_task ("skip attachments for checking as text parts");
-		return;
+		return TRUE;
 	}
 	else if (!(found_txt || found_html)) {
 		/* Not a text part */
-		return;
+		return FALSE;
 	}
 
 	text_part = rspamd_mempool_alloc0 (task->task_pool,
@@ -830,12 +830,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 
 	if (found_html) {
 		if (!rspamd_message_process_html_text_part (task, text_part)) {
-			return;
+			return FALSE;
 		}
 	}
 	else {
 		if (!rspamd_message_process_plain_text_part (task, text_part)) {
-			return;
+			return FALSE;
 		}
 	}
 
@@ -866,7 +866,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 
 		rspamd_task_insert_result (task, GTUBE_SYMBOL, 0, NULL);
 
-		return;
+		return TRUE;
 	}
 
 	/* Post process part */
@@ -885,6 +885,8 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 	}
 
 	rspamd_mime_part_create_words (task, text_part);
+
+	return TRUE;
 }
 
 /* Creates message from various data using libmagic to detect type */
@@ -900,15 +902,18 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 
 	g_assert (start != NULL);
 
+	part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part));
+
 	tok = rspamd_task_get_request_header (task, "Content-Type");
 
 	if (tok) {
 		/* We have Content-Type defined */
 		ct = rspamd_content_type_parse (tok->begin, tok->len,
 				task->task_pool);
+		part->ct = ct;
 	}
-	else if (task->cfg && task->cfg->libs_ctx) {
-		/* Try to predict it by content (slow) */
+
+	if (task->cfg && task->cfg->libs_ctx) {
 		mb = magic_buffer (task->cfg->libs_ctx->libmagic,
 				start,
 				len);
@@ -918,12 +923,16 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 			srch.len = strlen (mb);
 			ct = rspamd_content_type_parse (srch.begin, srch.len,
 					task->task_pool);
+			msg_warn_task ("construct fake mime of type: %s", mb);
+
+			if (!part->ct) {
+				part->ct = ct;
+			}
+
+			part->detected_ct = ct;
 		}
 	}
 
-	msg_warn_task ("construct fake mime of type: %s", mb);
-	part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part));
-	part->ct = ct;
 	part->raw_data.begin = start;
 	part->raw_data.len = len;
 	part->parsed_data.begin = start;
@@ -1189,7 +1198,25 @@ rspamd_message_process (struct rspamd_task *task)
 		struct rspamd_mime_part *part;
 
 		part = g_ptr_array_index (task->parts, i);
-		rspamd_message_process_text_part_maybe (task, part);
+
+
+		if (!rspamd_message_process_text_part_maybe (task, part) &&
+				part->parsed_data.len > 0) {
+			const gchar *mb = magic_buffer (task->cfg->libs_ctx->libmagic,
+					part->parsed_data.begin,
+					part->parsed_data.len);
+
+			if (mb) {
+				rspamd_ftok_t srch;
+
+				srch.begin = mb;
+				srch.len = strlen (mb);
+				part->detected_ct = rspamd_content_type_parse (srch.begin,
+						srch.len,
+						task->task_pool);
+			}
+
+		}
 	}
 
 	rspamd_images_process (task);
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 29f777c3b..25c88cc3a 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -47,6 +47,7 @@ struct rspamd_mime_multipart {
 
 struct rspamd_mime_part {
 	struct rspamd_content_type *ct;
+	struct rspamd_content_type *detected_ct;
 	struct rspamd_content_disposition *cd;
 	rspamd_ftok_t raw_data;
 	rspamd_ftok_t parsed_data;
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 3617a145b..3019cf577 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -333,6 +333,20 @@ LUA_FUNCTION_DEF (mimepart, get_type);
  */
 LUA_FUNCTION_DEF (mimepart, get_type_full);
 
+/***
+ * @method mime_part:get_detected_type()
+ * Extract the content type of the mime part as detected by libmagic
+ * @return {string,string} content type in form 'type','subtype'
+ */
+LUA_FUNCTION_DEF (mimepart, get_detected_type);
+
+/***
+ * @method mime_part:get_detected_type_full()
+ * Extract the content type of the mime part, with all attributes, as detected by libmagic
+ * @return {string,string,table} content type in form 'type','subtype', {attrs}
+ */
+LUA_FUNCTION_DEF (mimepart, get_detected_type_full);
+
 /***
  * @method mime_part:get_cte()
  * Extract content-transfer-encoding for a part
@@ -457,6 +471,8 @@ static const struct luaL_reg mimepartlib_m[] = {
 	LUA_INTERFACE_DEF (mimepart, get_length),
 	LUA_INTERFACE_DEF (mimepart, get_type),
 	LUA_INTERFACE_DEF (mimepart, get_type_full),
+	LUA_INTERFACE_DEF (mimepart, get_detected_type),
+	LUA_INTERFACE_DEF (mimepart, get_detected_type_full),
 	LUA_INTERFACE_DEF (mimepart, get_cte),
 	LUA_INTERFACE_DEF (mimepart, get_filename),
 	LUA_INTERFACE_DEF (mimepart, get_header),
@@ -1189,48 +1205,49 @@ lua_mimepart_get_length (lua_State * L)
 }
 
 static gint
-lua_mimepart_get_type_common (lua_State * L, gboolean full)
+lua_mimepart_get_type_common (lua_State * L, struct rspamd_content_type *ct,
+		gboolean full)
 {
-	struct rspamd_mime_part *part = lua_check_mimepart (L);
+
 	GHashTableIter it;
 	gpointer k, v;
 	struct rspamd_content_type_param *param;
 
-	if (part == NULL) {
+	if (ct == NULL) {
 		lua_pushnil (L);
 		lua_pushnil (L);
 		return 2;
 	}
 
-	lua_pushlstring (L, part->ct->type.begin, part->ct->type.len);
-	lua_pushlstring (L, part->ct->subtype.begin, part->ct->subtype.len);
+	lua_pushlstring (L, ct->type.begin, ct->type.len);
+	lua_pushlstring (L, ct->subtype.begin, ct->subtype.len);
 
 	if (!full) {
 		return 2;
 	}
 
-	lua_createtable (L, 0, 2 + (part->ct->attrs ?
-			g_hash_table_size (part->ct->attrs) : 0));
+	lua_createtable (L, 0, 2 + (ct->attrs ?
+			g_hash_table_size (ct->attrs) : 0));
 
-	if (part->ct->charset.len > 0) {
+	if (ct->charset.len > 0) {
 		lua_pushstring (L, "charset");
-		lua_pushlstring (L, part->ct->charset.begin, part->ct->charset.len);
+		lua_pushlstring (L, ct->charset.begin, ct->charset.len);
 		lua_settable (L, -3);
 	}
 
-	if (part->ct->boundary.len > 0) {
+	if (ct->boundary.len > 0) {
 		lua_pushstring (L, "charset");
-		lua_pushlstring (L, part->ct->boundary.begin, part->ct->boundary.len);
+		lua_pushlstring (L, ct->boundary.begin, ct->boundary.len);
 		lua_settable (L, -3);
 	}
 
-	if (part->ct->attrs) {
-		g_hash_table_iter_init (&it, part->ct->attrs);
+	if (ct->attrs) {
+		g_hash_table_iter_init (&it, ct->attrs);
 
 		while (g_hash_table_iter_next (&it, &k, &v)) {
 			param = v;
 
-			if (param->name.len > 0 && param->name.len > 0) {
+			if (param->name.len > 0 && param->value.len > 0) {
 				/* TODO: think about multiple values here */
 				lua_pushlstring (L, param->name.begin, param->name.len);
 				lua_pushlstring (L, param->value.begin, param->value.len);
@@ -1246,14 +1263,52 @@ static gint
 lua_mimepart_get_type (lua_State * L)
 {
 	LUA_TRACE_POINT;
-	return lua_mimepart_get_type_common (L, FALSE);
+	struct rspamd_mime_part *part = lua_check_mimepart (L);
+
+	if (part == NULL) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return lua_mimepart_get_type_common (L, part->ct, FALSE);
 }
 
 static gint
 lua_mimepart_get_type_full (lua_State * L)
 {
 	LUA_TRACE_POINT;
-	return lua_mimepart_get_type_common (L, TRUE);
+	struct rspamd_mime_part *part = lua_check_mimepart (L);
+
+	if (part == NULL) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return lua_mimepart_get_type_common (L, part->ct, TRUE);
+}
+
+static gint
+lua_mimepart_get_detected_type (lua_State * L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_mime_part *part = lua_check_mimepart (L);
+
+	if (part == NULL) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return lua_mimepart_get_type_common (L, part->detected_ct, FALSE);
+}
+
+static gint
+lua_mimepart_get_detected_type_full (lua_State * L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_mime_part *part = lua_check_mimepart (L);
+
+	if (part == NULL) {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return lua_mimepart_get_type_common (L, part->detected_ct, TRUE);
 }
 
 static gint


More information about the Commits mailing list