commit 56e236e: [Rework] No more magic

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Sep 9 16:14:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-09-09 16:26:16 +0100
URL: https://github.com/rspamd/rspamd/commit/56e236efa012c4be6b3893314ce4d3a570e16327

[Rework] No more magic

---
 CMakeLists.txt           |   2 -
 src/libmime/message.c    | 225 ++++++++++++++++++++++++-----------------------
 src/libmime/message.h    |   2 +
 src/libserver/cfg_file.h |   1 -
 src/libserver/cfg_rcl.c  |   6 --
 src/libutil/util.c       |  37 --------
 src/rspamd.h             |   3 -
 7 files changed, 116 insertions(+), 160 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22c4b817b..952214391 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -638,8 +638,6 @@ ProcessPackage(LIBCRYPT LIBRARY crypto INCLUDE openssl/evp.h
 	ROOT ${OPENSSL_ROOT_DIR} MODULES openssl libcrypt)
 ProcessPackage(LIBSSL LIBRARY ssl INCLUDE openssl/ssl.h
 	ROOT ${OPENSSL_ROOT_DIR} MODULES openssl libssl)
-ProcessPackage(MAGIC LIBRARY magic INCLUDE magic.h INCLUDE_SUFFIXES include/libmagic
-	ROOT ${LIBMAGIC_ROOT_DIR} MODULES magic)
 ProcessPackage(LIBZ LIBRARY z INCLUDE zlib.h INCLUDE_SUFFIXES include/zlib
 		ROOT ${LIBZ_ROOT_DIR} MODULES z)
 ProcessPackage(SODIUM LIBRARY sodium INCLUDE sodium.h
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 92fa1f51b..00067ee83 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -818,98 +818,19 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 
 	if (IS_CT_TEXT (mime_part->ct) && (!mime_part->detected_ct ||
 									   IS_CT_TEXT (mime_part->detected_ct))) {
+		found_txt = TRUE;
+
 		html_tok.begin = "html";
 		html_tok.len = 4;
 		xhtml_tok.begin = "xhtml";
 		xhtml_tok.len = 5;
 
 		if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0 ||
-				rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0) {
+			rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0 ||
+			(mime_part->detected_ct &&
+				rspamd_ftok_casecmp (&mime_part->detected_ct->subtype, &html_tok) == 0)) {
 			found_html = TRUE;
 		}
-		else {
-			/*
-			 * We also need to apply heuristic for text parts that are actually
-			 * HTML.
-			 */
-			RSPAMD_FTOK_ASSIGN (&html_tok, "<!DOCTYPE html");
-			RSPAMD_FTOK_ASSIGN (&xhtml_tok, "<html");
-
-			if (mime_part->parsed_data.len >= xhtml_tok.len &&
-					rspamd_lc_cmp (mime_part->parsed_data.begin,
-							xhtml_tok.begin, xhtml_tok.len) == 0) {
-				found_html = TRUE;
-			}
-			else if (mime_part->parsed_data.len >= html_tok.len &&
-					rspamd_lc_cmp (mime_part->parsed_data.begin,
-							html_tok.begin, html_tok.len) == 0) {
-				found_html = TRUE;
-			}
-			else {
-				/* We need to be extra careful with some stupid things here */
-
-				html_tok.begin = "plain";
-				html_tok.len = 5;
-
-				if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0) {
-					found_txt = TRUE;
-				}
-				else {
-					if (mime_part->cd && mime_part->cd->filename.len > 4) {
-						const gchar *pos = mime_part->cd->filename.begin +
-										   mime_part->cd->filename.len -
-										   sizeof (".txt") + 1;
-						if (rspamd_lc_cmp (pos, ".txt", sizeof ("txt") - 1) == 0) {
-							found_txt = TRUE;
-						}
-						else {
-							msg_debug_task ("found mime part with incorrect content-type: %T/%T, "
-										   "filename: %T",
-									&mime_part->ct->type,
-									&mime_part->ct->subtype,
-									&mime_part->cd->filename);
-						}
-					}
-					else {
-						/* For something like Content-Type: text */
-						found_txt = TRUE;
-					}
-				}
-			}
-
-			if (found_html) {
-				msg_info_task ("found html part pretending to be text/plain part");
-			}
-		}
-	}
-	else {
-		/* Apply heuristic */
-
-		if (mime_part->cd && mime_part->cd->filename.len > 4) {
-			const gchar *pos = mime_part->cd->filename.begin +
-					mime_part->cd->filename.len - sizeof (".htm") + 1;
-
-			if (rspamd_lc_cmp (pos, ".htm", sizeof (".htm") - 1) == 0) {
-				found_html = TRUE;
-			}
-			else if (rspamd_lc_cmp (pos, ".txt", sizeof ("txt") - 1) == 0) {
-				found_txt = TRUE;
-			}
-			else if ( mime_part->cd->filename.len > 5) {
-				pos = mime_part->cd->filename.begin +
-						mime_part->cd->filename.len - sizeof (".html") + 1;
-				if (rspamd_lc_cmp (pos, ".html", sizeof (".html") - 1) == 0) {
-					found_html = TRUE;
-				}
-			}
-		}
-
-		if (found_txt || found_html) {
-			msg_info_task ("found %s part with incorrect content-type: %T/%T",
-					found_html ? "html" : "text",
-					&mime_part->ct->type, &mime_part->ct->subtype);
-			mime_part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
-		}
 	}
 
 	/* Skip attachments */
@@ -1006,7 +927,7 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 {
 	struct rspamd_content_type *ct = NULL;
 	struct rspamd_mime_part *part;
-	const char *mb = NULL;
+	const char *mb = "application/octet-stream";
 	gchar *mid;
 	rspamd_ftok_t srch, *tok;
 	gchar cdbuf[1024];
@@ -1015,6 +936,14 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 
 	part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part));
 
+	part->raw_data.begin = start;
+	part->raw_data.len = len;
+	part->parsed_data.begin = start;
+	part->parsed_data.len = len;
+	part->id = MESSAGE_FIELD (task, parts)->len;
+	part->raw_headers = rspamd_message_headers_new ();
+	part->headers_order = NULL;
+
 	tok = rspamd_task_get_request_header (task, "Content-Type");
 
 	if (tok) {
@@ -1023,11 +952,42 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 				task->task_pool);
 		part->ct = ct;
 	}
+	else if (task->cfg && task->cfg->libs_ctx) {
+		lua_State *L = task->cfg->lua_state;
+
+		if (rspamd_lua_require_function (L,
+				"lua_magic", "detect_mime_part")) {
+
+			struct rspamd_mime_part **pmime;
+			struct rspamd_task **ptask;
 
-	if (task->cfg && task->cfg->libs_ctx) {
-		mb = magic_buffer (task->cfg->libs_ctx->libmagic,
-				start,
-				len);
+			pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
+			rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
+			*pmime = part;
+			ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
+			rspamd_lua_setclass (L, "rspamd{task}", -1);
+			*ptask = task;
+
+			if (lua_pcall (L, 2, 2, 0) != 0) {
+				msg_err_task ("cannot detect type: %s", lua_tostring (L, -1));
+			}
+			else {
+				if (lua_istable (L, -1)) {
+					lua_pushstring (L, "ct");
+					lua_gettable (L, -2);
+
+					if (lua_isstring (L, -1)) {
+						mb = rspamd_mempool_strdup (task->task_pool,
+								lua_tostring (L, -1));
+					}
+				}
+			}
+
+			lua_settop (L, 0);
+		}
+		else {
+			msg_err_task ("cannot require lua_magic.detect_mime_part");
+		}
 
 		if (mb) {
 			srch.begin = mb;
@@ -1059,13 +1019,6 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 		}
 	}
 
-	part->raw_data.begin = start;
-	part->raw_data.len = len;
-	part->parsed_data.begin = start;
-	part->parsed_data.len = len;
-	part->id = MESSAGE_FIELD (task, parts)->len;
-	part->raw_headers = rspamd_message_headers_new ();
-	part->headers_order = NULL;
 
 	tok = rspamd_task_get_request_header (task, "Filename");
 
@@ -1408,31 +1361,81 @@ rspamd_message_process (struct rspamd_task *task)
 	gdouble diff, *pdiff;
 	guint tw, *ptw, dw;
 	struct rspamd_mime_part *part;
+	lua_State *L = task->cfg->lua_state;
+	gint func_pos = -1;
 
 	rspamd_images_process (task);
 	rspamd_archives_process (task);
 
+	if (rspamd_lua_require_function (L,
+			"lua_magic", "detect_mime_part")) {
+		func_pos = lua_gettop (L);
+	}
+	else {
+		msg_err_task ("cannot require lua_magic.detect_mime_part");
+	}
+
 	PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
-		if (!rspamd_message_process_text_part_maybe (task, part) &&
-				part->parsed_data.len > 0) {
-			if (task->cfg) {
-				const gchar *mb = magic_buffer (task->cfg->libs_ctx->libmagic,
-						part->parsed_data.begin,
-						part->parsed_data.len);
-
-				if (mb) {
-					rspamd_ftok_t srch;
-
-					srch.begin = mb;
-					srch.len = strlen (mb);
-					part->detected_ct = rspamd_content_type_parse (srch.begin,
-							srch.len,
-							task->task_pool);
+		if (func_pos != -1) {
+			struct rspamd_mime_part **pmime;
+			struct rspamd_task **ptask;
+
+			lua_pushvalue (L, func_pos);
+			pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
+			rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
+			*pmime = part;
+			ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
+			rspamd_lua_setclass (L, "rspamd{task}", -1);
+			*ptask = task;
+
+			if (lua_pcall (L, 2, 2, 0) != 0) {
+				msg_err_task ("cannot detect type: %s", lua_tostring (L, -1));
+			}
+			else {
+				if (lua_istable (L, -1)) {
+					const gchar *mb;
+
+					/* First returned value */
+					part->detected_ext = rspamd_mempool_strdup (task->task_pool,
+							lua_tostring (L, -2));
+
+					lua_pushstring (L, "ct");
+					lua_gettable (L, -2);
+
+					if (lua_isstring (L, -1)) {
+						mb = lua_tostring (L, -1);
+
+						if (mb) {
+							rspamd_ftok_t srch;
+
+							srch.begin = mb;
+							srch.len = strlen (mb);
+							part->detected_ct = rspamd_content_type_parse (srch.begin,
+									srch.len,
+									task->task_pool);
+						}
+					}
+
+					lua_pop (L, 1);
+
+					lua_pushstring (L, "type");
+					lua_gettable (L, -2);
+
+					if (lua_isstring (L, -1)) {
+						part->detected_type = rspamd_mempool_strdup (task->task_pool,
+								lua_tostring (L, -1));
+					}
 				}
 			}
+
+			lua_settop (L, func_pos);
 		}
+
+		rspamd_message_process_text_part_maybe (task, part);
 	}
 
+	lua_settop (L, 0);
+
 	/* Calculate average words length and number of short words */
 	struct rspamd_mime_text_part *text_part;
 	gdouble *var;
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 651e1d457..374d3a7f9 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -56,6 +56,8 @@ struct rspamd_mime_multipart {
 struct rspamd_mime_part {
 	struct rspamd_content_type *ct;
 	struct rspamd_content_type *detected_ct;
+	gchar *detected_type;
+	gchar *detected_ext;
 	struct rspamd_content_disposition *cd;
 	rspamd_ftok_t raw_data;
 	rspamd_ftok_t parsed_data;
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 4faca7b56..263d00f38 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -437,7 +437,6 @@ struct rspamd_config {
 	gchar *history_file;                           /**< file to save rolling history						*/
 	gchar *tld_file;                               /**< file to load effective tld list from				*/
 	gchar *hs_cache_dir;                           /**< directory to save hyperscan databases				*/
-	gchar *magic_file;                             /**< file to initialize libmagic						*/
 
 	gdouble dns_timeout;                            /**< timeout in milliseconds for waiting for dns reply	*/
 	guint32 dns_retransmits;                        /**< maximum retransmits count							*/
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index 2bdb6adc6..fb2cbf052 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -2092,12 +2092,6 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
 				G_STRUCT_OFFSET (struct rspamd_config, ssl_ciphers),
 				0,
 				"List of ssl ciphers (e.g. HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4)");
-		rspamd_rcl_add_default_handler (sub,
-				"magic_file",
-				rspamd_rcl_parse_struct_string,
-				G_STRUCT_OFFSET (struct rspamd_config, magic_file),
-				0,
-				"Path to a custom libmagic file");
 		rspamd_rcl_add_default_handler (sub,
 				"max_message",
 				rspamd_rcl_parse_struct_integer,
diff --git a/src/libutil/util.c b/src/libutil/util.c
index 86358e46e..7877582c2 100644
--- a/src/libutil/util.c
+++ b/src/libutil/util.c
@@ -2364,35 +2364,6 @@ rspamd_init_libs (void)
 	rlim.rlim_max = rlim.rlim_cur;
 	setrlimit (RLIMIT_STACK, &rlim);
 
-	gint magic_flags = 0;
-
-	/* Unless trusty and other crap is supported... */
-#if 0
-#ifdef MAGIC_NO_CHECK_BUILTIN
-	magic_flags = MAGIC_NO_CHECK_BUILTIN;
-#endif
-#endif
-	magic_flags |= MAGIC_MIME|MAGIC_NO_CHECK_COMPRESS|
-				   MAGIC_NO_CHECK_ELF|MAGIC_NO_CHECK_TAR;
-#ifdef MAGIC_NO_CHECK_CDF
-	magic_flags |= MAGIC_NO_CHECK_CDF;
-#endif
-#ifdef MAGIC_NO_CHECK_ENCODING
-	magic_flags |= MAGIC_NO_CHECK_ENCODING;
-#endif
-#ifdef MAGIC_NO_CHECK_TAR
-	magic_flags |= MAGIC_NO_CHECK_TAR;
-#endif
-#ifdef MAGIC_NO_CHECK_TEXT
-	magic_flags |= MAGIC_NO_CHECK_TEXT;
-#endif
-#ifdef MAGIC_NO_CHECK_TOKENS
-	magic_flags |= MAGIC_NO_CHECK_TOKENS;
-#endif
-#ifdef MAGIC_NO_CHECK_JSON
-	magic_flags |= MAGIC_NO_CHECK_JSON;
-#endif
-	ctx->libmagic = magic_open (magic_flags);
 	ctx->local_addrs = rspamd_inet_library_init ();
 	REF_INIT_RETAIN (ctx, rspamd_deinit_libs);
 
@@ -2473,10 +2444,6 @@ rspamd_config_libs (struct rspamd_external_libs_ctx *ctx,
 			}
 		}
 
-		if (ctx->libmagic) {
-			magic_load (ctx->libmagic, cfg->magic_file);
-		}
-
 		rspamd_free_zstd_dictionary (ctx->in_dict);
 		rspamd_free_zstd_dictionary (ctx->out_dict);
 
@@ -2586,10 +2553,6 @@ void
 rspamd_deinit_libs (struct rspamd_external_libs_ctx *ctx)
 {
 	if (ctx != NULL) {
-		if (ctx->libmagic) {
-			magic_close (ctx->libmagic);
-		}
-
 		g_free (ctx->ottery_cfg);
 
 #ifdef HAVE_OPENSSL
diff --git a/src/rspamd.h b/src/rspamd.h
index 0a0fb45fc..ea11965fb 100644
--- a/src/rspamd.h
+++ b/src/rspamd.h
@@ -33,8 +33,6 @@
 #include "libserver/task.h"
 
 #include <openssl/ssl.h>
-#include <magic.h>
-
 
 /* Default values */
 #define FIXED_CONFIG_FILE RSPAMD_CONFDIR "/rspamd.conf"
@@ -353,7 +351,6 @@ struct zstd_dictionary {
 struct rspamd_radix_map_helper;
 
 struct rspamd_external_libs_ctx {
-	magic_t libmagic;
 	struct rspamd_radix_map_helper **local_addrs;
 	struct rspamd_cryptobox_library_ctx *crypto_ctx;
 	struct ottery_config *ottery_cfg;


More information about the Commits mailing list