commit ec439ea: [Feature] Core: Detect charset in archived files

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Dec 28 08:21:03 UTC 2018


Author: Vsevolod Stakhov
Date: 2018-12-28 07:53:42 +0000
URL: https://github.com/rspamd/rspamd/commit/ec439ea252e82a6500bbc589d09599ce8db46fdf

[Feature] Core: Detect charset in archived files

---
 src/libmime/archives.c      | 102 +++++++++++++++++++++++++++++++++++++++-----
 src/libmime/mime_encoding.c |  10 ++---
 src/libmime/mime_encoding.h |  38 +++++++++++++++++
 3 files changed, 134 insertions(+), 16 deletions(-)

diff --git a/src/libmime/archives.c b/src/libmime/archives.c
index 183232e6f..8497fdf70 100644
--- a/src/libmime/archives.c
+++ b/src/libmime/archives.c
@@ -18,9 +18,12 @@
 #include "message.h"
 #include "task.h"
 #include "archives.h"
+#include "libmime/mime_encoding.h"
 #include <unicode/uchar.h>
 #include <unicode/utf8.h>
 #include <unicode/utf16.h>
+#include <unicode/ucnv.h>
+
 
 static void
 rspamd_archive_dtor (gpointer p)
@@ -42,6 +45,79 @@ rspamd_archive_dtor (gpointer p)
 	g_ptr_array_free (arch->files, TRUE);
 }
 
+static GString *
+rspamd_archive_file_try_utf (const gchar *in, gsize inlen)
+{
+	const gchar *charset = NULL, *p, *end;
+	GString *res;
+	/* Returns a new GString (UTF-8 or sanitised ASCII), or NULL if conversion fails */
+	charset = rspamd_mime_charset_find_by_content (in, inlen);
+
+	if (charset) {
+		UChar *tmp; /* intermediate UTF-16 buffer, freed on every exit path */
+		UErrorCode uc_err = U_ZERO_ERROR;
+		gint32 r, clen, dlen;
+		struct rspamd_charset_converter *conv;
+		UConverter *utf8_converter;
+
+		conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+		utf8_converter = rspamd_get_utf8_converter ();
+
+		if (conv == NULL) {
+			msg_err ("cannot open converter for %s: %s",
+					charset, u_errorName (uc_err));
+
+			return NULL;
+		}
+
+		tmp = g_malloc (sizeof (*tmp) * (inlen + 1));
+		r = rspamd_converter_to_uchars (conv, tmp, inlen + 1,
+				in, inlen, &uc_err);
+		if (!U_SUCCESS (uc_err)) {
+			msg_err ("cannot convert data to unicode from %s: %s",
+					charset, u_errorName (uc_err));
+			g_free (tmp);
+
+			return NULL;
+		}
+
+		clen = ucnv_getMaxCharSize (utf8_converter);
+		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); /* worst-case output size */
+		res = g_string_sized_new (dlen);
+		r = ucnv_fromUChars (utf8_converter, res->str, dlen, tmp, r, &uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			msg_err ("cannot convert data from unicode from %s: %s",
+					charset, u_errorName (uc_err));
+			g_free (tmp);
+			g_string_free (res, TRUE);
+
+			return NULL;
+		}
+		g_free (tmp); /* the success path must release the scratch buffer too */
+		res->len = r;
+	}
+	else {
+		/* No charset detected: keep printable non-space ASCII, others become '?' */
+		res = g_string_sized_new (inlen);
+		p = in;
+		end = in + inlen;
+
+		while (p < end) {
+			if (g_ascii_isgraph (*p)) {
+				g_string_append_c (res, *p);
+			}
+			else {
+				g_string_append_c (res, '?');
+			}
+
+			p ++;
+		}
+	}
+
+	return res;
+}
+
 static void
 rspamd_archive_process_zip (struct rspamd_task *task,
 		struct rspamd_mime_part *part)
@@ -147,11 +223,17 @@ rspamd_archive_process_zip (struct rspamd_task *task,
 		}
 
 		f = g_malloc0 (sizeof (*f));
-		f->fname = g_string_new_len (cd + cd_basic_len, fname_len);
+		f->fname = rspamd_archive_file_try_utf (cd + cd_basic_len, fname_len);
 		f->compressed_size = comp_size;
 		f->uncompressed_size = uncomp_size;
-		g_ptr_array_add (arch->files, f);
-		msg_debug_task ("found file in zip archive: %v", f->fname);
+
+		if (f->fname) {
+			g_ptr_array_add (arch->files, f);
+			msg_debug_task ("found file in zip archive: %v", f->fname);
+		}
+		else {
+			g_free (f);
+		}
 
 		cd += fname_len + comment_len + extra_len + cd_basic_len;
 	}
@@ -1227,7 +1309,10 @@ rspamd_7zip_ucs2_to_utf8 (struct rspamd_task *task, const guchar *p,
 
 	while (src_pos < len) {
 		U16_NEXT (up, src_pos, len, wc);
-		U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error);
+
+		if (wc > 0) {
+			U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error);
+		}
 
 		if (is_error) {
 			g_string_free (res, TRUE);
@@ -1519,19 +1604,14 @@ rspamd_archive_process_gzip (struct rspamd_task *task,
 					struct rspamd_archive_file *f;
 
 					f = g_malloc0 (sizeof (*f));
-					f->fname = g_string_new (fname_start);
+					f->fname = rspamd_archive_file_try_utf (fname_start,
+							p - fname_start);
 
 					g_ptr_array_add (arch->files, f);
 
 					goto set;
 				}
 			}
-			else if (!g_ascii_isgraph (*p)) {
-				msg_debug_task ("gzip archive is invalid, bad filename at pos %d",
-						(int)(p - start));
-
-				return;
-			}
 
 			p ++;
 		}
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index d7ac5d416..213817747 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -98,7 +98,7 @@ rspamd_converter_dtor (gpointer p)
 	g_free (c);
 }
 
-static int32_t
+int32_t
 rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
 							UChar *dest,
 							int32_t destCapacity,
@@ -132,7 +132,7 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
 }
 
 
-static struct rspamd_charset_converter *
+struct rspamd_charset_converter *
 rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
 {
 	const gchar *canon_name;
@@ -497,8 +497,8 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
 	}
 }
 
-static const char *
-rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
+const char *
+rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
 {
 	static UCharsetDetector *csd;
 	const UCharsetMatch **csm, *sel = NULL;
@@ -524,7 +524,7 @@ rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
 detect:
 
 	ucsdet_setText (csd, in, inlen, &uc_err);
-	csm = ucsdet_detectAll(csd, &matches, &uc_err);
+	csm = ucsdet_detectAll (csd, &matches, &uc_err);
 
 	for (i = 0; i < matches; i ++) {
 		if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
index 5f436d99d..1a61339ca 100644
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -19,10 +19,12 @@
 #include "config.h"
 #include "mem_pool.h"
 #include "fstring.h"
+#include <unicode/uchar.h>
 
 struct rspamd_task;
 struct rspamd_mime_part;
 struct rspamd_mime_text_part;
+struct rspamd_charset_converter;
 
 /**
  * Convert charset to a valid iconv charset
@@ -87,5 +89,41 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  */
 void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
 
+/**
+ * Gets cached converter
+ * @param enc charset name to get a converter for
+ * @param err ICU error status (presumably set when no converter can be opened - confirm)
+ * @return converter for enc; callers treat a NULL result as failure
+ */
+struct rspamd_charset_converter *rspamd_mime_get_converter_cached (
+		const gchar *enc,
+		UErrorCode *err);
+
+/**
+ * Performs charset->utf16 conversion
+ * @param cnv converter, e.g. one returned by rspamd_mime_get_converter_cached
+ * @param dest output buffer of UChar
+ * @param destCapacity capacity of dest (presumably in UChar units - confirm)
+ * @param src input data in the charset handled by cnv
+ * @param srcLength length of src (presumably in bytes - confirm)
+ * @param pErrorCode ICU error status; callers test it with U_SUCCESS
+ * @return length of the converted text in dest (presumably in UChar units - confirm)
+ */
+gint32
+rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
+							UChar *dest,
+							gint32 destCapacity,
+							const char *src,
+							gint32 srcLength,
+							UErrorCode *pErrorCode);
+
+/**
+ * Detect charset in text
+ * @param in
+ * @param inlen
+ * @return detected charset name or NULL
+ */
+const char *rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen);
+
 
 #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */


More information about the Commits mailing list