commit ec439ea: [Feature] Core: Detect charset in archived files
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Dec 28 08:21:03 UTC 2018
Author: Vsevolod Stakhov
Date: 2018-12-28 07:53:42 +0000
URL: https://github.com/rspamd/rspamd/commit/ec439ea252e82a6500bbc589d09599ce8db46fdf
[Feature] Core: Detect charset in archived files
---
src/libmime/archives.c | 102 +++++++++++++++++++++++++++++++++++++++-----
src/libmime/mime_encoding.c | 10 ++---
src/libmime/mime_encoding.h | 38 +++++++++++++++++
3 files changed, 134 insertions(+), 16 deletions(-)
diff --git a/src/libmime/archives.c b/src/libmime/archives.c
index 183232e6f..8497fdf70 100644
--- a/src/libmime/archives.c
+++ b/src/libmime/archives.c
@@ -18,9 +18,12 @@
#include "message.h"
#include "task.h"
#include "archives.h"
+#include "libmime/mime_encoding.h"
#include <unicode/uchar.h>
#include <unicode/utf8.h>
#include <unicode/utf16.h>
+#include <unicode/ucnv.h>
+
static void
rspamd_archive_dtor (gpointer p)
@@ -42,6 +45,79 @@ rspamd_archive_dtor (gpointer p)
g_ptr_array_free (arch->files, TRUE);
}
+/**
+ * Builds a printable file name for an archive member from raw header bytes.
+ *
+ * When rspamd_mime_charset_find_by_content() reports a charset, the input is
+ * converted charset -> UTF-16 -> UTF-8. Otherwise each byte that is not an
+ * ASCII graphic character is replaced by '?'.
+ *
+ * @param in raw file name bytes; the length is taken from inlen
+ * @param inlen number of bytes in in
+ * @return newly allocated GString that the caller must free, or NULL if the
+ * converter could not be obtained or one of the conversion steps failed
+ */
+static GString *
+rspamd_archive_file_try_utf (const gchar *in, gsize inlen)
+{
+	const gchar *charset = NULL, *p, *end;
+	GString *res;
+
+	charset = rspamd_mime_charset_find_by_content (in, inlen);
+
+	if (charset) {
+		UChar *tmp;
+		UErrorCode uc_err = U_ZERO_ERROR;
+		gint32 r, clen, dlen;
+		struct rspamd_charset_converter *conv;
+		UConverter *utf8_converter;
+
+		conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+		utf8_converter = rspamd_get_utf8_converter ();
+
+		if (conv == NULL) {
+			msg_err ("cannot open converter for %s: %s",
+					charset, u_errorName (uc_err));
+
+			return NULL;
+		}
+
+		/* Scratch UTF-16 buffer, sized in UChars: one per input byte plus one */
+		tmp = g_malloc (sizeof (*tmp) * (inlen + 1));
+		r = rspamd_converter_to_uchars (conv, tmp, inlen + 1,
+				in, inlen, &uc_err);
+		if (!U_SUCCESS (uc_err)) {
+			msg_err ("cannot convert data to unicode from %s: %s",
+					charset, u_errorName (uc_err));
+			g_free (tmp);
+
+			return NULL;
+		}
+
+		clen = ucnv_getMaxCharSize (utf8_converter);
+		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+		res = g_string_sized_new (dlen);
+		r = ucnv_fromUChars (utf8_converter, res->str, dlen, tmp, r, &uc_err);
+
+		/* The UTF-16 scratch buffer is not needed past this point on any path */
+		g_free (tmp);
+
+		if (!U_SUCCESS (uc_err)) {
+			msg_err ("cannot convert data from unicode from %s: %s",
+					charset, u_errorName (uc_err));
+			g_string_free (res, TRUE);
+
+			return NULL;
+		}
+
+		res->len = r;
+		/*
+		 * ICU may report U_STRING_NOT_TERMINATED_WARNING (still a success)
+		 * when the output fills the buffer exactly, so terminate explicitly.
+		 * g_string_sized_new (dlen) reserves room for dlen bytes plus the
+		 * terminator and r <= dlen here.
+		 */
+		res->str[r] = '\0';
+	}
+	else {
+		/* Convert unsafe characters to '?' */
+		res = g_string_sized_new (inlen);
+		p = in;
+		end = in + inlen;
+
+		while (p < end) {
+			if (g_ascii_isgraph (*p)) {
+				g_string_append_c (res, *p);
+			}
+			else {
+				g_string_append_c (res, '?');
+			}
+
+			p ++;
+		}
+	}
+
+	return res;
+}
+
static void
rspamd_archive_process_zip (struct rspamd_task *task,
struct rspamd_mime_part *part)
@@ -147,11 +223,17 @@ rspamd_archive_process_zip (struct rspamd_task *task,
}
f = g_malloc0 (sizeof (*f));
- f->fname = g_string_new_len (cd + cd_basic_len, fname_len);
+ f->fname = rspamd_archive_file_try_utf (cd + cd_basic_len, fname_len);
f->compressed_size = comp_size;
f->uncompressed_size = uncomp_size;
- g_ptr_array_add (arch->files, f);
- msg_debug_task ("found file in zip archive: %v", f->fname);
+
+ if (f->fname) {
+ g_ptr_array_add (arch->files, f);
+ msg_debug_task ("found file in zip archive: %v", f->fname);
+ }
+ else {
+ g_free (f);
+ }
cd += fname_len + comment_len + extra_len + cd_basic_len;
}
@@ -1227,7 +1309,10 @@ rspamd_7zip_ucs2_to_utf8 (struct rspamd_task *task, const guchar *p,
while (src_pos < len) {
U16_NEXT (up, src_pos, len, wc);
- U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error);
+
+ if (wc > 0) {
+ U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error);
+ }
if (is_error) {
g_string_free (res, TRUE);
@@ -1519,19 +1604,14 @@ rspamd_archive_process_gzip (struct rspamd_task *task,
struct rspamd_archive_file *f;
f = g_malloc0 (sizeof (*f));
- f->fname = g_string_new (fname_start);
+ f->fname = rspamd_archive_file_try_utf (fname_start,
+ p - fname_start);
g_ptr_array_add (arch->files, f);
goto set;
}
}
- else if (!g_ascii_isgraph (*p)) {
- msg_debug_task ("gzip archive is invalid, bad filename at pos %d",
- (int)(p - start));
-
- return;
- }
p ++;
}
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index d7ac5d416..213817747 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -98,7 +98,7 @@ rspamd_converter_dtor (gpointer p)
g_free (c);
}
-static int32_t
+int32_t
rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
UChar *dest,
int32_t destCapacity,
@@ -132,7 +132,7 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
}
-static struct rspamd_charset_converter *
+struct rspamd_charset_converter *
rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
{
const gchar *canon_name;
@@ -497,8 +497,8 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
}
}
-static const char *
-rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
+const char *
+rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
{
static UCharsetDetector *csd;
const UCharsetMatch **csm, *sel = NULL;
@@ -524,7 +524,7 @@ rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
detect:
ucsdet_setText (csd, in, inlen, &uc_err);
- csm = ucsdet_detectAll(csd, &matches, &uc_err);
+ csm = ucsdet_detectAll (csd, &matches, &uc_err);
for (i = 0; i < matches; i ++) {
if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
index 5f436d99d..1a61339ca 100644
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -19,10 +19,12 @@
#include "config.h"
#include "mem_pool.h"
#include "fstring.h"
+#include <unicode/uchar.h>
struct rspamd_task;
struct rspamd_mime_part;
struct rspamd_mime_text_part;
+struct rspamd_charset_converter;
/**
* Convert charset to a valid iconv charset
@@ -87,5 +89,41 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
*/
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
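+/**
+ * Gets a cached converter for the given charset
+ * @param enc charset name to look up
+ * @param err ICU error code; callers inspect it when NULL is returned
+ * @return converter for enc, or NULL if it cannot be opened
+ */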
+struct rspamd_charset_converter *rspamd_mime_get_converter_cached (
+ const gchar *enc,
+ UErrorCode *err);
+
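+/**
+ * Performs charset->utf16 conversion
+ * @param cnv converter for the source charset
+ * @param dest output buffer of UChars
+ * @param destCapacity capacity of dest, counted in UChars
+ * @param src input bytes in the source charset
+ * @param srcLength length of src in bytes
+ * @param pErrorCode ICU error code; callers test it with U_SUCCESS
+ * @return length of the converted text in UChars (callers pass it on as the
+ * UTF-16 source length)
+ */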
+gint32
+rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
+ UChar *dest,
+ gint32 destCapacity,
+ const char *src,
+ gint32 srcLength,
+ UErrorCode *pErrorCode);
+
+/**
+ * Detect charset in text
+ * @param in
+ * @param inlen
+ * @return detected charset name or NULL
+ */
+const char *rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen);
+
#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
More information about the Commits
mailing list