commit b28a5df: [Feature] Core: Process data urls for images

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Jan 15 12:56:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-01-15 11:26:20 +0000
URL: https://github.com/rspamd/rspamd/commit/b28a5df9446c5e5ed71a51a71bab2b6e108e91c2

[Feature] Core: Process data urls for images

---
 src/libserver/html.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 9bf15e0a2..e8856db35 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -23,6 +23,7 @@
 #include "html_entities.h"
 #include "url.h"
 #include "contrib/libucl/khash.h"
+#include "libmime/images.h"
 
 #include <unicode/uversion.h>
 #include <unicode/ucnv.h>
@@ -1482,6 +1483,58 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
 	}
 }
 
+static void
+rspamd_html_process_data_image (rspamd_mempool_t *pool,
+								struct html_image *img,
+								struct html_tag_component *src)
+{
+	/*
+	 * Here, we do very basic processing of the data:
+	 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
+	 * We only parse base64 encoded data.
+	 * We ignore content type so far
+	 */
+	struct rspamd_image *parsed_image;
+	const gchar *semicolon_pos = NULL, *end = src->start + src->len;
+
+	semicolon_pos = src->start;
+
+	while ((semicolon_pos = memchr (semicolon_pos, ';', end - semicolon_pos)) != NULL) {
+		if (end - semicolon_pos > sizeof ("base64,")) {
+			if (memcmp (semicolon_pos + 1, "base64,", sizeof ("base64,") - 1) == 0) {
+				const gchar *data_pos = semicolon_pos + sizeof ("base64,");
+				gchar *decoded;
+				gsize encoded_len = end - data_pos, decoded_len;
+				rspamd_ftok_t inp;
+
+				decoded_len = (encoded_len / 4 * 3) + 12;
+				decoded = rspamd_mempool_alloc (pool, decoded_len);
+				rspamd_cryptobox_base64_decode (data_pos, encoded_len,
+						decoded, &decoded_len);
+				inp.begin = decoded;
+				inp.len = decoded_len;
+
+				parsed_image = rspamd_maybe_process_image (pool, &inp);
+
+				if (parsed_image) {
+					msg_debug_html ("detected %s image of size %ud x %ud in data url",
+							rspamd_image_type_str (parsed_image->type),
+							parsed_image->width, parsed_image->height);
+					img->embedded_image = parsed_image;
+				}
+			}
+
+			break;
+		}
+		else {
+			/* Nothing useful */
+			return;
+		}
+
+		semicolon_pos ++;
+	}
+}
+
 static void
 rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 		struct html_content *hc)
@@ -1517,7 +1570,7 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 				/* We have an embedded image in HTML tag */
 				img->flags |=
 						(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED|RSPAMD_HTML_FLAG_IMAGE_DATA);
-
+				rspamd_html_process_data_image (pool, img, comp);
 			}
 			else {
 				img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
@@ -1593,6 +1646,15 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 				hc->images);
 	}
 
+	if (img->embedded_image) {
+		if (!seen_height) {
+			img->height = img->embedded_image->height;
+		}
+		if (!seen_width) {
+			img->width = img->embedded_image->width;
+		}
+	}
+
 	g_ptr_array_add (hc->images, img);
 	tag->extra = img;
 }


More information about the Commits mailing list