commit 141617d: [Rework] Html: Add images processing logic

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jun 2 19:56:07 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-01 20:18:47 +0100
URL: https://github.com/rspamd/rspamd/commit/141617dab47bf741af4578c656d4cda5f18742ed

[Rework] Html: Add images processing logic

---
 src/libmime/images.c        | 49 +++++++++++++++++----------------------------
 src/libserver/html/html.cxx | 32 +++++++++++++++++++++++++++++
 src/libserver/html/html.h   | 10 +++++++++
 3 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/src/libmime/images.c b/src/libmime/images.c
index 960036d78..4e0872f38 100644
--- a/src/libmime/images.c
+++ b/src/libmime/images.c
@@ -658,8 +658,8 @@ rspamd_image_process_part (struct rspamd_task *task, struct rspamd_mime_part *pa
 	struct rspamd_mime_header *rh;
 	struct rspamd_mime_text_part *tp;
 	struct html_image *himg;
-	const gchar *cid, *html_cid;
-	guint cid_len, i, j;
+	const gchar *cid;
+	guint cid_len, i;
 	struct rspamd_image *img;
 
 	img = (struct rspamd_image *)part->specific.img;
@@ -684,35 +684,22 @@ rspamd_image_process_part (struct rspamd_task *task, struct rspamd_mime_part *pa
 				}
 
 				PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, tp) {
-					if (IS_TEXT_PART_HTML (tp) && tp->html != NULL &&
-						tp->html->images != NULL) {
-						for (j = 0; j < tp->html->images->len; j ++) {
-							himg = g_ptr_array_index (tp->html->images, j);
-
-							if ((himg->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED) &&
-								himg->src) {
-								html_cid = himg->src;
-
-								if (strncmp (html_cid, "cid:", 4) == 0) {
-									html_cid += 4;
-								}
-
-								if (strlen (html_cid) == cid_len &&
-									memcmp (html_cid, cid, cid_len) == 0) {
-									img->html_image = himg;
-									himg->embedded_image = img;
-
-									msg_debug_images ("found linked image by cid: <%s>",
-											cid);
-
-									if (himg->height == 0) {
-										himg->height = img->height;
-									}
-
-									if (himg->width == 0) {
-										himg->width = img->width;
-									}
-								}
+					if (IS_TEXT_PART_HTML (tp) && tp->html != NULL) {
+						himg = rspamd_html_find_embedded_image(tp->html, cid, cid_len);
+
+						if (himg != NULL) {
+							img->html_image = himg;
+							himg->embedded_image = img;
+
+							msg_debug_images ("found linked image by cid: <%s>",
+									cid);
+
+							if (himg->height == 0) {
+								himg->height = img->height;
+							}
+
+							if (himg->width == 0) {
+								himg->width = img->width;
 							}
 						}
 					}
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 45a9afa18..00f1d331f 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -2277,6 +2277,23 @@ html_process_part_full (rspamd_mempool_t *pool,
 	return hc;
 }
 
+static auto
+html_find_image_by_cid(const html_content &hc, std::string_view cid)
+	-> std::optional<const html_image *>
+{
+	for (const auto *html_image : hc.images) {
+		/* Filter embedded images */
+		if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED &&
+				html_image->src != nullptr) {
+			if (cid == html_image->src) {
+				return html_image;
+			}
+		}
+	}
+
+	return std::nullopt;
+}
+
 }
 
 void *
@@ -2355,4 +2372,19 @@ rspamd_html_tag_name(void *p, gsize *len)
 	}
 
 	return tag->name.data();
+}
+
+struct html_image*
+rspamd_html_find_embedded_image(void *html_content,
+								const char *cid, gsize cid_len)
+{
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+	auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
+
+	if (maybe_img) {
+		return (html_image *)maybe_img.value();
+	}
+
+	return nullptr;
 }
\ No newline at end of file
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 94063b9be..1e71d0c2d 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -144,6 +144,16 @@ gint rspamd_html_tag_by_name(const gchar *name);
  */
 const gchar *rspamd_html_tag_name(void *tag, gsize *len);
 
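+/**
+ * Find an embedded HTML image whose source equals the given content id
+ * @param html_content opaque pointer to the parsed HTML content to search
+ * @param cid content id to look for (length is given by cid_len)
+ * @param cid_len length of cid in bytes
+ * @return the first matching embedded image or NULL if none matches
+ */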
+struct html_image* rspamd_html_find_embedded_image(void *html_content,
+		const char *cid, gsize cid_len);
+
 
 #ifdef  __cplusplus
 }


More information about the Commits mailing list