commit 5dd1ccb: [Minor] Allow attaching URLs to MIME parts

Vsevolod Stakhov vsevolod at highsecure.ru
Tue May 5 14:56:07 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-05-05 14:59:33 +0100
URL: https://github.com/rspamd/rspamd/commit/5dd1ccbb33e53b9a3903970bbd623569869ad008

[Minor] Allow attaching URLs to MIME parts

---
 src/libmime/message.c     |  8 +++++++-
 src/libmime/message.h     |  1 +
 src/libmime/mime_parser.c |  2 ++
 src/libserver/html.c      | 36 +++++++++++++++++++++++++++---------
 src/libserver/html.h      |  3 ++-
 src/libserver/url.c       | 14 ++++++++++++--
 6 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index 4b00d2dd0..eec992552 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -758,7 +758,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
 			text_part->html,
 			text_part->utf_raw_content,
 			&text_part->exceptions,
-			MESSAGE_FIELD (task, urls));
+			MESSAGE_FIELD (task, urls),
+			text_part->mime_part->urls);
 
 	if (text_part->utf_content->len == 0) {
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
@@ -925,6 +926,7 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 	part->parsed_data.begin = start;
 	part->parsed_data.len = len;
 	part->part_number = MESSAGE_FIELD (task, parts)->len;
+	part->urls = g_ptr_array_new ();
 	part->raw_headers = rspamd_message_headers_new ();
 	part->headers_order = NULL;
 
@@ -1052,6 +1054,10 @@ rspamd_message_dtor (struct rspamd_message *msg)
 					LUA_REGISTRYINDEX,
 					p->specific.lua_specific.cbref);
 		}
+
+		if (p->urls) {
+			g_ptr_array_unref (p->urls);
+		}
 	}
 
 	PTR_ARRAY_FOREACH (msg->text_parts, i, tp) {
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 96ed9d5d4..a921d6f38 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -91,6 +91,7 @@ struct rspamd_mime_part {
 
 	struct rspamd_mime_header *headers_order;
 	struct rspamd_mime_headers_table *raw_headers;
+	GPtrArray *urls;
 
 	gchar *raw_headers_str;
 	gsize raw_headers_len;
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
index 590ee57d6..4fc37ad3d 100644
--- a/src/libmime/mime_parser.c
+++ b/src/libmime/mime_parser.c
@@ -683,6 +683,7 @@ rspamd_mime_parse_normal_part (struct rspamd_task *task,
 	}
 
 	part->part_number = MESSAGE_FIELD (task, parts)->len;
+	part->urls = g_ptr_array_new ();
 	g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
 	msg_debug_mime ("parsed data part %T/%T of length %z (%z orig), %s cte",
 			&part->ct->type, &part->ct->subtype, part->parsed_data.len,
@@ -1017,6 +1018,7 @@ rspamd_mime_parse_multipart_part (struct rspamd_task *task,
 	}
 
 	part->part_number = MESSAGE_FIELD (task, parts)->len;
+	part->urls = g_ptr_array_new ();
 	g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
 	st->nesting ++;
 	rspamd_mime_part_get_cte (task, part->raw_headers, part, FALSE);
diff --git a/src/libserver/html.c b/src/libserver/html.c
index f8c43bdd5..b916019d9 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1548,7 +1548,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 
 static struct rspamd_url *
 rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
-		struct html_content *hc)
+							 struct html_content *hc)
 {
 	struct html_tag_component *comp;
 	GList *cur;
@@ -1628,6 +1628,7 @@ struct rspamd_html_url_query_cbd {
 	rspamd_mempool_t *pool;
 	khash_t (rspamd_url_hash) *url_set;
 	struct rspamd_url *url;
+	GPtrArray *part_urls;
 };
 
 static gboolean
@@ -1651,14 +1652,18 @@ rspamd_html_url_query_callback (struct rspamd_url *url, gsize start_offset,
 					cbd->url->querylen, rspamd_url_query_unsafe (cbd->url));
 
 	url->flags |= RSPAMD_URL_FLAG_QUERY;
-	rspamd_url_set_add_or_increase (cbd->url_set, url);
+
+	if (rspamd_url_set_add_or_increase (cbd->url_set, url) && cbd->part_urls) {
+		g_ptr_array_add (cbd->part_urls, url);
+	}
 
 	return TRUE;
 }
 
 static void
 rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
-						 khash_t (rspamd_url_hash) *url_set)
+						 khash_t (rspamd_url_hash) *url_set,
+						 GPtrArray *part_urls)
 {
 	if (url->flags & RSPAMD_URL_FLAG_UNNORMALISED) {
 		url->flags |= RSPAMD_URL_FLAG_OBSCURED;
@@ -1670,12 +1675,17 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
 		qcbd.pool = pool;
 		qcbd.url_set = url_set;
 		qcbd.url = url;
+		qcbd.part_urls = part_urls;
 
 		rspamd_url_find_multiple(pool,
 				rspamd_url_query_unsafe (url), url->querylen,
 				RSPAMD_URL_FIND_ALL, NULL,
 				rspamd_html_url_query_callback, &qcbd);
 	}
+
+	if (part_urls) {
+		g_ptr_array_add (part_urls, url);
+	}
 }
 
 static void
@@ -1732,7 +1742,8 @@ rspamd_html_process_data_image (rspamd_mempool_t *pool,
 
 static void
 rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
-		struct html_content *hc, khash_t (rspamd_url_hash) *url_set)
+							 struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
+							 GPtrArray *part_urls)
 {
 	struct html_tag_component *comp;
 	struct html_image *img;
@@ -1778,7 +1789,11 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 
 						if (img->url) {
 							img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
-							rspamd_url_set_add_or_increase (url_set, img->url);
+
+							if (rspamd_url_set_add_or_increase (url_set, img->url) &&
+								part_urls) {
+								g_ptr_array_add (part_urls, img->url);
+							}
 						}
 					}
 				}
@@ -2603,7 +2618,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 							   struct html_content *hc,
 							   GByteArray *in,
 							   GList **exceptions,
-							   khash_t (rspamd_url_hash) *url_set)
+							   khash_t (rspamd_url_hash) *url_set,
+							   GPtrArray *part_urls)
 {
 	const guchar *p, *c, *end, *savep = NULL;
 	guchar t;
@@ -3067,7 +3083,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 
 							if (url_set != NULL) {
 								if (rspamd_url_set_add_or_increase (url_set, url)) {
-									rspamd_process_html_url (pool, url, url_set);
+									rspamd_process_html_url (pool, url, url_set,
+											part_urls);
 								}
 							}
 
@@ -3129,7 +3146,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 				}
 
 				if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
-					rspamd_html_process_img_tag (pool, cur_tag, hc, url_set);
+					rspamd_html_process_img_tag (pool, cur_tag, hc, url_set,
+							part_urls);
 				}
 				else if (cur_tag->flags & FL_BLOCK) {
 					struct html_block *bl;
@@ -3194,5 +3212,5 @@ rspamd_html_process_part (rspamd_mempool_t *pool,
 		struct html_content *hc,
 		GByteArray *in)
 {
-	return rspamd_html_process_part_full (pool, hc, in, NULL, NULL);
+	return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
 }
diff --git a/src/libserver/html.h b/src/libserver/html.h
index 72eac8d79..b319964ce 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -143,7 +143,8 @@ GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
 GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
 										   struct html_content *hc,
 										   GByteArray *in, GList **exceptions,
-										   khash_t (rspamd_url_hash) *url_set);
+										   khash_t (rspamd_url_hash) *url_set,
+										   GPtrArray *part_urls);
 
 /*
  * Returns true if a specified tag has been seen in a part
diff --git a/src/libserver/url.c b/src/libserver/url.c
index a47d732f7..c10073dcb 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -3296,7 +3296,13 @@ rspamd_url_query_callback (struct rspamd_url *url, gsize start_offset,
 	}
 
 	url->flags |= RSPAMD_URL_FLAG_QUERY;
-	rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);
+
+
+	if (rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url)) {
+		if (cbd->part && cbd->part->mime_part->urls) {
+			g_ptr_array_add (cbd->part->mime_part->urls, url);
+		}
+	}
 
 	return TRUE;
 }
@@ -3347,7 +3353,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 	}
 
 	url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
-	rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);
+
+	if (rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url) &&
+			cbd->part->mime_part->urls) {
+		g_ptr_array_add (cbd->part->mime_part->urls, url);
+	}
 
 	cbd->part->exceptions = g_list_prepend (
 			cbd->part->exceptions,


More information about the Commits mailing list