commit a59ff17: [Regression] Fix urls output in the protocol

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Feb 4 16:42:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-02-04 16:35:21 +0000
URL: https://github.com/rspamd/rspamd/commit/a59ff17c8be4d6ebe6db7b6fc2cebf724c4fd865 (HEAD -> master)

[Regression] Fix urls output in the protocol

---
 src/libserver/protocol.c | 67 +++++++++++++++++++++++++++++++++---------------
 src/libserver/url.c      | 48 +++++++++++++++++++++++++++++-----
 src/libserver/url.h      |  3 +++
 3 files changed, 91 insertions(+), 27 deletions(-)

diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index ba468ee5f..5bcfbc37a 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -435,14 +435,17 @@ rspamd_protocol_handle_headers (struct rspamd_task *task,
 					}
 				}
 				IF_HEADER (URLS_HEADER) {
+					msg_debug_protocol ("read urls header, value: %V", hv);
+
 					srch.begin = "extended";
 					srch.len = 8;
 
-					msg_debug_protocol ("read urls header, value: %V", hv);
 					if (rspamd_ftok_casecmp (hv_tok, &srch) == 0) {
 						task->flags |= RSPAMD_TASK_FLAG_EXT_URLS;
 						msg_debug_protocol ("extended urls information");
 					}
+
+					/* TODO: add more formats here */
 				}
 				IF_HEADER (USER_AGENT_HEADER) {
 					msg_debug_protocol ("read user-agent header, value: %V", hv);
@@ -665,6 +668,7 @@ rspamd_protocol_handle_request (struct rspamd_task *task,
 /* Structure for writing tree data */
 struct tree_cb_data {
 	ucl_object_t *top;
+	GHashTable *seen;	/* URLs already emitted in plain (non-extended) output, keyed by host, to skip duplicates */
 	struct rspamd_task *task;
 };
 
@@ -715,17 +719,37 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 	struct rspamd_url *url = value;
 	ucl_object_t *obj;
 	struct rspamd_task *task = cb->task;
-	const gchar *user_field = "unknown", *encoded;
+	const gchar *user_field = "unknown", *encoded = NULL;
 	gboolean has_user = FALSE;
 	guint len = 0;
-	gsize enclen;
-
-	encoded = rspamd_url_encode (url, &enclen, task->task_pool);
+	gsize enclen = 0;
 
 	if (!(task->flags & RSPAMD_TASK_FLAG_EXT_URLS)) {
-		obj = ucl_object_fromlstring (encoded, enclen);
+		if (url->hostlen > 0) {
+			if (g_hash_table_lookup (cb->seen, url)) {
+				return;
+			}
+
+			const gchar *end = NULL;
+
+			if (g_utf8_validate (url->host, url->hostlen, &end)) {
+				obj = ucl_object_fromlstring (url->host, url->hostlen);
+			}
+			else if (end - url->host > 0) {
+				obj = ucl_object_fromlstring (url->host, end - url->host);
+			}
+			else {
+				return;
+			}
+		}
+		else {
+			return;
+		}
+
+		g_hash_table_insert (cb->seen, url, url);
 	}
 	else {
+		encoded = rspamd_url_encode (url, &enclen, task->task_pool);
 		obj = rspamd_protocol_extended_url (task, url, encoded, enclen);
 	}
 
@@ -742,6 +766,10 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 			len = task->from_envelope->addr_len;
 		}
 
+		if (!encoded) {
+			encoded = rspamd_url_encode (url, &enclen, task->task_pool);
+		}
+
 		msg_notice_task_encrypted ("<%s> %s: %*s; ip: %s; URL: %*s",
 			task->message_id,
 			has_user ? "user" : "from",
@@ -760,9 +788,12 @@ rspamd_urls_tree_ucl (GHashTable *input, struct rspamd_task *task)
 	obj = ucl_object_typed_new (UCL_ARRAY);
 	cb.top = obj;
 	cb.task = task;
+	cb.seen = g_hash_table_new (rspamd_url_host_hash, rspamd_urls_host_cmp);
 
 	g_hash_table_foreach (input, urls_protocol_cb, &cb);
 
+	g_hash_table_unref (cb.seen);
+
 	return obj;
 }
 
@@ -1168,18 +1199,16 @@ rspamd_protocol_write_ucl (struct rspamd_task *task,
 	}
 
 	if (flags & RSPAMD_PROTOCOL_URLS) {
-		if (task->flags & RSPAMD_TASK_FLAG_EXT_URLS) {
-			if (g_hash_table_size (task->urls) > 0) {
-				ucl_object_insert_key (top,
-						rspamd_urls_tree_ucl (task->urls, task),
-						"urls", 0, false);
-			}
+		if (g_hash_table_size (task->urls) > 0) {
+			ucl_object_insert_key (top,
+					rspamd_urls_tree_ucl (task->urls, task),
+					"urls", 0, false);
+		}
 
-			if (g_hash_table_size (task->emails) > 0) {
-				ucl_object_insert_key (top,
-						rspamd_emails_tree_ucl (task->emails, task),
-						"emails", 0, false);
-			}
+		if (g_hash_table_size (task->emails) > 0) {
+			ucl_object_insert_key (top,
+					rspamd_emails_tree_ucl (task->emails, task),
+					"emails", 0, false);
 		}
 	}
 
@@ -1279,9 +1308,7 @@ rspamd_protocol_http_reply (struct rspamd_http_message *msg,
 		rspamd_http_message_add_header (msg, hn->begin, hv->begin);
 	}
 
-	if (task->cfg->log_urls || (task->flags & RSPAMD_TASK_FLAG_EXT_URLS)) {
-		flags |= RSPAMD_PROTOCOL_URLS;
-	}
+	flags |= RSPAMD_PROTOCOL_URLS;
 
 	top = rspamd_protocol_write_ucl (task, flags);
 
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 4599f3ce1..4dcd11c9e 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2971,15 +2971,26 @@ guint
 rspamd_url_hash (gconstpointer u)
 {
 	const struct rspamd_url *url = u;
-	rspamd_cryptobox_fast_hash_state_t st;
-
-	rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
 
 	if (url->urllen > 0) {
-		rspamd_cryptobox_fast_hash_update (&st, url->string, url->urllen);
+		return rspamd_cryptobox_fast_hash (url->string, url->urllen,
+				rspamd_hash_seed ());
 	}
 
-	return rspamd_cryptobox_fast_hash_final (&st);
+	return 0;
+}
+
+/* GHashTable hash callback: hashes only the host bytes of a URL, 0 if no host */
+guint
+rspamd_url_host_hash (gconstpointer u)
+{
+	const struct rspamd_url *item = u;
+
+	if (!(item->hostlen > 0)) {
+		return 0;
+	}
+	return rspamd_cryptobox_fast_hash (item->host, item->hostlen,
+			rspamd_hash_seed ());
+}
 
 guint
@@ -3045,6 +3056,22 @@ rspamd_urls_cmp (gconstpointer a, gconstpointer b)
 	return r == 0;
 }
 
+/* GHashTable equality callback: TRUE iff both URLs have byte-identical hosts */
+gboolean
+rspamd_urls_host_cmp (gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_url *u1 = a, *u2 = b;
+
+	if (u1->hostlen != u2->hostlen) {
+		return FALSE;
+	}
+	/* Empty hosts compare equal; skip memcmp as host may be NULL in that case */
+	if (u1->hostlen == 0) {
+		return TRUE;
+	}
+	return memcmp (u1->host, u2->host, u1->hostlen) == 0;
+}
+
 gsize
 rspamd_url_decode (gchar *dst, const gchar *src, gsize size)
 {
@@ -3255,8 +3282,15 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
 	dest = rspamd_mempool_alloc (pool, dlen + 1);
 	d = dest;
 	dend = d + dlen;
-	d += rspamd_snprintf ((gchar *)d, dend - d,
-			"%*s://", url->protocollen, rspamd_url_protocols[url->protocol].name);
+
+	if (url->protocollen > 0 &&
+		(url->protocol >= 0 && url->protocol < G_N_ELEMENTS (rspamd_url_protocols))) {
+		d += rspamd_snprintf ((gchar *) d, dend - d,
+				"%*s://", url->protocollen, rspamd_url_protocols[url->protocol].name);
+	}
+	else {
+		d += rspamd_snprintf ((gchar *) d, dend - d, "http://");
+	}
 
 	if (url->userlen > 0) {
 		ENCODE_URL_COMPONENT ((guchar *)url->user, url->userlen,
diff --git a/src/libserver/url.h b/src/libserver/url.h
index fa5c69f00..523fb2c1f 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -203,12 +203,15 @@ void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
 
 guint rspamd_url_hash (gconstpointer u);
 guint rspamd_email_hash (gconstpointer u);
+/* Hash a URL by its host part only (pairs with rspamd_urls_host_cmp) */
+guint rspamd_url_host_hash (gconstpointer u);
 
 /* Compare two emails for building emails hash */
 gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);
 
 /* Compare two urls for building emails hash */
 gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);
+gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b);
 
 /**
  * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated


More information about the Commits mailing list