commit a2eb042: [Minor] Strip visible parts of urls using utf rules

Vsevolod Stakhov vsevolod at highsecure.ru
Fri May 14 16:00:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-14 16:59:30 +0100
URL: https://github.com/rspamd/rspamd/commit/a2eb042dcd36228b9e0a6d1417c54032489d91ff (HEAD -> master)

[Minor] Strip visible parts of urls using utf rules

---
 src/libserver/html.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 326c8facc..30c2c022b 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -2617,8 +2617,43 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
 	rspamd_strlcpy (url->visible_part, dest->data + href_offset,
 			dest->len - href_offset + 1);
 	dlen = dest->len - href_offset;
-	url->visible_part =
-			(gchar *)rspamd_string_len_strip (url->visible_part, &dlen, " \t\v\r\n");
+
+	/* Strip unicode spaces from the start and the end */
+	gchar *p = url->visible_part, *end = url->visible_part + dlen;
+	gint i = 0;
+
+	while (i < dlen) {
+		UChar32 uc;
+		gint prev_i = i;
+
+		U8_NEXT(p, i, dlen, uc);
+
+		if (!u_isspace (uc)) {
+			i = prev_i;
+			break;
+		}
+	}
+
+	p += i;
+	dlen -= i;
+	url->visible_part = p;
+	i = end - url->visible_part - 1;
+
+	if (i > 0) {
+		gint32 dl = dlen;
+
+		while (i > 0) {
+			UChar32 uc;
+
+			U8_PREV(p, i, dl, uc);
+
+			if (!u_isspace (uc)) {
+				break;
+			}
+		}
+
+		dlen = i;
+	}
 
 
 	rspamd_html_url_is_phished (pool, url,


More information about the Commits mailing list