commit 8a4c5ca: [Feature] Core: Normalise zero-width spaces in urls

Wed Jan 16 15:07:06 UTC 2019

Author: Vsevolod Stakhov
Date: 2019-01-16 15:04:27 +0000
URL: https://github.com/rspamd/rspamd/commit/8a4c5ca57ae66ad31f64c348d15d6b22112c8dad

[Feature] Core: Normalise zero-width spaces in urls

---
 src/libutil/str_util.c | 81 +++++++++++++++++++++++++++++++++++++++-----------
 src/libutil/str_util.h | 10 ++++++-
 2 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 27d50aead..2016808cf 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2420,7 +2420,7 @@ rspamd_get_unicode_normalizer (void)
 }
 
 
-gboolean
+enum rspamd_normalise_result
 rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 		guint *len)
 {
@@ -2430,7 +2430,8 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
 	gint32 nsym, end;
 	UChar *src = NULL, *dest = NULL;
-	gboolean ret = FALSE;
+	enum rspamd_normalise_result ret = 0;
+	gboolean has_invisible = FALSE;
 
 	/* We first need to convert data to UChars :( */
 	src = g_malloc ((*len + 1) * sizeof (*src));
@@ -2440,6 +2441,7 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	if (!U_SUCCESS (uc_err)) {
 		msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
 				u_errorName (uc_err));
+		ret |= RSPAMD_UNICODE_NORM_ERROR;
 		goto out;
 	}
 
@@ -2449,36 +2451,81 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	if (!U_SUCCESS (uc_err)) {
 		msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
 				u_errorName (uc_err));
+		ret |= RSPAMD_UNICODE_NORM_ERROR;
 		goto out;
 	}
 
-	if (end == nsym) {
-		/* No normalisation needed */
+	for (gint32 i = 0; i < nsym; i ++) {
+		if (IS_ZERO_WIDTH_SPACE (src[i])) {
+			has_invisible = TRUE;
+			break;
+		}
+	}
+
+	uc_err = U_ZERO_ERROR;
+
+	if (end != nsym) {
+		/* Normalisation is needed; invisible spaces (if any) are filtered afterwards */
+		/* We copy sub(src, 0, end) to dest and normalise the rest */
+		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+		dest = g_malloc (nsym * sizeof (*dest));
+		memcpy (dest, src, end * sizeof (*dest));
+		nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
+				src + end, nsym - end, &uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+				msg_warn_pool_check ("cannot normalise URL: %s",
+						u_errorName (uc_err));
+				ret |= RSPAMD_UNICODE_NORM_ERROR;
+			}
+
+			goto out;
+		}
+	}
+	else if (!has_invisible) {
 		goto out;
 	}
+	else {
+		dest = src;
+		src = NULL;
+	}
 
-	/* We copy sub(src, 0, end) to dest and normalise the rest */
-	ret = TRUE;
-	dest = g_malloc (nsym * sizeof (*dest));
-	memcpy (dest, src, end * sizeof (*dest));
-	nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
-			src + end, nsym - end, &uc_err);
+	if (has_invisible) {
+		/* Also filter zero width spaces */
+		gint32 new_len = 0;
+		UChar *t = dest, *h = dest;
 
-	if (!U_SUCCESS (uc_err)) {
-		if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
-			msg_warn_pool_check ("cannot normalise URL: %s",
-					u_errorName (uc_err));
+		ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+
+		for (gint32 i = 0; i < nsym; i ++) {
+			if (!IS_ZERO_WIDTH_SPACE (*h)) {
+				*t++ = *h++;
+				new_len ++;
+			}
+			else {
+				h ++;
+			}
 		}
 
-		goto out;
+		nsym = new_len;
 	}
 
 	/* We now convert it back to utf */
 	nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
 
 	if (!U_SUCCESS (uc_err)) {
-		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
-				u_errorName (uc_err));
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
+					   " input length: %d chars, unicode length: %d utf16 symbols",
+				u_errorName (uc_err), (gint)*len, (gint)nsym);
+
+		if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+			ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
+		}
+		else {
+			ret |= RSPAMD_UNICODE_NORM_ERROR;
+		}
+
 		goto out;
 	}
 
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 742d34184..059665388 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -402,6 +402,14 @@ struct UConverter *rspamd_get_utf8_converter (void);
 struct UNormalizer2;
 const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
 
+enum rspamd_normalise_result { /* bit flags OR-ed together by rspamd_normalise_unicode_inplace */
+	RSPAMD_UNICODE_NORM_NORMAL = 0, /* no flag set: input already normalised and no error occurred */
+	RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), /* input was not fully normalised, normalisation was attempted */
+	RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), /* zero-width spaces were found and filtered out */
+	RSPAMD_UNICODE_NORM_ERROR = (1 << 2), /* a conversion or normalisation call failed */
+	RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) /* conversion back to UTF-8 reported a buffer overflow */
+};
+
 /**
  * Gets a string in UTF8 and normalises it to NFKC_Casefold form
  * @param pool optional memory pool used for logging purposes
@@ -409,7 +417,7 @@ const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
  * @param len
  * @return TRUE if a string has been normalised
  */
-gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
 		gchar *start, guint *len);
 
 enum rspamd_regexp_escape_flags {