commit c234e5b: [Rework] Rewrite rspamd_str_make_utf_valid function

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Nov 15 18:56:11 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-11-15 17:27:42 +0000
URL: https://github.com/rspamd/rspamd/commit/c234e5bc9c8b19625009d3925f37875e5fa820d4

[Rework] Rewrite rspamd_str_make_utf_valid function

---
 src/libutil/str_util.c | 115 +++++++++++++++++++++++++++++++++++--------------
 src/libutil/str_util.h |   2 +-
 2 files changed, 83 insertions(+), 34 deletions(-)

diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 90924f8d1..dd1b139d8 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2935,7 +2935,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 
 	if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
 		if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
-			tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
+			tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL, NULL);
 		}
 	}
 
@@ -3052,61 +3052,110 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 
 
 gchar *
-rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen)
+rspamd_str_make_utf_valid (const guchar *src, gsize slen,
+		gsize *dstlen,
+		rspamd_mempool_t *pool)
 {
-	GString *dst;
-	const gchar *last;
-	gchar *dchar;
-	gsize valid, prev;
 	UChar32 uc;
-	gint32 i;
+	goffset err_offset;
+	const guchar *p;
+	gchar *dst, *d;
+	gsize remain = slen, dlen = 0;
 
 	if (src == NULL) {
 		return NULL;
 	}
 
 	if (slen == 0) {
-		slen = strlen (src);
+		return NULL;
 	}
 
-	dst = g_string_sized_new (slen);
-	i = 0;
-	last = src;
-	valid = 0;
-	prev = 0;
+	p = src;
+	dlen = slen;
 
-	while (i < slen) {
-		U8_NEXT (src, i, slen, uc);
+	/* Check space required */
+	while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) {
+		gint i = 0;
 
-		if (uc <= 0) {
-			if (valid > 0) {
-				g_string_append_len (dst, last, valid);
+		p += err_offset;
+		remain -= err_offset;
+		dlen += err_offset;
+
+		/* Each invalid character of input requires 3 bytes of output */
+		while (i < remain) {
+			gint old_i = i;
+			U8_NEXT (p, i, remain, uc);
+
+			if (uc < 0) {
+				dlen += 3;
+			}
+			else {
+				p += old_i;
+				remain -= old_i;
+				break;
 			}
-			/* 0xFFFD in UTF8 */
-			g_string_append_len (dst, "\357\277\275", 3);
-			valid = 0;
-			last = &src[i];
-		}
-		else {
-			valid += i - prev;
 		}
+	}
 
-		prev = i;
+	if (pool) {
+		dst = rspamd_mempool_alloc (pool, dlen + 1);
+	}
+	else {
+		dst = g_malloc (dlen + 1);
 	}
 
-	if (valid > 0) {
-		g_string_append_len (dst, last, valid);
+	p = src;
+	d = dst;
+	remain = slen;
+
+	while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) {
+		/* Copy valid */
+		memcpy (d, p, err_offset);
+		d += err_offset;
+
+		/* Append 0xFFFD for each bad character */
+		gint i = 0;
+
+		p += err_offset;
+		remain -= err_offset;
+
+		while (i < remain) {
+			gint old_i = i;
+			U8_NEXT (p, i, remain, uc);
+
+			if (uc < 0) {
+				*d++ = '\357';
+				*d++ = '\277';
+				*d++ = '\275';
+			}
+			else {
+				/* Adjust p and remaining stuff and go to the outer cycle */
+				p += old_i;
+				remain -= old_i;
+				break;
+			}
+		}
+		/*
+		 * Now p is the first valid utf8 character and remain is the rest of the string
+		 * so we can continue our loop
+		 */
 	}
 
-	dchar = dst->str;
+	if (err_offset == 0 && remain > 0) {
+		/* Last piece */
+		memcpy (d, p, remain);
+		d += remain;
+	}
+
+	/* Last '\0' */
+	g_assert (dlen > d - dst);
+	*d = '\0';
 
 	if (dstlen) {
-		*dstlen = dst->len;
+		*dstlen = d - dst;
 	}
 
-	g_string_free (dst, FALSE);
-
-	return dchar;
+	return dst;
 }
 
 gsize
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 7891a8e54..77bb96249 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -527,7 +527,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
  * @param dstelen
  * @return
  */
-gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen);
+gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen, rspamd_mempool_t *pool);
 
 /**
  * Strips characters in `strip_chars` from start and end of the GString


More information about the Commits mailing list