commit 9b2e2d7: [Feature] Better escaping of Unicode

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Feb 14 17:35:06 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-02-14 17:27:01 +0000
URL: https://github.com/rspamd/rspamd/commit/9b2e2d70a83c5c679f917253bcdb733d4bbbe705

[Feature] Better escaping of Unicode

---
 src/libutil/str_util.c | 60 +++++++++++++++++++++++++++++++++++++-------------
 src/libutil/str_util.h |  1 +
 2 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 06d7a6cc7..0defa2acf 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2605,7 +2605,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 		gsize *dst_len, enum rspamd_regexp_escape_flags flags)
 {
 	const gchar *p, *end = pattern + slen;
-	gchar *res, *d, t, *tmp_utf = NULL;
+	gchar *res, *d, t, *tmp_utf = NULL, *dend;
 	gsize len;
 	static const gchar hexdigests[16] = "0123456789abcdef";
 
@@ -2634,15 +2634,22 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 		case '$':
 		case '|':
 		case '#':
-			len ++;
+			if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+				len++;
+			}
 			break;
 		default:
 			if (g_ascii_isspace (t)) {
 				len ++;
 			}
 			else {
-				if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
-					if (!g_ascii_isprint (t)) {
+				if (!g_ascii_isprint (t) || (t & 0x80)) {
+
+					if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
+						/* \x{code}, where code can be up to 5 digits */
+						len += 4;
+					}
+					else {
 						/* \\xHH -> 4 symbols */
 						len += 3;
 					}
@@ -2668,8 +2675,6 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 			*dst_len = slen;
 		}
 
-
-
 		if (tmp_utf) {
 			return tmp_utf;
 		}
@@ -2685,8 +2690,10 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 	res = g_malloc (len + 1);
 	p = pattern;
 	d = res;
+	dend = d + len;
 
 	while (p < end) {
+		g_assert (d < dend);
 		t = *p ++;
 
 		switch (t) {
@@ -2704,7 +2711,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 		case '$':
 		case '|':
 		case '#':
-			*d++ = '\\';
+			if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+				*d++ = '\\';
+			}
 			break;
 		case '*':
 		case '?':
@@ -2714,19 +2723,40 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 				*d++ = '.';
 			}
 			else {
-				*d++ = '\\';
+				if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+					*d++ = '\\';
+				}
 			}
 			break;
 		default:
 			if (g_ascii_isspace (t)) {
-				*d++ = '\\';
+				if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+					*d++ = '\\';
+				}
 			}
-			else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) {
-				*d++ = '\\';
-				*d++ = 'x';
-				*d++ = hexdigests[((t >> 4) & 0xF)];
-				*d++ = hexdigests[((t) & 0xF)];
-				continue; /* To avoid *d++ = t; */
+			else if (t & 0x80 || !g_ascii_isprint (t)) {
+				if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
+					*d++ = '\\';
+					*d++ = 'x';
+					*d++ = hexdigests[((t >> 4) & 0xF)];
+					*d++ = hexdigests[((t) & 0xF)];
+					continue; /* To avoid *d++ = t; */
+				}
+				else {
+					if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) {
+						UChar32 uc;
+						gint32 off = p - pattern - 1;
+						U8_NEXT (pattern, off, slen, uc);
+
+						if (uc > 0) {
+							d += rspamd_snprintf (d, dend - d,
+									"\\x{%xd}", uc);
+							p = pattern + off;
+						}
+
+						continue; /* To avoid *d++ = t; */
+					}
+				}
 			}
 			break;
 		}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 46b74001b..34c1271d4 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -436,6 +436,7 @@ enum rspamd_regexp_escape_flags {
 	RSPAMD_REGEXP_ESCAPE_ASCII = 0,
 	RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
 	RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
+	RSPAMD_REGEXP_ESCAPE_RE = 1u << 2,
 };
 /**
  * Escapes special characters when reading plain data to be processed in pcre


More information about the Commits mailing list