commit 9b2e2d7: [Feature] Better escaping of unicode
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Feb 14 17:35:06 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-02-14 17:27:01 +0000
URL: https://github.com/rspamd/rspamd/commit/9b2e2d70a83c5c679f917253bcdb733d4bbbe705
[Feature] Better escaping of unicode
---
src/libutil/str_util.c | 60 +++++++++++++++++++++++++++++++++++++-------------
src/libutil/str_util.h | 1 +
2 files changed, 46 insertions(+), 15 deletions(-)
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 06d7a6cc7..0defa2acf 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2605,7 +2605,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
gsize *dst_len, enum rspamd_regexp_escape_flags flags)
{
const gchar *p, *end = pattern + slen;
- gchar *res, *d, t, *tmp_utf = NULL;
+ gchar *res, *d, t, *tmp_utf = NULL, *dend;
gsize len;
static const gchar hexdigests[16] = "0123456789abcdef";
@@ -2634,15 +2634,22 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
case '$':
case '|':
case '#':
- len ++;
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+ len++;
+ }
break;
default:
if (g_ascii_isspace (t)) {
len ++;
}
else {
- if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
- if (!g_ascii_isprint (t)) {
+ if (!g_ascii_isprint (t) || (t & 0x80)) {
+
+ if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
+ /* \x{code}, where code can be up to 5 digits */
+ len += 4;
+ }
+ else {
/* \\xHH -> 4 symbols */
len += 3;
}
@@ -2668,8 +2675,6 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
*dst_len = slen;
}
-
-
if (tmp_utf) {
return tmp_utf;
}
@@ -2685,8 +2690,10 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
res = g_malloc (len + 1);
p = pattern;
d = res;
+ dend = d + len;
while (p < end) {
+ g_assert (d < dend);
t = *p ++;
switch (t) {
@@ -2704,7 +2711,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
case '$':
case '|':
case '#':
- *d++ = '\\';
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+ *d++ = '\\';
+ }
break;
case '*':
case '?':
@@ -2714,19 +2723,40 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
*d++ = '.';
}
else {
- *d++ = '\\';
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+ *d++ = '\\';
+ }
}
break;
default:
if (g_ascii_isspace (t)) {
- *d++ = '\\';
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+ *d++ = '\\';
+ }
}
- else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) {
- *d++ = '\\';
- *d++ = 'x';
- *d++ = hexdigests[((t >> 4) & 0xF)];
- *d++ = hexdigests[((t) & 0xF)];
- continue; /* To avoid *d++ = t; */
+ else if (t & 0x80 || !g_ascii_isprint (t)) {
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
+ *d++ = '\\';
+ *d++ = 'x';
+ *d++ = hexdigests[((t >> 4) & 0xF)];
+ *d++ = hexdigests[((t) & 0xF)];
+ continue; /* To avoid *d++ = t; */
+ }
+ else {
+ if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) {
+ UChar32 uc;
+ gint32 off = p - pattern - 1;
+ U8_NEXT (pattern, off, slen, uc);
+
+ if (uc > 0) {
+ d += rspamd_snprintf (d, dend - d,
+ "\\x{%xd}", uc);
+ p = pattern + off;
+ }
+
+ continue; /* To avoid *d++ = t; */
+ }
+ }
}
break;
}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 46b74001b..34c1271d4 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -436,6 +436,7 @@ enum rspamd_regexp_escape_flags {
RSPAMD_REGEXP_ESCAPE_ASCII = 0,
RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
+ RSPAMD_REGEXP_ESCAPE_RE = 1u << 2,
};
/**
* Escapes special characters when reading plain data to be processed in pcre
More information about the Commits
mailing list