commit c234e5b: [Rework] Rewrite rspamd_str_make_utf_valid function
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Nov 15 18:56:11 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-11-15 17:27:42 +0000
URL: https://github.com/rspamd/rspamd/commit/c234e5bc9c8b19625009d3925f37875e5fa820d4
[Rework] Rewrite rspamd_str_make_utf_valid function
---
src/libutil/str_util.c | 115 +++++++++++++++++++++++++++++++++++--------------
src/libutil/str_util.h | 2 +-
2 files changed, 83 insertions(+), 34 deletions(-)
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 90924f8d1..dd1b139d8 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2935,7 +2935,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
- tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
+ tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL, NULL);
}
}
@@ -3052,61 +3052,110 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
gchar *
-rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen)
+rspamd_str_make_utf_valid (const guchar *src, gsize slen,
+ gsize *dstlen,
+ rspamd_mempool_t *pool)
{
- GString *dst;
- const gchar *last;
- gchar *dchar;
- gsize valid, prev;
UChar32 uc;
- gint32 i;
+ goffset err_offset;
+ const guchar *p;
+ gchar *dst, *d;
+ gsize remain = slen, dlen = 0;
if (src == NULL) {
return NULL;
}
if (slen == 0) {
- slen = strlen (src);
+ return NULL;
}
- dst = g_string_sized_new (slen);
- i = 0;
- last = src;
- valid = 0;
- prev = 0;
+ p = src;
+ dlen = slen;
- while (i < slen) {
- U8_NEXT (src, i, slen, uc);
+ /* Check space required */
+ while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) {
+ gint i = 0;
- if (uc <= 0) {
- if (valid > 0) {
- g_string_append_len (dst, last, valid);
+ p += err_offset;
+ remain -= err_offset;
+ dlen += err_offset;
+
+ /* Each invalid character of input requires 3 bytes of output */
+ while (i < remain) {
+ gint old_i = i;
+ U8_NEXT (p, i, remain, uc);
+
+ if (uc < 0) {
+ dlen += 3;
+ }
+ else {
+ p += old_i;
+ remain -= old_i;
+ break;
}
- /* 0xFFFD in UTF8 */
- g_string_append_len (dst, "\357\277\275", 3);
- valid = 0;
- last = &src[i];
- }
- else {
- valid += i - prev;
}
+ }
- prev = i;
+ if (pool) {
+ dst = rspamd_mempool_alloc (pool, dlen + 1);
+ }
+ else {
+ dst = g_malloc (dlen + 1);
}
- if (valid > 0) {
- g_string_append_len (dst, last, valid);
+ p = src;
+ d = dst;
+ remain = slen;
+
+ while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) {
+ /* Copy valid */
+ memcpy (d, p, err_offset);
+ d += err_offset;
+
+ /* Append 0xFFFD for each bad character */
+ gint i = 0;
+
+ p += err_offset;
+ remain -= err_offset;
+
+ while (i < remain) {
+ gint old_i = i;
+ U8_NEXT (p, i, remain, uc);
+
+ if (uc < 0) {
+ *d++ = '\357';
+ *d++ = '\277';
+ *d++ = '\275';
+ }
+ else {
+ /* Adjust p and remaining stuff and go to the outer cycle */
+ p += old_i;
+ remain -= old_i;
+ break;
+ }
+ }
+ /*
+ * Now p is the first valid utf8 character and remain is the rest of the string
+ * so we can continue our loop
+ */
}
- dchar = dst->str;
+ if (err_offset == 0 && remain > 0) {
+ /* Last piece */
+ memcpy (d, p, remain);
+ d += remain;
+ }
+
+ /* Last '\0' */
+ g_assert (dlen > d - dst);
+ *d = '\0';
if (dstlen) {
- *dstlen = dst->len;
+ *dstlen = d - dst;
}
- g_string_free (dst, FALSE);
-
- return dchar;
+ return dst;
}
gsize
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 7891a8e54..77bb96249 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -527,7 +527,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
* @param dstelen
* @return
*/
-gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen);
+gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen, rspamd_mempool_t *pool);
/**
* Strips characters in `strip_chars` from start and end of the GString
More information about the Commits
mailing list