commit 26eff81: [Minor] Add safety check when using icu ubrk iterators
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Oct 24 12:14:06 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-10-24 13:08:29 +0100
URL: https://github.com/rspamd/rspamd/commit/26eff813e8e951bc1470ed4667ecaabd0aa0588f (HEAD -> master)
[Minor] Add safety check when using icu ubrk iterators
---
src/libmime/message.c | 4 +++-
src/libstat/tokenizers/tokenizers.c | 46 ++++++++++++++++++++++++++++++++-----
src/libstat/tokenizers/tokenizers.h | 3 ++-
src/lua/lua_util.c | 2 +-
4 files changed, 46 insertions(+), 9 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index cfa8cf97d..648fa82c5 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -187,7 +187,9 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
&part->utf_stripped_text,
tok_type, task->cfg,
part->exceptions,
- NULL, NULL);
+ NULL,
+ NULL,
+ task->task_pool);
if (part->utf_words) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index acd3c5739..c533534ed 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -285,7 +285,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
struct rspamd_config *cfg,
GList *exceptions,
guint64 *hash,
- GArray *cur_words)
+ GArray *cur_words,
+ rspamd_mempool_t *pool)
{
rspamd_stat_token_t token, buf;
const gchar *pos = NULL;
@@ -359,7 +360,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
ev_tstamp now = ev_time ();
if (now - start > max_exec_time) {
- msg_warn ("too long time has been spent on tokenization:"
+ msg_warn_pool_check (
+ "too long time has been spent on tokenization:"
" %.1f ms, limit is %.1f ms; %d words added so far",
(now - start) * 1e3, max_exec_time * 1e3,
res->len);
@@ -373,7 +375,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
/* Due to bug in glib ! */
- msg_err ("too many words found: %d, stop tokenization to avoid DoS",
+ msg_err_pool_check (
+ "too many words found: %d, stop tokenization to avoid DoS",
res->len);
goto end;
@@ -420,7 +423,17 @@ start_over:
if (last > p) {
/* Exception spread over the boundaries */
while (last > p && p != UBRK_DONE) {
+ gint32 old_p = p;
p = ubrk_next (bi);
+
+ if (p <= old_p) {
+ msg_warn_pool_check (
+ "tokenization reversed back on position %d,"
+ "%d new position (%d backward), likely libicu bug!",
+ (gint)(p), (gint)(old_p), old_p - p);
+
+ goto end;
+ }
}
/* We need to reset our scan with new p and last */
@@ -450,7 +463,16 @@ start_over:
if (last > p) {
/* Exception spread over the boundaries */
while (last > p && p != UBRK_DONE) {
+ gint32 old_p = p;
p = ubrk_next (bi);
+ if (p <= old_p) {
+ msg_warn_pool_check (
+ "tokenization reversed back on position %d,"
+ "%d new position (%d backward), likely libicu bug!",
+ (gint)(p), (gint)(old_p), old_p - p);
+
+ goto end;
+ }
}
/* We need to reset our scan with new p and last */
SHIFT_EX;
@@ -531,7 +553,8 @@ start_over:
ev_tstamp now = ev_time ();
if (now - start > max_exec_time) {
- msg_warn ("too long time has been spent on tokenization:"
+ msg_warn_pool_check (
+ "too long time has been spent on tokenization:"
" %.1f ms, limit is %.1f ms; %d words added so far",
(now - start) * 1e3, max_exec_time * 1e3,
res->len);
@@ -543,6 +566,14 @@ start_over:
last = p;
p = ubrk_next (bi);
+
+ if (p <= last) {
+ msg_warn_pool_check ("tokenization reversed back on position %d,"
+ "%d new position (%d backward), likely libicu bug!",
+ (gint)(p), (gint)(last), last - p);
+
+ goto end;
+ }
}
}
@@ -599,14 +630,17 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len,
task->meta_words = rspamd_tokenize_text (beg, len,
&utxt, RSPAMD_TOKENIZE_UTF,
- task->cfg, NULL, NULL, task->meta_words);
+ task->cfg, NULL, NULL,
+ task->meta_words,
+ task->task_pool);
utext_close (&utxt);
}
else {
task->meta_words = rspamd_tokenize_text (beg, len,
NULL, RSPAMD_TOKENIZE_RAW,
- task->cfg, NULL, NULL, task->meta_words);
+ task->cfg, NULL, NULL, task->meta_words,
+ task->task_pool);
}
}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index bf4987c7a..ca7261802 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -50,7 +50,8 @@ GArray *rspamd_tokenize_text (const gchar *text, gsize len,
struct rspamd_config *cfg,
GList *exceptions,
guint64 *hash,
- GArray *cur_words);
+ GArray *cur_words,
+ rspamd_mempool_t *pool);
/* OSB tokenize function */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 461130157..1ea8d380c 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1322,7 +1322,7 @@ lua_util_tokenize_text (lua_State *L)
&utxt,
RSPAMD_TOKENIZE_UTF, NULL,
exceptions,
- NULL, NULL);
+ NULL, NULL, NULL);
if (res == NULL) {
lua_pushnil (L);
More information about the Commits mailing list