commit 26eff81: [Minor] Add safety check when using icu ubrk iterators

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Oct 24 12:14:06 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-10-24 13:08:29 +0100
URL: https://github.com/rspamd/rspamd/commit/26eff813e8e951bc1470ed4667ecaabd0aa0588f (HEAD -> master)

[Minor] Add safety check when using icu ubrk iterators

---
 src/libmime/message.c               |  4 +++-
 src/libstat/tokenizers/tokenizers.c | 46 ++++++++++++++++++++++++++++++++-----
 src/libstat/tokenizers/tokenizers.h |  3 ++-
 src/lua/lua_util.c                  |  2 +-
 4 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index cfa8cf97d..648fa82c5 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -187,7 +187,9 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
 			&part->utf_stripped_text,
 			tok_type, task->cfg,
 			part->exceptions,
-			NULL, NULL);
+			NULL,
+			NULL,
+			task->task_pool);
 
 
 	if (part->utf_words) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index acd3c5739..c533534ed 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -285,7 +285,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 					  struct rspamd_config *cfg,
 					  GList *exceptions,
 					  guint64 *hash,
-					  GArray *cur_words)
+					  GArray *cur_words,
+					  rspamd_mempool_t *pool)
 {
 	rspamd_stat_token_t token, buf;
 	const gchar *pos = NULL;
@@ -359,7 +360,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 					ev_tstamp now = ev_time ();
 
 					if (now - start > max_exec_time) {
-						msg_warn ("too long time has been spent on tokenization:"
+						msg_warn_pool_check (
+								"too long time has been spent on tokenization:"
 								  " %.1f ms, limit is %.1f ms; %d words added so far",
 								(now - start) * 1e3, max_exec_time * 1e3,
 								res->len);
@@ -373,7 +375,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 
 			if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
 				/* Due to bug in glib ! */
-				msg_err ("too many words found: %d, stop tokenization to avoid DoS",
+				msg_err_pool_check (
+						"too many words found: %d, stop tokenization to avoid DoS",
 						res->len);
 
 				goto end;
@@ -420,7 +423,17 @@ start_over:
 							if (last > p) {
 								/* Exception spread over the boundaries */
 								while (last > p && p != UBRK_DONE) {
+									gint32 old_p = p;
 									p = ubrk_next (bi);
+
+									if (p <= old_p) {
+										msg_warn_pool_check (
+												"tokenization reversed back on position %d,"
+												"%d new position (%d backward), likely libicu bug!",
+												(gint)(p), (gint)(old_p), old_p - p);
+
+										goto end;
+									}
 								}
 
 								/* We need to reset our scan with new p and last */
@@ -450,7 +463,16 @@ start_over:
 							if (last > p) {
 								/* Exception spread over the boundaries */
 								while (last > p && p != UBRK_DONE) {
+									gint32 old_p = p;
 									p = ubrk_next (bi);
+									if (p <= old_p) {
+										msg_warn_pool_check (
+												"tokenization reversed back on position %d,"
+												"%d new position (%d backward), likely libicu bug!",
+												(gint)(p), (gint)(old_p), old_p - p);
+
+										goto end;
+									}
 								}
 								/* We need to reset our scan with new p and last */
 								SHIFT_EX;
@@ -531,7 +553,8 @@ start_over:
 					ev_tstamp now = ev_time ();
 
 					if (now - start > max_exec_time) {
-						msg_warn ("too long time has been spent on tokenization:"
+						msg_warn_pool_check (
+								"too long time has been spent on tokenization:"
 								  " %.1f ms, limit is %.1f ms; %d words added so far",
 								(now - start) * 1e3, max_exec_time * 1e3,
 								res->len);
@@ -543,6 +566,14 @@ start_over:
 
 			last = p;
 			p = ubrk_next (bi);
+
+			if (p <= last) {
+				msg_warn_pool_check ("tokenization reversed back on position %d,"
+						 "%d new position (%d backward), likely libicu bug!",
+						(gint)(p), (gint)(last), last - p);
+
+				goto end;
+			}
 		}
 	}
 
@@ -599,14 +630,17 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len,
 
 		task->meta_words = rspamd_tokenize_text (beg, len,
 				&utxt, RSPAMD_TOKENIZE_UTF,
-				task->cfg, NULL, NULL, task->meta_words);
+				task->cfg, NULL, NULL,
+				task->meta_words,
+				task->task_pool);
 
 		utext_close (&utxt);
 	}
 	else {
 		task->meta_words = rspamd_tokenize_text (beg, len,
 				NULL, RSPAMD_TOKENIZE_RAW,
-				task->cfg, NULL, NULL, task->meta_words);
+				task->cfg, NULL, NULL, task->meta_words,
+				task->task_pool);
 	}
 }
 
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index bf4987c7a..ca7261802 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -50,7 +50,8 @@ GArray *rspamd_tokenize_text (const gchar *text, gsize len,
 							  struct rspamd_config *cfg,
 							  GList *exceptions,
 							  guint64 *hash,
-							  GArray *cur_words);
+							  GArray *cur_words,
+							  rspamd_mempool_t *pool);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 461130157..1ea8d380c 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1322,7 +1322,7 @@ lua_util_tokenize_text (lua_State *L)
 			&utxt,
 			RSPAMD_TOKENIZE_UTF, NULL,
 			exceptions,
-			NULL, NULL);
+			NULL, NULL, NULL);
 
 	if (res == NULL) {
 		lua_pushnil (L);


More information about the Commits mailing list