commit 1a7b7d7: [Feature] Add html parsing limit

Wed Apr 26 22:00:03 UTC 2023

Author: Vsevolod Stakhov
Date: 2023-04-26 22:54:24 +0100
URL: https://github.com/rspamd/rspamd/commit/1a7b7d7076f41651444832f693606f5eca39a624

[Feature] Add html parsing limit

---
 src/libmime/message.c       |  2 +-
 src/libserver/cfg_file.h    |  1 +
 src/libserver/cfg_rcl.c     |  6 ++++++
 src/libserver/cfg_utils.c   |  2 ++
 src/libserver/html/html.cxx | 46 ++++++++++++++++++++++++++++++++++++---------
 src/libserver/html/html.h   |  2 +-
 6 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index ec49b3b5e..ad2cccf92 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -766,7 +766,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
 
 
 	text_part->html = rspamd_html_process_part_full (
-			task->task_pool,
+			task,
 			text_part->utf_raw_content,
 			&text_part->exceptions,
 			MESSAGE_FIELD (task, urls),
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 44502ebb7..d7c3789e7 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -478,6 +478,7 @@ struct rspamd_config {
 	gint max_recipients;                           /**< maximum number of recipients to be processed	*/
 	guint max_blas_threads;                         /**< maximum threads for openblas when learning ANN		*/
 	guint max_opts_len;                             /**< maximum length for all options for a symbol		*/
+	gsize max_html_len;                             /**< maximum length of HTML document					*/
 
 	struct module_s **compiled_modules;                /**< list of compiled C modules							*/
 	struct worker_s **compiled_workers;                /**< list of compiled C modules							*/
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index 01c2a6ad1..08d534eb3 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1919,6 +1919,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
 				G_STRUCT_OFFSET (struct rspamd_config, max_word_len),
 				RSPAMD_CL_FLAG_UINT,
 				"Maximum length of the word to be considered in statistics/fuzzy");
+		rspamd_rcl_add_default_handler (sub,
+			"max_html_len",
+			rspamd_rcl_parse_struct_integer,
+			G_STRUCT_OFFSET (struct rspamd_config, max_html_len), /* store into the gsize max_html_len field */
+			RSPAMD_CL_FLAG_INT_SIZE,
+			"Maximum length of the html part to be parsed");
 		rspamd_rcl_add_default_handler (sub,
 				"words_decay",
 				rspamd_rcl_parse_struct_integer,
diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c
index 09e2ab158..67bc97070 100644
--- a/src/libserver/cfg_utils.c
+++ b/src/libserver/cfg_utils.c
@@ -75,6 +75,7 @@
 #define DEFAULT_MAX_SHOTS 100
 #define DEFAULT_MAX_SESSIONS 100
 #define DEFAULT_MAX_WORKERS 4
+#define DEFAULT_MAX_HTML_SIZE (DEFAULT_MAX_MESSAGE / 5) /* 10 Mb; parenthesised so the division binds correctly wherever the macro is expanded */
 /* Timeout for task processing */
 #define DEFAULT_TASK_TIMEOUT 8.0
 #define DEFAULT_LUA_GC_STEP 200
@@ -243,6 +244,7 @@ rspamd_config_new (enum rspamd_config_init_flags flags)
 	cfg->words_decay = DEFAULT_WORDS_DECAY;
 	cfg->min_word_len = DEFAULT_MIN_WORD;
 	cfg->max_word_len = DEFAULT_MAX_WORD;
+	cfg->max_html_len = DEFAULT_MAX_HTML_SIZE;
 
 	/* GC limits */
 	cfg->lua_gc_pause = DEFAULT_LUA_GC_PAUSE;
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e2f484804..91a59c8d0 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -22,6 +22,8 @@
 #include "html.hxx"
 #include "libserver/css/css_value.hxx"
 #include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
 
 #include "url.h"
 #include "contrib/libucl/khash.h"
@@ -1321,7 +1323,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 }
 
 auto
-html_process_input(rspamd_mempool_t *pool,
+html_process_input(struct rspamd_task *task,
 				   GByteArray *in,
 				   GList **exceptions,
 				   khash_t (rspamd_url_hash) *url_set,
@@ -1334,8 +1336,11 @@ html_process_input(rspamd_mempool_t *pool,
 	guint obrace = 0, ebrace = 0;
 	struct rspamd_url *url = nullptr;
 	gint href_offset = -1;
+	auto overflow_input = false;
 	struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
 	struct tag_content_parser_state content_parser_env;
+	auto process_size = in->len;
+
 
 	enum {
 		parse_start = 0,
@@ -1364,10 +1369,20 @@ html_process_input(rspamd_mempool_t *pool,
 	} html_document_state = html_document_state::doctype;
 
 	g_assert (in != NULL);
-	g_assert (pool != NULL);
+	g_assert (task != NULL);
+
+	auto *pool = task->task_pool;
 
-	struct html_content *hc = new html_content;
-	rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
+	auto *hc = new html_content;
+	rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+	if (task->cfg && in->len > task->cfg->max_html_len) {
+		msg_notice_task("html input is too big: %z, limit is %z",
+				in->len,
+				task->cfg->max_html_len);
+		process_size = task->cfg->max_html_len;
+		overflow_input = true;
+	}
 
 	auto new_tag = [&](int flags = 0) -> struct html_tag * {
 
@@ -1525,7 +1540,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 	p = (const char *) in->data;
 	c = p;
-	end = p + in->len;
+	end = p + process_size;
 	start = c;
 
 	while (p < end) {
@@ -2140,8 +2155,17 @@ html_process_input(rspamd_mempool_t *pool,
 		break;
 	}
 
+	if (overflow_input) {
+		/*
+		 * Append the rest of the input as raw html, this might work as
+		 * further algorithms can skip words when there are too many.
+		 * It is still unclear about urls though...
+		 */
+		hc->parsed.append(end, in->len - process_size);
+	}
+
 	if (!hc->parsed.empty()) {
-		/* Trim extra spaces at the at the end if needed */
+		/* Trim extra spaces at the end if needed */
 		if (g_ascii_isspace(hc->parsed.back())) {
 			auto last_it = std::end(hc->parsed);
 
@@ -2244,13 +2268,13 @@ html_tag::get_content(const struct html_content *hc) const -> std::string_view
 }
 
 void *
-rspamd_html_process_part_full(rspamd_mempool_t *pool,
+rspamd_html_process_part_full(struct rspamd_task *task,
 							  GByteArray *in, GList **exceptions,
 							  khash_t (rspamd_url_hash) *url_set,
 							  GPtrArray *part_urls,
 							  bool allow_css)
 {
-	return rspamd::html::html_process_input(pool, in, exceptions, url_set,
+	return rspamd::html::html_process_input(task, in, exceptions, url_set,
 			part_urls, allow_css);
 }
 
@@ -2258,7 +2282,11 @@ void *
 rspamd_html_process_part(rspamd_mempool_t *pool,
 						 GByteArray *in)
 {
-	return rspamd_html_process_part_full (pool, in, NULL,
+	struct rspamd_task fake_task;
+	memset(&fake_task, 0, sizeof(fake_task));
+	fake_task.task_pool = pool;
+
+	return rspamd_html_process_part_full (&fake_task, in, NULL,
 			NULL, NULL, FALSE);
 }
 
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 8b690499e..2a43223f9 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -70,7 +70,7 @@ guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
 void* rspamd_html_process_part(rspamd_mempool_t *pool,
 							   GByteArray *in);
 
-void *rspamd_html_process_part_full(rspamd_mempool_t *pool,
+void *rspamd_html_process_part_full(struct rspamd_task *task,
 									GByteArray *in, GList **exceptions,
 									khash_t (rspamd_url_hash) *url_set,
 									GPtrArray *part_urls,