commit 1a7b7d7: [Feature] Add html parsing limit
Vsevolod Stakhov
vsevolod at rspamd.com
Wed Apr 26 22:00:03 UTC 2023
Author: Vsevolod Stakhov
Date: 2023-04-26 22:54:24 +0100
URL: https://github.com/rspamd/rspamd/commit/1a7b7d7076f41651444832f693606f5eca39a624
[Feature] Add html parsing limit
---
src/libmime/message.c | 2 +-
src/libserver/cfg_file.h | 1 +
src/libserver/cfg_rcl.c | 6 ++++++
src/libserver/cfg_utils.c | 2 ++
src/libserver/html/html.cxx | 46 ++++++++++++++++++++++++++++++++++++---------
src/libserver/html/html.h | 2 +-
6 files changed, 48 insertions(+), 11 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index ec49b3b5e..ad2cccf92 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -766,7 +766,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
text_part->html = rspamd_html_process_part_full (
- task->task_pool,
+ task,
text_part->utf_raw_content,
&text_part->exceptions,
MESSAGE_FIELD (task, urls),
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 44502ebb7..d7c3789e7 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -478,6 +478,7 @@ struct rspamd_config {
gint max_recipients; /**< maximum number of recipients to be processed */
guint max_blas_threads; /**< maximum threads for openblas when learning ANN */
guint max_opts_len; /**< maximum length for all options for a symbol */
+ gsize max_html_len; /**< maximum length of HTML document */
struct module_s **compiled_modules; /**< list of compiled C modules */
struct worker_s **compiled_workers; /**< list of compiled C modules */
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index 01c2a6ad1..08d534eb3 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1919,6 +1919,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
G_STRUCT_OFFSET (struct rspamd_config, max_word_len),
RSPAMD_CL_FLAG_UINT,
"Maximum length of the word to be considered in statistics/fuzzy");
+ rspamd_rcl_add_default_handler (sub,
+ "max_html_len",
+ rspamd_rcl_parse_struct_integer,
+ G_STRUCT_OFFSET (struct rspamd_config, max_html_len), /* gsize field of struct rspamd_config */
+ RSPAMD_CL_FLAG_INT_SIZE,
+ "Maximum length of the html part to be parsed");
rspamd_rcl_add_default_handler (sub,
"words_decay",
rspamd_rcl_parse_struct_integer,
diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c
index 09e2ab158..67bc97070 100644
--- a/src/libserver/cfg_utils.c
+++ b/src/libserver/cfg_utils.c
@@ -75,6 +75,7 @@
#define DEFAULT_MAX_SHOTS 100
#define DEFAULT_MAX_SESSIONS 100
#define DEFAULT_MAX_WORKERS 4
+#define DEFAULT_MAX_HTML_SIZE (DEFAULT_MAX_MESSAGE / 5) /* 10 Mb; parenthesized so it expands safely inside expressions */
/* Timeout for task processing */
#define DEFAULT_TASK_TIMEOUT 8.0
#define DEFAULT_LUA_GC_STEP 200
@@ -243,6 +244,7 @@ rspamd_config_new (enum rspamd_config_init_flags flags)
cfg->words_decay = DEFAULT_WORDS_DECAY;
cfg->min_word_len = DEFAULT_MIN_WORD;
cfg->max_word_len = DEFAULT_MAX_WORD;
+ cfg->max_html_len = DEFAULT_MAX_HTML_SIZE;
/* GC limits */
cfg->lua_gc_pause = DEFAULT_LUA_GC_PAUSE;
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e2f484804..91a59c8d0 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -22,6 +22,8 @@
#include "html.hxx"
#include "libserver/css/css_value.hxx"
#include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
#include "url.h"
#include "contrib/libucl/khash.h"
@@ -1321,7 +1323,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
}
auto
-html_process_input(rspamd_mempool_t *pool,
+html_process_input(struct rspamd_task *task,
GByteArray *in,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
@@ -1334,8 +1336,11 @@ html_process_input(rspamd_mempool_t *pool,
guint obrace = 0, ebrace = 0;
struct rspamd_url *url = nullptr;
gint href_offset = -1;
+ auto overflow_input = false;
struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
struct tag_content_parser_state content_parser_env;
+ auto process_size = in->len;
+
enum {
parse_start = 0,
@@ -1364,10 +1369,20 @@ html_process_input(rspamd_mempool_t *pool,
} html_document_state = html_document_state::doctype;
g_assert (in != NULL);
- g_assert (pool != NULL);
+ g_assert (task != NULL);
+
+ auto *pool = task->task_pool;
- struct html_content *hc = new html_content;
- rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
+ auto *hc = new html_content;
+ rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+ if (task->cfg && in->len > task->cfg->max_html_len) {
+ msg_notice_task("html input is too big: %z, limit is %z",
+ in->len,
+ task->cfg->max_html_len);
+ process_size = task->cfg->max_html_len;
+ overflow_input = true;
+ }
auto new_tag = [&](int flags = 0) -> struct html_tag * {
@@ -1525,7 +1540,7 @@ html_process_input(rspamd_mempool_t *pool,
p = (const char *) in->data;
c = p;
- end = p + in->len;
+ end = p + process_size;
start = c;
while (p < end) {
@@ -2140,8 +2155,17 @@ html_process_input(rspamd_mempool_t *pool,
break;
}
+ if (overflow_input) {
+ /*
+ * Parsing stopped at `end` (input start + process_size); append the
+ * unparsed tail as raw html, as further algorithms can skip words when
+ * there are too many. It is still unclear about urls though...
+ */
+ hc->parsed.append(end, in->len - process_size);
+ }
+
if (!hc->parsed.empty()) {
- /* Trim extra spaces at the at the end if needed */
+ /* Trim extra spaces at the end if needed */
if (g_ascii_isspace(hc->parsed.back())) {
auto last_it = std::end(hc->parsed);
@@ -2244,13 +2268,13 @@ html_tag::get_content(const struct html_content *hc) const -> std::string_view
}
void *
-rspamd_html_process_part_full(rspamd_mempool_t *pool,
+rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
bool allow_css)
{
- return rspamd::html::html_process_input(pool, in, exceptions, url_set,
+ return rspamd::html::html_process_input(task, in, exceptions, url_set,
part_urls, allow_css);
}
@@ -2258,7 +2282,11 @@ void *
rspamd_html_process_part(rspamd_mempool_t *pool,
GByteArray *in)
{
- return rspamd_html_process_part_full (pool, in, NULL,
+ struct rspamd_task fake_task; /* stack-only stub so pool-only callers can use the task-based API */
+ memset(&fake_task, 0, sizeof(fake_task)); /* cfg stays NULL, so the `task->cfg &&` size-limit check is skipped */
+ fake_task.task_pool = pool;
+
+ return rspamd_html_process_part_full (&fake_task, in, NULL,
NULL, NULL, FALSE);
}
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 8b690499e..2a43223f9 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -70,7 +70,7 @@ guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
void* rspamd_html_process_part(rspamd_mempool_t *pool,
GByteArray *in);
-void *rspamd_html_process_part_full(rspamd_mempool_t *pool,
+void *rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
More information about the Commits
mailing list