commit 40db5f6: [Project] Css: Enable conditional css parsing support from the HTML parser
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Mar 26 20:57:33 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-03-26 20:54:37 +0000
URL: https://github.com/rspamd/rspamd/commit/40db5f6260c874c1bc5a2f1d8234310df10990f7 (HEAD -> master)
[Project] Css: Enable conditional css parsing support from the HTML parser
---
src/libmime/message.c | 3 ++-
src/libserver/cfg_file.h | 1 +
src/libserver/cfg_rcl.c | 6 ++++++
src/libserver/html.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
src/libserver/html.h | 3 ++-
5 files changed, 53 insertions(+), 5 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 8a9601fa7..9713a6bf5 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -769,7 +769,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
text_part->utf_raw_content,
&text_part->exceptions,
MESSAGE_FIELD (task, urls),
- text_part->mime_part->urls);
+ text_part->mime_part->urls,
+ task->cfg->enable_css_parser);
if (text_part->utf_content->len == 0) {
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 9ef795d05..67f18e1e9 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -377,6 +377,7 @@ struct rspamd_config {
gboolean soft_reject_on_timeout; /**< If true emit soft reject on task timeout (if not reject) */
gboolean public_groups_only; /**< Output merely public groups everywhere */
gboolean enable_test_patterns; /**< Enable test patterns */
+ gboolean enable_css_parser; /**< Enable css parsing in HTML */
gsize max_cores_size; /**< maximum size occupied by rspamd core files */
gsize max_cores_count; /**< maximum number of core files */
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index ffdc5e596..4891c4194 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1999,6 +1999,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
G_STRUCT_OFFSET (struct rspamd_config, enable_test_patterns),
0,
"Enable test GTUBE like patterns (not for production!)");
+ rspamd_rcl_add_default_handler (sub,
+ "enable_css_parser",
+ rspamd_rcl_parse_struct_boolean,
+ G_STRUCT_OFFSET (struct rspamd_config, enable_css_parser),
+ 0,
+ "Enable CSS parser (experimental)");
rspamd_rcl_add_default_handler (sub,
"enable_experimental",
rspamd_rcl_parse_struct_boolean,
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 3d9d540f5..b56f3ef32 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -24,6 +24,7 @@
#include "url.h"
#include "contrib/libucl/khash.h"
#include "libmime/images.h"
+#include "css/css.h"
#include <unicode/uversion.h>
#include <unicode/ucnv.h>
@@ -2781,7 +2782,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
GByteArray *in,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls)
+ GPtrArray *part_urls,
+ bool allow_css)
{
const guchar *p, *c, *end, *savep = NULL;
guchar t;
@@ -2809,6 +2811,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
xml_tag_end,
content_ignore,
content_write,
+ content_style,
content_ignore_sp
} state = parse_start;
@@ -3118,6 +3121,36 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
p ++;
break;
+ case content_style: {
+
+ /*
+ * We just search for the first </s substring and then pass
+ * the content to the parser (if needed)
+ */
+ goffset end_style = rspamd_substring_search (p, end - p,
+ "</", 2);
+ if (end_style == -1 || g_ascii_tolower (p[end_style + 2]) != 's') {
+ /* Invalid style */
+ state = content_ignore;
+ }
+ else {
+
+ if (allow_css) {
+ GError *err = NULL;
+ (void)rspamd_css_parse_style (pool, p, end_style, &err);
+
+ if (err) {
+ msg_info_pool ("cannot parse css: %e", err);
+ g_error_free (err);
+ }
+ }
+
+ p += end_style;
+ state = tag_begin;
+ }
+ break;
+ }
+
case content_ignore_sp:
if (!g_ascii_isspace (t)) {
c = p;
@@ -3173,7 +3206,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
need_decode = FALSE;
}
else {
- state = content_ignore;
+ if (cur_tag->id == Tag_STYLE) {
+ state = content_style;
+ }
+ else {
+ state = content_ignore;
+ }
}
if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
@@ -3387,5 +3425,6 @@ rspamd_html_process_part (rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in)
{
- return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
+ return rspamd_html_process_part_full (pool, hc, in, NULL,
+ NULL, NULL, FALSE);
}
diff --git a/src/libserver/html.h b/src/libserver/html.h
index fba412cb3..f8a5e18e4 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -147,7 +147,8 @@ GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls);
+ GPtrArray *part_urls,
+ bool allow_css);
/*
* Returns true if a specified tag has been seen in a part
More information about the Commits mailing list