commit 40db5f6: [Project] Css: Enable conditional css parsing support from the HTML parser

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Mar 26 20:57:33 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-03-26 20:54:37 +0000
URL: https://github.com/rspamd/rspamd/commit/40db5f6260c874c1bc5a2f1d8234310df10990f7 (HEAD -> master)

[Project] Css: Enable conditional css parsing support from the HTML parser

---
 src/libmime/message.c    |  3 ++-
 src/libserver/cfg_file.h |  1 +
 src/libserver/cfg_rcl.c  |  6 ++++++
 src/libserver/html.c     | 45 ++++++++++++++++++++++++++++++++++++++++++---
 src/libserver/html.h     |  3 ++-
 5 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index 8a9601fa7..9713a6bf5 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -769,7 +769,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
 			text_part->utf_raw_content,
 			&text_part->exceptions,
 			MESSAGE_FIELD (task, urls),
-			text_part->mime_part->urls);
+			text_part->mime_part->urls,
+			task->cfg->enable_css_parser);
 
 	if (text_part->utf_content->len == 0) {
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 9ef795d05..67f18e1e9 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -377,6 +377,7 @@ struct rspamd_config {
 	gboolean soft_reject_on_timeout;                /**< If true emit soft reject on task timeout (if not reject) */
 	gboolean public_groups_only;                    /**< Output merely public groups everywhere				*/
 	gboolean enable_test_patterns;                  /**< Enable test patterns								*/
+	gboolean enable_css_parser;                     /**< Enable css parsing in HTML							*/
 
 	gsize max_cores_size;                           /**< maximum size occupied by rspamd core files			*/
 	gsize max_cores_count;                          /**< maximum number of core files						*/
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index ffdc5e596..4891c4194 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1999,6 +1999,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
 				G_STRUCT_OFFSET (struct rspamd_config, enable_test_patterns),
 				0,
 				"Enable test GTUBE like patterns (not for production!)");
+		rspamd_rcl_add_default_handler (sub,
+				"enable_css_parser",
+				rspamd_rcl_parse_struct_boolean,
+				G_STRUCT_OFFSET (struct rspamd_config, enable_css_parser),
+				0,
+				"Enable CSS parser (experimental)");
 		rspamd_rcl_add_default_handler (sub,
 				"enable_experimental",
 				rspamd_rcl_parse_struct_boolean,
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 3d9d540f5..b56f3ef32 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -24,6 +24,7 @@
 #include "url.h"
 #include "contrib/libucl/khash.h"
 #include "libmime/images.h"
+#include "css/css.h"
 
 #include <unicode/uversion.h>
 #include <unicode/ucnv.h>
@@ -2781,7 +2782,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 							   GByteArray *in,
 							   GList **exceptions,
 							   khash_t (rspamd_url_hash) *url_set,
-							   GPtrArray *part_urls)
+							   GPtrArray *part_urls,
+							   bool allow_css)
 {
 	const guchar *p, *c, *end, *savep = NULL;
 	guchar t;
@@ -2809,6 +2811,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 		xml_tag_end,
 		content_ignore,
 		content_write,
+		content_style,
 		content_ignore_sp
 	} state = parse_start;
 
@@ -3118,6 +3121,36 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 			p ++;
 			break;
 
+		case content_style: {
+
+			/*
+			 * We just search for the first </s substring and then pass
+			 * the content to the parser (if needed)
+			 */
+			goffset end_style = rspamd_substring_search (p, end - p,
+					"</", 2);
+			if (end_style == -1 || g_ascii_tolower (p[end_style + 2]) != 's') {
+				/* Invalid style */
+				state = content_ignore;
+			}
+			else {
+
+				if (allow_css) {
+					GError *err = NULL;
+					(void)rspamd_css_parse_style (pool, p, end_style, &err);
+
+					if (err) {
+						msg_info_pool ("cannot parse css: %e", err);
+						g_error_free (err);
+					}
+				}
+
+				p += end_style;
+				state = tag_begin;
+			}
+			break;
+		}
+
 		case content_ignore_sp:
 			if (!g_ascii_isspace (t)) {
 				c = p;
@@ -3173,7 +3206,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 					need_decode = FALSE;
 				}
 				else {
-					state = content_ignore;
+					if (cur_tag->id == Tag_STYLE) {
+						state = content_style;
+					}
+					else {
+						state = content_ignore;
+					}
 				}
 
 				if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
@@ -3387,5 +3425,6 @@ rspamd_html_process_part (rspamd_mempool_t *pool,
 		struct html_content *hc,
 		GByteArray *in)
 {
-	return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
+	return rspamd_html_process_part_full (pool, hc, in, NULL,
+			NULL, NULL, FALSE);
 }
diff --git a/src/libserver/html.h b/src/libserver/html.h
index fba412cb3..f8a5e18e4 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -147,7 +147,8 @@ GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
 										   struct html_content *hc,
 										   GByteArray *in, GList **exceptions,
 										   khash_t (rspamd_url_hash) *url_set,
-										   GPtrArray *part_urls);
+										   GPtrArray *part_urls,
+										   bool allow_css);
 
 /*
  * Returns true if a specified tag has been seen in a part


More information about the Commits mailing list