commit e02a4f2: [Project] Html/CSS: Switch styles parsing to css parser

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Jun 11 14:14:07 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-10 17:21:09 +0100
URL: https://github.com/rspamd/rspamd/commit/e02a4f2831af83eec951b98cc93823568c226f4f

[Project] Html/CSS: Switch styles parsing to css parser

---
 src/libserver/css/css_parser.cxx  |  14 +
 src/libserver/css/css_parser.hxx  |  13 +
 src/libserver/html/html.cxx       | 618 +-------------------------------------
 src/libserver/html/html.hxx       |   2 -
 src/libserver/html/html_block.hxx |  11 +
 5 files changed, 50 insertions(+), 608 deletions(-)

diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 774a65cfc..34d65aadc 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -801,6 +801,20 @@ auto parse_css(rspamd_mempool_t *pool, const std::string_view &st,
 											   "cannot parse input"});
 }
 
+auto
+parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st)
+	-> rspamd::html::html_block *
+{
+	/* Feed the rules parser functor for `st` to the declarations processor */
+	auto &&declarations = process_declaration_tokens(pool,
+			get_rules_parser_functor(pool, st));
+	if (!declarations) {
+		/* Nothing was produced, so there is no block to compile */
+		return nullptr;
+	}
+	return declarations->compile_to_block(pool);
+}
+
 TEST_SUITE("css parser") {
 	TEST_CASE("parse colors") {
 		const std::vector<const char *> cases{
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
index ec6d5159a..1e0762d78 100644
--- a/src/libserver/css/css_parser.hxx
+++ b/src/libserver/css/css_parser.hxx
@@ -30,6 +30,10 @@
 #include "contrib/expected/expected.hpp"
 #include "logger.h"
 
+/* Forward declaration: parse_css_declaration only returns html_block by pointer */
+namespace rspamd::html {
+struct html_block;
+}
 
 namespace rspamd::css {
 
@@ -205,6 +209,15 @@ auto get_selectors_parser_functor(rspamd_mempool_t *pool,
 auto get_rules_parser_functor(rspamd_mempool_t *pool,
 							  const std::string_view &st) -> blocks_gen_functor;
 
+/**
+ * Parses a css declaration (e.g. embedded css) and returns a completed html block
+ * @param pool memory pool passed to the parser and to the block compilation
+ * @param st css text to parse
+ * @return compiled html block, or nullptr if parsing produced no declarations
+ */
+auto parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st)
+	-> rspamd::html::html_block *;
+
 }
 
 #endif //RSPAMD_CSS_PARSER_HXX
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e867cce6d..1d13c2466 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -990,610 +990,36 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	}
 }
 
-static void
-html_process_color(std::string_view input, struct html_color *cl)
-{
-	const gchar *p = input.data(), *end = input.data() + input.size();
-	char hexbuf[7];
-
-	memset(cl, 0, sizeof(*cl));
-
-	if (*p == '#') {
-		/* HEX color */
-		p++;
-		rspamd_strlcpy(hexbuf, p, MIN ((gint) sizeof(hexbuf), end - p + 1));
-		cl->d.val = strtoul(hexbuf, NULL, 16);
-		cl->d.comp.alpha = 255;
-		cl->valid = TRUE;
-	}
-	else if (input.size() > 4 && rspamd_lc_cmp(p, "rgb", 3) == 0) {
-		/* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
-		enum {
-			obrace,
-			num1,
-			num2,
-			num3,
-			num4,
-			skip_spaces
-		} state = skip_spaces, next_state = obrace;
-		gulong r = 0, g = 0, b = 0, opacity = 255;
-		const gchar *c;
-		gboolean valid = FALSE;
-
-		p += 3;
-
-		if (*p == 'a') {
-			p++;
-		}
-
-		c = p;
-
-		while (p < end) {
-			switch (state) {
-			case obrace:
-				if (*p == '(') {
-					p++;
-					state = skip_spaces;
-					next_state = num1;
-				}
-				else if (g_ascii_isspace (*p)) {
-					state = skip_spaces;
-					next_state = obrace;
-				}
-				else {
-					goto stop;
-				}
-				break;
-			case num1:
-				if (*p == ',') {
-					if (!rspamd_strtoul(c, p - c, &r)) {
-						goto stop;
-					}
-
-					p++;
-					state = skip_spaces;
-					next_state = num2;
-				}
-				else if (!g_ascii_isdigit (*p)) {
-					goto stop;
-				}
-				else {
-					p++;
-				}
-				break;
-			case num2:
-				if (*p == ',') {
-					if (!rspamd_strtoul(c, p - c, &g)) {
-						goto stop;
-					}
-
-					p++;
-					state = skip_spaces;
-					next_state = num3;
-				}
-				else if (!g_ascii_isdigit (*p)) {
-					goto stop;
-				}
-				else {
-					p++;
-				}
-				break;
-			case num3:
-				if (*p == ',') {
-					if (!rspamd_strtoul(c, p - c, &b)) {
-						goto stop;
-					}
-
-					valid = TRUE;
-					p++;
-					state = skip_spaces;
-					next_state = num4;
-				}
-				else if (*p == ')') {
-					if (!rspamd_strtoul(c, p - c, &b)) {
-						goto stop;
-					}
-
-					valid = TRUE;
-					goto stop;
-				}
-				else if (!g_ascii_isdigit (*p)) {
-					goto stop;
-				}
-				else {
-					p++;
-				}
-				break;
-			case num4:
-				if (*p == ',') {
-					if (!rspamd_strtoul(c, p - c, &opacity)) {
-						goto stop;
-					}
-
-					valid = TRUE;
-					goto stop;
-				}
-				else if (*p == ')') {
-					if (!rspamd_strtoul(c, p - c, &opacity)) {
-						goto stop;
-					}
-
-					valid = TRUE;
-					goto stop;
-				}
-				else if (!g_ascii_isdigit (*p)) {
-					goto stop;
-				}
-				else {
-					p++;
-				}
-				break;
-			case skip_spaces:
-				if (!g_ascii_isspace (*p)) {
-					c = p;
-					state = next_state;
-				}
-				else {
-					p++;
-				}
-				break;
-			}
-		}
-
-stop:
-
-		if (valid) {
-			cl->d.comp.r = r;
-			cl->d.comp.g = g;
-			cl->d.comp.b = b;
-			cl->d.comp.alpha = opacity;
-			cl->valid = TRUE;
-		}
-	}
-	else {
-		auto maybe_color_value =
-				rspamd::css::css_value::maybe_color_from_string(input);
-
-		if (maybe_color_value.has_value()) {
-			auto color = maybe_color_value->to_color().value();
-			cl->d.val = color.to_number();
-			cl->d.comp.alpha = 255; /* Non transparent */
-		}
-	}
-}
-
-/*
- * Target is used for in and out if this function returns TRUE
- */
-static auto
-html_process_css_size(const gchar *suffix, gsize len,
-							 double &tgt)  -> bool
-{
-	gdouble sz = tgt;
-	gboolean ret = FALSE;
-
-	if (len >= 2) {
-		if (memcmp(suffix, "px", 2) == 0) {
-			sz = (guint) sz; /* Round to number */
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "em", 2) == 0) {
-			/* EM is 16 px, so multiply and round */
-			sz = (guint) (sz * 16.0);
-			ret = TRUE;
-		}
-		else if (len >= 3 && memcmp(suffix, "rem", 3) == 0) {
-			/* equal to EM in our case */
-			sz = (guint) (sz * 16.0);
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "ex", 2) == 0) {
-			/*
-			 * Represents the x-height of the element's font.
-			 * On fonts with the "x" letter, this is generally the height
-			 * of lowercase letters in the font; 1ex = 0.5em in many fonts.
-			 */
-			sz = (guint) (sz * 8.0);
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "vw", 2) == 0) {
-			/*
-			 * Vewport width in percentages:
-			 * we assume 1% of viewport width as 8px
-			 */
-			sz = (guint) (sz * 8.0);
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "vh", 2) == 0) {
-			/*
-			 * Vewport height in percentages
-			 * we assume 1% of viewport width as 6px
-			 */
-			sz = (guint) (sz * 6.0);
-			ret = TRUE;
-		}
-		else if (len >= 4 && memcmp(suffix, "vmax", 4) == 0) {
-			/*
-			 * Vewport width in percentages
-			 * we assume 1% of viewport width as 6px
-			 */
-			sz = (guint) (sz * 8.0);
-			ret = TRUE;
-		}
-		else if (len >= 4 && memcmp(suffix, "vmin", 4) == 0) {
-			/*
-			 * Vewport height in percentages
-			 * we assume 1% of viewport width as 6px
-			 */
-			sz = (guint) (sz * 6.0);
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "pt", 2) == 0) {
-			sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "cm", 2) == 0) {
-			sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "mm", 2) == 0) {
-			sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "in", 2) == 0) {
-			sz = (guint) (sz * 96.0); /* 96px */
-			ret = TRUE;
-		}
-		else if (memcmp(suffix, "pc", 2) == 0) {
-			sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
-			ret = TRUE;
-		}
-	}
-	else if (suffix[0] == '%') {
-		/* Percentages from 16 px */
-		sz = (guint) (sz / 100.0 * 16.0);
-		ret = TRUE;
-	}
-
-	if (ret) {
-		tgt = sz;
-	}
-
-	return ret;
-}
-
-static auto
-html_process_font_size(const gchar *line, guint len, guint &fs,
-							  gboolean is_css) -> void
-{
-	const gchar *p = line, *end = line + len;
-	gchar *err = NULL, numbuf[64];
-	gdouble sz = 0;
-	gboolean failsafe = FALSE;
-
-	while (p < end && g_ascii_isspace (*p)) {
-		p++;
-		len--;
-	}
-
-	if (g_ascii_isdigit (*p)) {
-		rspamd_strlcpy(numbuf, p, MIN (sizeof(numbuf), len + 1));
-		sz = strtod(numbuf, &err);
-
-		/* Now check leftover */
-		if (sz < 0) {
-			sz = 0;
-		}
-	}
-	else {
-		/* Ignore the rest */
-		failsafe = TRUE;
-		sz = is_css ? 16 : 1;
-		/* TODO: add textual fonts descriptions */
-	}
-
-	if (err && *err != '\0') {
-		const gchar *e = err;
-		gsize slen;
-
-		/* Skip spaces */
-		while (*e && g_ascii_isspace (*e)) {
-			e++;
-		}
-
-		/* Lowercase */
-		slen = strlen(e);
-		rspamd_str_lc((gchar *) e, slen);
-
-		if (!html_process_css_size(e, slen, sz)) {
-			failsafe = TRUE;
-		}
-	}
-	else {
-		/* Failsafe naked number */
-		failsafe = TRUE;
-	}
-
-	if (failsafe) {
-		if (is_css) {
-			/*
-			 * In css mode we usually ignore sizes, but let's treat
-			 * small sizes specially
-			 */
-			if (sz < 1) {
-				sz = 0;
-			}
-			else {
-				sz = 16; /* Ignore */
-			}
-		}
-		else {
-			/* In non-css mode we have to check legacy size */
-			sz = sz >= 1 ? sz * 16 : 16;
-		}
-	}
-
-	if (sz > 32) {
-		sz = 32;
-	}
-
-	fs = sz;
-}
-
-static void
-html_process_style(rspamd_mempool_t *pool, struct html_block *bl,
-				   struct html_content *hc,
-				   std::string_view style)
-{
-	const gchar *p, *c, *end, *key = NULL;
-	enum {
-		read_key,
-		read_colon,
-		read_value,
-		skip_spaces,
-	} state = skip_spaces, next_state = read_key;
-	guint klen = 0;
-	gdouble opacity = 1.0;
-
-	p = style.data();
-	c = p;
-	end = p + style.size();
-
-	while (p <= end) {
-		switch (state) {
-		case read_key:
-			if (p == end || *p == ':') {
-				key = c;
-				klen = p - c;
-				state = skip_spaces;
-				next_state = read_value;
-			}
-			else if (g_ascii_isspace (*p)) {
-				key = c;
-				klen = p - c;
-				state = skip_spaces;
-				next_state = read_colon;
-			}
-
-			p++;
-			break;
-
-		case read_colon:
-			if (p == end || *p == ':') {
-				state = skip_spaces;
-				next_state = read_value;
-			}
-
-			p++;
-			break;
-
-		case read_value:
-			if (p == end || *p == ';') {
-				if (key && klen && p - c > 0) {
-					if ((klen == 5 && g_ascii_strncasecmp(key, "color", 5) == 0)
-						|| (klen == 10 && g_ascii_strncasecmp(key, "font-color", 10) == 0)) {
-
-						html_process_color({c, (std::size_t)(p - c)}, &bl->font_color);
-						msg_debug_html ("got color: %xd", bl->font_color.d.val);
-					}
-					else if ((klen == 16 && g_ascii_strncasecmp(key,
-							"background-color", 16) == 0) ||
-							 (klen == 10 && g_ascii_strncasecmp(key,
-									 "background", 10) == 0)) {
-
-						html_process_color({c, (std::size_t)(p - c)}, &bl->background_color);
-						msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
-					}
-					else if (klen == 7 && g_ascii_strncasecmp(key, "display", 7) == 0) {
-						if (p - c >= 4 && rspamd_substring_search_caseless(c, p - c,
-								"none", 4) != -1) {
-							bl->visible = FALSE;
-							msg_debug_html ("tag is not visible");
-						}
-					}
-					else if (klen == 9 &&
-							 g_ascii_strncasecmp(key, "font-size", 9) == 0) {
-						html_process_font_size(c, p - c,
-								bl->font_size, TRUE);
-						msg_debug_html ("got font size: %ud", bl->font_size);
-					}
-					else if (klen == 7 &&
-							 g_ascii_strncasecmp(key, "opacity", 7) == 0) {
-						gchar numbuf[64];
-
-						rspamd_strlcpy(numbuf, c,
-								MIN (sizeof(numbuf), p - c + 1));
-						opacity = strtod(numbuf, NULL);
-
-						if (opacity > 1) {
-							opacity = 1;
-						}
-						else if (opacity < 0) {
-							opacity = 0;
-						}
-
-						bl->font_color.d.comp.alpha = (guint8) (opacity * 255.0);
-					}
-					else if (klen == 10 &&
-							 g_ascii_strncasecmp(key, "visibility", 10) == 0) {
-						if (p - c >= 6 && rspamd_substring_search_caseless(c,
-								p - c,
-								"hidden", 6) != -1) {
-							bl->visible = FALSE;
-							msg_debug_html ("tag is not visible");
-						}
-					}
-				}
-
-				key = NULL;
-				klen = 0;
-				state = skip_spaces;
-				next_state = read_key;
-			}
-
-			p++;
-			break;
-
-		case skip_spaces:
-			if (p < end && !g_ascii_isspace (*p)) {
-				c = p;
-				state = next_state;
-			}
-			else {
-				p++;
-			}
-
-			break;
-		}
-	}
-}
-
 static auto
 html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 					   struct html_content *hc) -> void
 {
-	auto *bl = rspamd_mempool_alloc0_type (pool, struct html_block);
-	bl->tag = tag;
-	bl->visible = TRUE;
-	bl->font_size = (guint) -1;
-	bl->font_color.d.comp.alpha = 255;
+	std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; /* From html attributes */
 
 	for (const auto &param : tag->parameters) {
 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
-			html_process_color(param.value, &bl->font_color);
-			msg_debug_html ("tag %*s; got color: %xd",
-					(int) tag->name.size(), tag->name.data(),
-					bl->font_color.d.val);
+			maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
 		}
 
 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
-			html_process_color(param.value, &bl->background_color);
-			msg_debug_html ("tag %*s; got bgcolor: %xd",
-					(int) tag->name.size(), tag->name.data(),
-					bl->background_color.d.val);
-			if (tag->id == Tag_BODY) {
-				/* Set global background color */
-				memcpy(&hc->bgcolor, &bl->background_color,
-						sizeof(hc->bgcolor));
-			}
+			maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
 		}
 
 		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
-			html_process_style(pool, bl, hc, param.value);
-			msg_debug_html ("tag: %*s; got style: %*s",
-					(int) tag->name.size(), tag->name.data(),
-					(int) bl->style.len, bl->style.begin);
-		}
-
-		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
-			rspamd_ftok_t fstr;
-			fstr.begin = param.value.data();
-			fstr.len = param.value.size();
-			bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
-			msg_debug_html ("tag: %*s; got class: %s",
-					(int) tag->name.size(), tag->name.data(), bl->html_class);
+			tag->block = rspamd::css::parse_css_declaration(pool, param.value);
 		}
 	}
 
-	hc->blocks.push_back(bl);
-	tag->block = bl;
-}
-
-static auto
-html_propagate_style(struct html_content *hc,
-							struct html_tag *tag,
-							struct html_block *bl,
-							std::vector<struct html_block *> &blocks) -> void
-{
-	gboolean push_block = FALSE;
-
-	if (blocks.empty()) {
-		/* No blocks to propagate */
-		return;
-	}
-	/* Propagate from the parent if needed */
-	auto *bl_parent = blocks.back();
-
-	if (!bl->background_color.valid) {
-		/* Try to propagate background color from parent nodes */
-		if (bl_parent->background_color.valid) {
-			memcpy(&bl->background_color, &bl_parent->background_color,
-					sizeof(bl->background_color));
-		}
-	}
-	else {
-		push_block = TRUE;
+	if (!tag->block) { /* Still no block: fall back to the undefined one */
+		tag->block = html_block::undefined_html_block_pool(pool);
 	}
 
-	if (!bl->font_color.valid) {
-		/* Try to propagate background color from parent nodes */
-		if (bl_parent->font_color.valid) {
-			memcpy(&bl->font_color, &bl_parent->font_color,
-					sizeof(bl->font_color));
-		}
-	}
-	else {
-		push_block = TRUE;
-	}
-
-	/* Propagate font size */
-	if (bl->font_size == (guint) -1) {
-		if (bl_parent->font_size != (guint) -1) {
-			bl->font_size = bl_parent->font_size;
-		}
-	}
-	else {
-		push_block = TRUE;
+	if (maybe_fgcolor) { /* Attribute colors are applied once a block exists */
+		tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
 	}
 
-	/* Set bgcolor to the html bgcolor and font color to black as a last resort */
-	if (!bl->font_color.valid) {
-		/* Don't touch opacity as it can be set separately */
-		bl->font_color.d.comp.r = 0;
-		bl->font_color.d.comp.g = 0;
-		bl->font_color.d.comp.b = 0;
-		bl->font_color.valid = TRUE;
-	}
-	else {
-		push_block = TRUE;
-	}
-
-	if (!bl->background_color.valid) {
-		memcpy(&bl->background_color, &hc->bgcolor, sizeof(hc->bgcolor));
-	}
-	else {
-		push_block = TRUE;
-	}
-
-	if (bl->font_size == (guint) -1) {
-		bl->font_size = 16; /* Default for browsers */
-	}
-	else {
-		push_block = TRUE;
-	}
-
-	if (push_block && !(tag->flags & FL_CLOSED)) {
-		blocks.push_back(bl);
+	if (maybe_bgcolor) {
+		tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
 	}
 }
 
@@ -2186,7 +1612,8 @@ html_process_input(rspamd_mempool_t *pool,
 					html_process_link_tag(pool, cur_tag, hc, url_set,
 							part_urls);
 				}
-				else if (cur_tag->flags & FL_BLOCK) {
*** OUTPUT TRUNCATED, 75 LINES SKIPPED ***


More information about the Commits mailing list