commit 4658a09: [Project] Css: Declarations parsing logic skeleton

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Feb 18 16:56:27 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-02-18 16:18:08 +0000
URL: https://github.com/rspamd/rspamd/commit/4658a093bf5cfc6c1961e6974772bdead4dda2f4

[Project] Css: Declarations parsing logic skeleton

---
 CMakeLists.txt                      |   1 +
 src/libserver/css/css_parser.cxx    | 254 ++++++++++++------------------------
 src/libserver/css/css_parser.hxx    | 118 +++++++++++++++++
 src/libserver/css/css_property.cxx  |  36 ++++-
 src/libserver/css/css_property.hxx  |   8 +-
 src/libserver/css/css_rule.cxx      |  93 ++++++++++++-
 src/libserver/css/css_rule.hxx      |   6 +-
 src/libserver/css/css_selector.cxx  |   2 +-
 src/libserver/css/css_selector.hxx  |   5 +-
 src/libserver/css/css_tokeniser.cxx |   3 +
 src/libserver/css/css_tokeniser.hxx |   9 +-
 src/libserver/css/css_value.cxx     |   4 +-
 src/libserver/css/css_value.hxx     |   4 +-
 13 files changed, 356 insertions(+), 187 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 698550f07..a218e28fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,7 @@ INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/"
 		"${CMAKE_SOURCE_DIR}/contrib/aho-corasick"
 		"${CMAKE_SOURCE_DIR}/contrib/lc-btrie"
 		"${CMAKE_SOURCE_DIR}/contrib/lua-lpeg"
+		"${CMAKE_SOURCE_DIR}/contrib/frozen/include"
 		"${CMAKE_BINARY_DIR}/src" #Stored in the binary dir
 		"${CMAKE_BINARY_DIR}/src/libcryptobox")
 
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 1a9231700..e4a8159f1 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -24,181 +24,97 @@
 
 namespace rspamd::css {
 
-struct css_consumed_block;
-/*
- * Represents a consumed token by a parser
- */
-struct css_consumed_block {
-	enum class parser_tag_type : std::uint8_t  {
-		css_top_block,
-		css_qualified_rule,
-		css_at_rule,
-		css_simple_block,
-		css_function,
-		css_function_arg,
-		css_component,
-		css_selector,
-	};
-
-	using consumed_block_ptr = std::unique_ptr<css_consumed_block>;
-
-	parser_tag_type tag;
-	std::variant<std::monostate,
-		std::vector<consumed_block_ptr>,
-		css_parser_token> content;
-
-	css_consumed_block() = delete;
-
-	css_consumed_block(parser_tag_type tag) : tag(tag) {
-		if (tag == parser_tag_type::css_top_block ||
-			tag == parser_tag_type::css_qualified_rule ||
-			tag == parser_tag_type::css_simple_block) {
-			/* Pre-allocate content for known vector blocks */
-			std::vector<consumed_block_ptr> vec;
-			vec.reserve(4);
-			content = std::move(vec);
-		}
-	}
-	/* Construct a block from a single lexer token (for trivial blocks) */
-	explicit css_consumed_block(parser_tag_type tag, css_parser_token &&tok) :
-			tag(tag), content(std::move(tok)) {}
-
-	/* Attach a new block to the compound block, consuming block inside */
-	auto attach_block(consumed_block_ptr &&block) -> bool {
-		if (content.index() == 0) {
-			/* Switch from monostate */
-			content = std::vector<consumed_block_ptr>();
-		}
-		else if (content.index() == 2) {
-			/* A single component, cannot attach a block ! */
-			return false;
-		}
-
-		auto &value_vec = std::get<std::vector<consumed_block_ptr>>(content);
-		value_vec.push_back(std::move(block));
+const css_consumed_block css_parser_eof_block{};
 
-		return true;
+auto css_consumed_block::attach_block(consumed_block_ptr &&block) -> bool {
+	if (content.index() == 0) {
+		/* Switch from monostate */
+		content = std::vector<consumed_block_ptr>();
 	}
-
-	auto assign_token(css_parser_token &&tok) -> void
-	{
-		content = std::move(tok);
-	}
-
-	/* Empty blocks used to avoid type checks in loops */
-	const inline static std::vector<consumed_block_ptr> empty_block_vec{};
-
-	auto get_blocks_or_empty() const -> const std::vector<consumed_block_ptr>& {
-		if (content.index() == 1) {
-			return std::get<std::vector<consumed_block_ptr>>(content);
-		}
-
-		return empty_block_vec;
-	}
-
-	auto get_token_or_empty() const -> const css_parser_token& {
-		if (content.index() == 2) {
-			return std::get<css_parser_token>(content);
-		}
-
-		return css_parser_eof_token();
+	else if (content.index() == 2) {
+		/* A single component, cannot attach a block ! */
+		return false;
 	}
 
-	auto token_type_str(void) const -> const char *
-	{
-		const auto *ret = "";
+	auto &value_vec = std::get<std::vector<consumed_block_ptr>>(content);
+	value_vec.push_back(std::move(block));
 
-		switch(tag) {
-		case parser_tag_type::css_top_block:
-			ret = "top";
-			break;
-		case parser_tag_type::css_qualified_rule:
-			ret = "qualified rule";
-			break;
-		case parser_tag_type::css_at_rule:
-			ret = "at rule";
-			break;
-		case parser_tag_type::css_simple_block:
-			ret = "simple block";
-			break;
-		case parser_tag_type::css_function:
-			ret = "function";
-			break;
-		case parser_tag_type::css_function_arg:
-			ret = "function args";
-			break;
-		case parser_tag_type::css_component:
-			ret = "component";
-			break;
-		case parser_tag_type::css_selector:
-			ret = "selector";
-			break;
-		}
+	return true;
+}
 
-		return ret;
+auto css_consumed_block::token_type_str(void) const -> const char *
+{
+	const auto *ret = "";
+
+	switch(tag) {
+	case parser_tag_type::css_top_block:
+		ret = "top";
+		break;
+	case parser_tag_type::css_qualified_rule:
+		ret = "qualified rule";
+		break;
+	case parser_tag_type::css_at_rule:
+		ret = "at rule";
+		break;
+	case parser_tag_type::css_simple_block:
+		ret = "simple block";
+		break;
+	case parser_tag_type::css_function:
+		ret = "function";
+		break;
+	case parser_tag_type::css_function_arg:
+		ret = "function args";
+		break;
+	case parser_tag_type::css_component:
+		ret = "component";
+		break;
+	case parser_tag_type::css_selector:
+		ret = "selector";
+		break;
+	case parser_tag_type::css_eof_block:
+		ret = "eof";
+		break;
 	}
 
-	auto size() const -> std::size_t {
-		auto ret = 0;
-
-		std::visit([&](auto& arg) {
-			using T = std::decay_t<decltype(arg)>;
-
-			if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) {
-				/* Array of blocks */
-				ret = arg.size();
-			}
-			else if constexpr (std::is_same_v<T, std::monostate>) {
-				/* Empty block */
-				ret = 0;
-			}
-			else {
-				/* Single element block */
-				ret = 1;
-			}
-		},
-		content);
+	return ret;
+}
 
-		return ret;
-	}
+auto css_consumed_block::debug_str(void) -> std::string {
+	std::string ret = std::string(R"("type": ")") + token_type_str() + "\"";
 
-	auto debug_str(void) -> std::string {
-		std::string ret = std::string("\"type\": \"") + token_type_str() + "\"";
+	ret += ", \"value\": ";
 
-		ret += ", \"value\": ";
+	std::visit([&](auto& arg) {
+				using T = std::decay_t<decltype(arg)>;
 
-		std::visit([&](auto& arg) {
-			using T = std::decay_t<decltype(arg)>;
+				if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) {
+					/* Array of blocks */
+					ret += "[";
+					for (const auto &block : arg) {
+						ret += "{";
+						ret += block->debug_str();
+						ret += "}, ";
+					}
 
-			if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) {
-				/* Array of blocks */
-				ret += "[";
-				for (const auto &block : arg) {
-					ret += "{";
-					ret += block->debug_str();
-					ret += "}, ";
+					if (*(--ret.end()) == ' ') {
+						ret.pop_back();
+						ret.pop_back(); /* Last ',' */
+					}
+					ret += "]";
 				}
-
-				if (*(--ret.end()) == ' ') {
-					ret.pop_back();
-					ret.pop_back(); /* Last ',' */
+				else if constexpr (std::is_same_v<T, std::monostate>) {
+					/* Empty block */
+					ret += R"("empty")";
 				}
-				ret += "]";
-			}
-			else if constexpr (std::is_same_v<T, std::monostate>) {
-				/* Empty block */
-				ret += "\"empty\"";
-			}
-			else {
-				/* Single element block */
-				ret += "\"" + arg.debug_token_str() + "\"";
-			}
-		},
-		content);
+				else {
+					/* Single element block */
+					ret += "\"" + arg.debug_token_str() + "\"";
+				}
+			},
+			content);
 
-		return ret;
-	}
-};
+	return ret;
+}
 
 class css_parser {
 public:
@@ -622,18 +538,18 @@ bool css_parser::consume_input(const std::string_view &sv)
 				auto selector_it = children.cbegin();
 
 				auto selector_token_functor = [&selector_it,&simple_block](void)
-						-> const css_parser_token & {
+						-> const css_consumed_block & {
 					for (;;) {
 						if (selector_it == simple_block) {
-							return css_parser_eof_token();
+							return css_parser_eof_block;
 						}
 
-						const auto &ret = (*selector_it)->get_token_or_empty();
+						const auto &ret = (*selector_it);
 
 						++selector_it;
 
-						if (ret.type != css_parser_token::token_type::eof_token) {
-							return ret;
+						if (ret->get_token_or_empty().type != css_parser_token::token_type::eof_token) {
+							return *ret;
 						}
 					}
 				};
@@ -643,18 +559,18 @@ bool css_parser::consume_input(const std::string_view &sv)
 				auto decls_it = (*simple_block)->get_blocks_or_empty().cbegin();
 				auto decls_end = (*simple_block)->get_blocks_or_empty().cend();
 				auto declaration_token_functor = [&decls_it,&decls_end](void)
-						-> const css_parser_token & {
+						-> const css_consumed_block & {
 					for (;;) {
 						if (decls_it == decls_end) {
-							return css_parser_eof_token();
+							return css_parser_eof_block;
 						}
 
-						const auto &ret = (*decls_it)->get_token_or_empty();
+						const auto &ret = (*decls_it);
 
 						++decls_it;
 
-						if (ret.type != css_parser_token::token_type::eof_token) {
-							return ret;
+						if (ret->get_token_or_empty().type != css_parser_token::token_type::eof_token) {
+							return *ret;
 						}
 					}
 				};
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
index 2f10f994e..de982525a 100644
--- a/src/libserver/css/css_parser.hxx
+++ b/src/libserver/css/css_parser.hxx
@@ -19,13 +19,131 @@
 #ifndef RSPAMD_CSS_PARSER_HXX
 #define RSPAMD_CSS_PARSER_HXX
 
+#include <variant>
+#include <vector>
+#include <memory>
+#include <string>
+
+#include "css_tokeniser.hxx"
 #include "css.hxx"
 #include "parse_error.hxx"
 #include "contrib/expected/expected.hpp"
 #include "logger.h"
 
+
 namespace rspamd::css {
 
+/*
+ * Represents a consumed token by a parser
+ */
+class css_consumed_block {
+public:
+	enum class parser_tag_type : std::uint8_t  {
+		css_top_block,
+		css_qualified_rule,
+		css_at_rule,
+		css_simple_block,
+		css_function,
+		css_function_arg,
+		css_component,
+		css_selector,
+		css_eof_block,
+	};
+	using consumed_block_ptr = std::unique_ptr<css_consumed_block>;
+
+	css_consumed_block() : tag(parser_tag_type::css_eof_block) {}
+	css_consumed_block(parser_tag_type tag) : tag(tag) {
+		if (tag == parser_tag_type::css_top_block ||
+			tag == parser_tag_type::css_qualified_rule ||
+			tag == parser_tag_type::css_simple_block) {
+			/* Pre-allocate content for known vector blocks */
+			std::vector<consumed_block_ptr> vec;
+			vec.reserve(4);
+			content = std::move(vec);
+		}
+	}
+	/* Construct a block from a single lexer token (for trivial blocks) */
+	explicit css_consumed_block(parser_tag_type tag, css_parser_token &&tok) :
+			tag(tag), content(std::move(tok)) {}
+
+	/* Attach a new block to the compound block, consuming block inside */
+	auto attach_block(consumed_block_ptr &&block) -> bool;
+
+	auto assign_token(css_parser_token &&tok) -> void {
+		content = std::move(tok);
+	}
+
+	/* Empty blocks used to avoid type checks in loops */
+	const inline static std::vector<consumed_block_ptr> empty_block_vec{};
+
+	auto is_blocks_vec() const -> bool {
+		return (content.index() == 1);
+	}
+
+	auto get_blocks_or_empty() const -> const std::vector<consumed_block_ptr>& {
+		if (is_blocks_vec()) {
+			return std::get<std::vector<consumed_block_ptr>>(content);
+		}
+
+		return empty_block_vec;
+	}
+
+	auto is_token() const -> bool {
+		return (content.index() == 2);
+	}
+
+	auto get_token_or_empty() const -> const css_parser_token& {
+		if (is_token()) {
+			return std::get<css_parser_token>(content);
+		}
+
+		return css_parser_eof_token();
+	}
+
+	auto size() const -> std::size_t {
+		auto ret = 0;
+
+		std::visit([&](auto& arg) {
+					using T = std::decay_t<decltype(arg)>;
+
+					if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) {
+						/* Array of blocks */
+						ret = arg.size();
+					}
+					else if constexpr (std::is_same_v<T, std::monostate>) {
+						/* Empty block */
+						ret = 0;
+					}
+					else {
+						/* Single element block */
+						ret = 1;
+					}
+				},
+				content);
+
+		return ret;
+	}
+
+	auto is_eof() -> bool {
+		return tag == parser_tag_type::css_eof_block;
+	}
+
+	/* Debug methods */
+	auto token_type_str(void) const -> const char *;
+	auto debug_str(void) -> std::string;
+
+public:
+	parser_tag_type tag;
+private:
+	std::variant<std::monostate,
+			std::vector<consumed_block_ptr>,
+			css_parser_token> content;
+};
+
+extern const css_consumed_block css_parser_eof_block;
+
+using blocks_gen_functor = std::function<const css_consumed_block &(void)>;
+
 auto parse_css (rspamd_mempool_t *pool, const std::string_view &st) ->
 		tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>;
 
diff --git a/src/libserver/css/css_property.cxx b/src/libserver/css/css_property.cxx
index 98543f75a..77927d724 100644
--- a/src/libserver/css/css_property.cxx
+++ b/src/libserver/css/css_property.cxx
@@ -15,12 +15,44 @@
  */
 
 #include "css_property.hxx"
-
+#include "frozen/unordered_map.h"
+#include "frozen/string.h"
 
 namespace rspamd::css {
 
-auto css_property::from_bytes (const char *input, size_t inlen) -> tl::expected<css_property,css_parse_error>
+constexpr const auto max_type = static_cast<int>(css_property_type::PROPERTY_NYI);
+constexpr frozen::unordered_map<frozen::string, css_property_type, max_type> type_map{
+		{"font", css_property_type::PROPERTY_FONT},
+		{"color", css_property_type::PROPERTY_COLOR},
+		{"bgcolor", css_property_type::PROPERTY_BGCOLOR},
+		{"background", css_property_type::PROPERTY_BACKGROUND},
+		{"height", css_property_type::PROPERTY_HEIGHT},
+		{"width", css_property_type::PROPERTY_WIDTH},
+		{"display", css_property_type::PROPERTY_DISPLAY},
+		{"visibility", css_property_type::PROPERTY_VISIBILITY},
+};
+
+auto token_string_to_property(const std::string_view &inp) -> css_property_type {
+
+	css_property_type ret = css_property_type::PROPERTY_NYI;
+
+	auto known_type = type_map.find(inp);
+
+	if (known_type != type_map.end()) {
+		ret = known_type->second;
+	}
+
+	return ret;
+}
+
+auto css_property::from_token(const css_parser_token &tok) -> tl::expected<css_property,css_parse_error>
 {
+	if (tok.type == css_parser_token::token_type::ident_token) {
+		auto sv = tok.get_string_or_default("");
+
+		return css_property{token_string_to_property(sv)};
+	}
+
 	return tl::unexpected{css_parse_error(css_parse_error_type::PARSE_ERROR_NYI)};
 }
 
diff --git a/src/libserver/css/css_property.hxx b/src/libserver/css/css_property.hxx
index 2e668c640..562e54894 100644
--- a/src/libserver/css/css_property.hxx
+++ b/src/libserver/css/css_property.hxx
@@ -19,6 +19,7 @@
 #define RSPAMD_CSS_PROPERTY_HXX
 
 #include <string>
+#include "css_tokeniser.hxx"
 #include "parse_error.hxx"
 #include "contrib/expected/expected.hpp"
 
@@ -29,7 +30,7 @@ namespace rspamd::css {
  * point of view
  */
 enum class css_property_type {
-	PROPERTY_FONT,
+	PROPERTY_FONT = 0,
 	PROPERTY_COLOR,
 	PROPERTY_BGCOLOR,
 	PROPERTY_BACKGROUND,
@@ -37,12 +38,13 @@ enum class css_property_type {
 	PROPERTY_WIDTH,
 	PROPERTY_DISPLAY,
 	PROPERTY_VISIBILITY,
+	PROPERTY_NYI,
 };
 
 struct css_property {
 	css_property_type type;
-	static tl::expected<css_property,css_parse_error> from_bytes (const char *input,
-																 size_t inlen);
+	static tl::expected<css_property,css_parse_error> from_token(
+			const css_parser_token &tok);
 };
 
 
diff --git a/src/libserver/css/css_rule.cxx b/src/libserver/css/css_rule.cxx
index 44148b01a..cb0d4abad 100644
--- a/src/libserver/css/css_rule.cxx
+++ b/src/libserver/css/css_rule.cxx
@@ -19,10 +19,101 @@
 namespace rspamd::css {
 
 auto process_declaration_tokens(rspamd_mempool_t *pool,
-								const tokeniser_gen_functor &next_token_functor)
+								const blocks_gen_functor &next_block_functor)
 	-> declarations_vec
 {
 	declarations_vec ret;
+	bool can_continue = true;
+	css_property cur_property{css_property_type::PROPERTY_NYI};
+	static const css_property bad_property{css_property_type::PROPERTY_NYI};
+	std::unique_ptr<css_rule> cur_rule;
+
+	enum {
+		parse_property,
+		parse_value,
+		ignore_value, /* For unknown properties */
+	} state = parse_property;
+
+	while (can_continue) {
+		const auto &next_tok = next_block_functor();
+
+		switch (next_tok.tag) {
+		case css_consumed_block::parser_tag_type::css_component:
+			if (state == parse_property) {
+				cur_property = css_property::from_token(next_tok.get_token_or_empty())
+						.value_or(bad_property);
+
+				if (cur_property.type == css_property_type::PROPERTY_NYI) {
+					state = ignore_value;
+					/* Ignore everything till ; */
+					continue;
+				}
+
+				/* We now expect colon block */
+				const auto &expect_colon_block = next_block_functor();
+
+				if (expect_colon_block.tag != css_consumed_block::parser_tag_type::css_component) {
+
+					state = ignore_value; /* Ignore up to the next rule */
+				}
+				else {
+					const auto &expect_colon_tok = expect_colon_block.get_token_or_empty();
+
+					if (expect_colon_tok.type != css_parser_token::token_type::colon_token) {
+						msg_debug_css("invalid rule, no colon after property");
+						state = ignore_value; /* Ignore up to the next rule */
+					}
+					else {
+						state = parse_value;
+						cur_rule = std::make_unique<css_rule>(cur_property);
+					}
+				}
+			}
+			else if (state == parse_value) {
+				/* Check semicolon */
+				if (next_tok.is_token()) {
+					const auto &parser_tok = next_tok.get_token_or_empty();
+
+					if (parser_tok.type == css_parser_token::token_type::semicolon_token) {
+						ret.push_back(std::move(cur_rule));
+						state = parse_property;
+						continue;
+					}
+				}
+
+				auto maybe_value = css_value::from_css_block(next_tok);
+
+				if (maybe_value) {
+					cur_rule->add_value(maybe_value.value());
+				}
+			}
+			else {
+				/* Ignore all till ; */
+				if (next_tok.is_token()) {
+					const auto &parser_tok = next_tok.get_token_or_empty();
+
+					if (parser_tok.type == css_parser_token::token_type::semicolon_token) {
+						state = parse_property;
+					}
+				}
+			}
+			break;
+		case css_consumed_block::parser_tag_type::css_function:
+		case css_consumed_block::parser_tag_type::css_function_arg:
+			if (state == parse_value) {
+				auto maybe_value = css_value::from_css_block(next_tok);
+
+				if (maybe_value) {
+					cur_rule->add_value(maybe_value.value());
+				}
+			}
+			break;
+		case css_consumed_block::parser_tag_type::css_eof_block:
+		default:
+			can_continue = false;
+			break;
+		}
+	}
 
 	return ret; /* copy elision */
 }
diff --git a/src/libserver/css/css_rule.hxx b/src/libserver/css/css_rule.hxx
index 725b6448b..929c5b263 100644
--- a/src/libserver/css/css_rule.hxx
+++ b/src/libserver/css/css_rule.hxx
@@ -20,7 +20,7 @@
 
 #include "css_value.hxx"
 #include "css_property.hxx"
-#include "css_tokeniser.hxx"
+#include "css_parser.hxx"
 #include <vector>
 #include <memory>
 
@@ -38,7 +38,7 @@ public:
 	css_rule(css_rule &&other) = default;
 	explicit css_rule(css_property &&prop, css_values_vec &&values) :
 		prop(prop), values(std::forward<css_values_vec>(values)) {}
-	explicit css_rule(css_property &&prop) : prop(prop), values{} {}
+	explicit css_rule(const css_property &prop) : prop(prop), values{} {}
 	/* Methods */
 	void add_value(std::unique_ptr<css_value> &&value) {
 		values.emplace_back(std::forward<std::unique_ptr<css_value>>(value));
@@ -53,7 +53,7 @@ public:
 using declarations_vec = std::vector<std::unique_ptr<css_rule>>;
 
 auto process_declaration_tokens(rspamd_mempool_t *pool,
*** OUTPUT TRUNCATED, 120 LINES SKIPPED ***


More information about the Commits mailing list