commit 20b0002: [Project] Css: Start css selectors parsing logic

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Mar 16 20:49:07 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-03-16 15:58:34 +0000
URL: https://github.com/rspamd/rspamd/commit/20b0002b125ec1315ca558b1233d34fb616817ac

[Project] Css: Start css selectors parsing logic

---
 src/libserver/css/css_parser.cxx    |   3 -
 src/libserver/css/css_parser.hxx    |   1 -
 src/libserver/css/css_selector.cxx  | 157 ++++++++++++++++++++++++++++++++++++
 src/libserver/css/css_selector.hxx  |  29 +++++--
 src/libserver/css/css_tokeniser.hxx |   8 ++
 5 files changed, 189 insertions(+), 9 deletions(-)

diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 019849122..915e04f12 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -83,9 +83,6 @@ auto css_consumed_block::token_type_str(void) const -> const char *
 	case parser_tag_type::css_component:
 		ret = "component";
 		break;
-	case parser_tag_type::css_selector:
-		ret = "selector";
-		break;
 	case parser_tag_type::css_eof_block:
 		ret = "eof";
 		break;
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
index 034c135c6..f51960b71 100644
--- a/src/libserver/css/css_parser.hxx
+++ b/src/libserver/css/css_parser.hxx
@@ -46,7 +46,6 @@ public:
 		css_function,
 		css_function_arg,
 		css_component,
-		css_selector,
 		css_eof_block,
 	};
 	using consumed_block_ptr = std::unique_ptr<css_consumed_block>;
diff --git a/src/libserver/css/css_selector.cxx b/src/libserver/css/css_selector.cxx
index 2f1f29aca..a1ed0e234 100644
--- a/src/libserver/css/css_selector.cxx
+++ b/src/libserver/css/css_selector.cxx
@@ -15,6 +15,7 @@
  */
 
 #include "css_selector.hxx"
+#include "fmt/core.h"
 
 namespace rspamd::css {
 
@@ -23,9 +24,165 @@ auto process_selector_tokens(rspamd_mempool_t *pool,
 	-> selectors_vec
 {
 	selectors_vec ret;
+	bool can_continue = true;
+	enum class selector_process_state {
+		selector_parse_start = 0, /* nothing consumed yet for the current selector */
+		selector_expect_ident, /* got '.' or '#' delim, an ident must follow */
+		selector_ident_consumed, /* selector has its value (or is '*') */
+		selector_ignore_attribute, /* ignore states: skip tokens until ',' */
+		selector_ignore_function,
+		selector_ignore_combination
+	} state = selector_process_state::selector_parse_start;
+	std::unique_ptr<css_selector> cur_selector;
+
+	/* Pull blocks from the functor until a non-component block or an unexpected token is met */
+	while (can_continue) {
+		const auto &next_tok = next_token_functor();
+
+		if (next_tok.tag == css_consumed_block::parser_tag_type::css_component) {
+			const auto &parser_tok = next_tok.get_token_or_empty();
+
+			if (state == selector_process_state::selector_parse_start) {
+				/*
+				 * At the beginning of the parsing we can expect either
+				 * delim or an ident, everything else is discarded for now
+				 */
+				msg_debug_css("start consume selector");
+
+				switch (parser_tok.type) {
+				case css_parser_token::token_type::delim_token: {
+					auto delim_c = parser_tok.get_delim();
+
+					if (delim_c == '.') {
+						cur_selector = std::make_unique<css_selector>(
+								css_selector::selector_type::SELECTOR_CLASS);
+						state = selector_process_state::selector_expect_ident;
+					}
+					else if (delim_c == '#') {
+						cur_selector = std::make_unique<css_selector>(
+								css_selector::selector_type::SELECTOR_ID);
+						state = selector_process_state::selector_expect_ident;
+					}
+					else if (delim_c == '*') {
+						cur_selector = std::make_unique<css_selector>(
+								css_selector::selector_type::SELECTOR_ALL);
+						state = selector_process_state::selector_ident_consumed;
+					}
+					break;
+				}
+				case css_parser_token::token_type::ident_token:
+					cur_selector = std::make_unique<css_selector>(
+							css_selector::selector_type::SELECTOR_ELEMENT);
+					cur_selector->value = parser_tok.get_string_or_default("");
+					state = selector_process_state::selector_ident_consumed;
+					break;
+				case css_parser_token::token_type::hash_token:
+					cur_selector = std::make_unique<css_selector>(
+							css_selector::selector_type::SELECTOR_ID);
+					cur_selector->value = /* NOTE(review): presumably drops a leading '#' - confirm; substr(1) throws on an empty view */
+							parser_tok.get_string_or_default("").substr(1);
+					state = selector_process_state::selector_ident_consumed;
+					break;
+				default:
+					msg_debug_css("cannot consume more of a selector, invalid parser token: %s; expected start",
+							next_tok.token_type_str());
+					can_continue = false;
+					break;
+				}
+			}
+			else if (state == selector_process_state::selector_expect_ident) {
+				/*
+				 * We got something like a selector start, so we expect
+				 * a plain ident
+				 */
+				if (parser_tok.type == css_parser_token::token_type::ident_token && cur_selector) {
+					cur_selector->value = parser_tok.get_string_or_default("");
+					state = selector_process_state::selector_ident_consumed;
+				}
+				else {
+					msg_debug_css("cannot consume more of a selector, invalid parser token: %s; expected ident",
+							next_tok.token_type_str());
+					can_continue = false;
+				}
+			}
+			else if (state == selector_process_state::selector_ident_consumed) {
+				if (parser_tok.type == css_parser_token::token_type::comma_token) {
+					/* Got full selector, attach it to the vector and go further */
+					msg_debug_css("attached selector: %s", cur_selector->debug_str().c_str());
+					ret.push_back(std::move(cur_selector));
+					state = selector_process_state::selector_parse_start;
+				}
+				else if (parser_tok.type == css_parser_token::token_type::semicolon_token) {
+					/* TODO: implement adjustments; NOTE(review): confirm ';' (not ':') is the intended trigger */
+					state = selector_process_state::selector_ignore_function;
+				}
+				else if (parser_tok.type == css_parser_token::token_type::osqbrace_token) {
+					/* TODO: implement attributes checks */
+					state = selector_process_state::selector_ignore_attribute;
+				}
+				else {
+					/* TODO: implement selectors combinations */
+					state = selector_process_state::selector_ignore_combination;
+				}
+			}
+			else {
+				/* Ignore state; ignore all till ',' token or eof token */
+				if (parser_tok.type == css_parser_token::token_type::comma_token) {
+					/* Got full selector, attach it to the vector and go further */
+					ret.push_back(std::move(cur_selector));
+					state = selector_process_state::selector_parse_start;
+				}
+				else {
+					auto debug_str = parser_tok.get_string_or_default("");
+					msg_debug_css("ignore token %*s", (int)debug_str.size(),
+							debug_str.data());
+				}
+			}
+		}
+		else {
+			/* End of parsing: only a selector in the ident_consumed state is attached here */
+			if (state == selector_process_state::selector_ident_consumed && cur_selector) {
+				msg_debug_css("attached selector: %s", cur_selector->debug_str().c_str());
+				ret.push_back(std::move(cur_selector));
+			}
+			can_continue = false;
+		}
+
+	}
 
 	return ret; /* copy elision */
 }
 
+/* Render the selector for debug logs: "*", "#value", ".value" or the plain value */
+auto css_selector::debug_str() const -> std::string
+{
+	/* The SELECTOR_ALL case is rendered on its own, its value is never visited */
+	if (type == selector_type::SELECTOR_ALL) {
+		return std::string{"*"};
+	}
+
+	std::string prefix;
+	switch (type) {
+	case selector_type::SELECTOR_ID:
+		prefix = "#";
+		break;
+	case selector_type::SELECTOR_CLASS:
+		prefix = ".";
+		break;
+	default:
+		break;
+	}
+
+	/* tag_id_t values are printed as "tag: N", string views are appended verbatim */
+	return std::visit([&prefix](auto payload) -> std::string {
+		if constexpr (std::is_same_v<std::decay_t<decltype(payload)>, tag_id_t>) {
+			return prefix + fmt::format("tag: {}", static_cast<int>(payload));
+		}
+		else {
+			return prefix + std::string{payload};
+		}
+	}, value);
+}
+
 }
 
diff --git a/src/libserver/css/css_selector.hxx b/src/libserver/css/css_selector.hxx
index a701e20f6..5ed4d54b8 100644
--- a/src/libserver/css/css_selector.hxx
+++ b/src/libserver/css/css_selector.hxx
@@ -37,13 +37,27 @@ namespace rspamd::css {
  */
 struct css_selector {
 	enum class selector_type {
-		SELECTOR_ELEMENT, /* e.g. .tr, for this value we use tag_id_t */
-		SELECTOR_CLASS, /* generic class */
-		SELECTOR_ID /* e.g. #id */
+		SELECTOR_ELEMENT, /* e.g. tr, for this value we use tag_id_t */
+		SELECTOR_CLASS, /* generic class, e.g. .class */
+		SELECTOR_ID, /* e.g. #id */
+		SELECTOR_ALL /* * selector */
 	};
 
 	selector_type type;
-	std::variant<tag_id_t, std::string> value;
+	std::variant<tag_id_t, std::string_view> value;
+
+	/* Conditions for the css selector */
+	/* Dependency on attributes */
+	struct css_attribute_condition {
+		std::string_view attribute;
+		std::string_view op = "";
+		std::string_view value = "";
+	};
+
+	/* General dependency chain */
+	using css_selector_ptr = std::unique_ptr<css_selector>;
+	using css_selector_dep = std::variant<css_attribute_condition, css_selector_ptr>;
+	std::vector<css_selector_dep> dependencies;
 
 	 auto to_tag(void) const -> std::optional<tag_id_t> {
 		if (type == selector_type::SELECTOR_ELEMENT) {
@@ -54,12 +68,17 @@ struct css_selector {
 
 	auto to_string(void) const -> std::optional<const std::string_view> {
 		if (type == selector_type::SELECTOR_ELEMENT) {
-			return std::string_view(std::get<std::string>(value));
+			return std::string_view(std::get<std::string_view>(value));
 		}
 		return std::nullopt;
 	};
+
+	explicit css_selector(selector_type t) : type(t) {}
+
+	auto debug_str(void) const -> std::string;
 };
 
+
 using selectors_vec = std::vector<std::unique_ptr<css_selector>>;
 
 /*
diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx
index 53ec4f2db..4a484ecd6 100644
--- a/src/libserver/css/css_tokeniser.hxx
+++ b/src/libserver/css/css_tokeniser.hxx
@@ -111,6 +111,14 @@ struct css_parser_token {
 		return def;
 	}
 
+	/* Delimiter character stored in this token; (char)-1 when no char is held */
+	auto get_delim() const -> char {
+		if (const auto *delim = std::get_if<char>(&value)) {
+			return *delim;
+		}
+		return static_cast<char>(-1);
+	}
+
 	auto get_number_or_default(double def) const -> double {
 		if (std::holds_alternative<double>(value)) {
 			auto dbl = std::get<double>(value);


More information about the Commits mailing list