commit 20b0002: [Project] Css: Start css selectors parsing logic
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Mar 16 20:49:07 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-03-16 15:58:34 +0000
URL: https://github.com/rspamd/rspamd/commit/20b0002b125ec1315ca558b1233d34fb616817ac
[Project] Css: Start css selectors parsing logic
---
src/libserver/css/css_parser.cxx | 3 -
src/libserver/css/css_parser.hxx | 1 -
src/libserver/css/css_selector.cxx | 157 ++++++++++++++++++++++++++++++++++++
src/libserver/css/css_selector.hxx | 29 +++++--
src/libserver/css/css_tokeniser.hxx | 8 ++
5 files changed, 189 insertions(+), 9 deletions(-)
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 019849122..915e04f12 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -83,9 +83,6 @@ auto css_consumed_block::token_type_str(void) const -> const char *
case parser_tag_type::css_component:
ret = "component";
break;
- case parser_tag_type::css_selector:
- ret = "selector";
- break;
case parser_tag_type::css_eof_block:
ret = "eof";
break;
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
index 034c135c6..f51960b71 100644
--- a/src/libserver/css/css_parser.hxx
+++ b/src/libserver/css/css_parser.hxx
@@ -46,7 +46,6 @@ public:
css_function,
css_function_arg,
css_component,
- css_selector,
css_eof_block,
};
using consumed_block_ptr = std::unique_ptr<css_consumed_block>;
diff --git a/src/libserver/css/css_selector.cxx b/src/libserver/css/css_selector.cxx
index 2f1f29aca..a1ed0e234 100644
--- a/src/libserver/css/css_selector.cxx
+++ b/src/libserver/css/css_selector.cxx
@@ -15,6 +15,7 @@
*/
#include "css_selector.hxx"
+#include "fmt/core.h"
namespace rspamd::css {
@@ -23,9 +24,165 @@ auto process_selector_tokens(rspamd_mempool_t *pool,
-> selectors_vec
{
selectors_vec ret;
+	/*
+	 * Selector parsing is a small state machine driven by the blocks that
+	 * next_token_functor yields. It runs until a block that is not a
+	 * css_component is seen (treated as the end of input) or until an
+	 * unexpected token stops the parsing.
+	 */
+	bool can_continue = true;
+	enum class selector_process_state {
+		selector_parse_start = 0, /* nothing consumed for the current selector yet */
+		selector_expect_ident, /* '.' or '#' delimiter seen, an ident must follow */
+		selector_ident_consumed, /* a simple selector has been read completely */
+		selector_ignore_attribute, /* skipping tokens after an osqbrace token */
+		selector_ignore_function, /* skipping tokens after a semicolon token */
+		selector_ignore_combination /* skipping tokens of a selectors combination */
+	} state = selector_process_state::selector_parse_start;
+	/* Selector being built; moved into `ret` once it is complete */
+	std::unique_ptr<css_selector> cur_selector;
+
+
+	while (can_continue) {
+		const auto &next_tok = next_token_functor();
+
+		if (next_tok.tag == css_consumed_block::parser_tag_type::css_component) {
+			const auto &parser_tok = next_tok.get_token_or_empty();
+
+			if (state == selector_process_state::selector_parse_start) {
+				/*
+				 * At the beginning of the parsing we can expect either
+				 * delim or an ident, everything else is discarded for now
+				 */
+				msg_debug_css("start consume selector");
+
+				switch (parser_tok.type) {
+				case css_parser_token::token_type::delim_token: {
+					auto delim_c = parser_tok.get_delim();
+
+					if (delim_c == '.') {
+						cur_selector = std::make_unique<css_selector>(
+								css_selector::selector_type::SELECTOR_CLASS);
+						state = selector_process_state::selector_expect_ident;
+					}
+					else if (delim_c == '#') {
+						cur_selector = std::make_unique<css_selector>(
+								css_selector::selector_type::SELECTOR_ID);
+						state = selector_process_state::selector_expect_ident;
+					}
+					else if (delim_c == '*') {
+						/* The universal selector has no name to wait for */
+						cur_selector = std::make_unique<css_selector>(
+								css_selector::selector_type::SELECTOR_ALL);
+						state = selector_process_state::selector_ident_consumed;
+					}
+					/* Any other delimiter is skipped, the state is unchanged */
+					break;
+				}
+				case css_parser_token::token_type::ident_token:
+					cur_selector = std::make_unique<css_selector>(
+							css_selector::selector_type::SELECTOR_ELEMENT);
+					cur_selector->value = parser_tok.get_string_or_default("");
+					state = selector_process_state::selector_ident_consumed;
+					break;
+				case css_parser_token::token_type::hash_token:
+					cur_selector = std::make_unique<css_selector>(
+							css_selector::selector_type::SELECTOR_ID);
+					/*
+					 * NOTE(review): substr(1) drops the first character,
+					 * presumably the leading '#'; confirm that the tokeniser
+					 * keeps it and never yields an empty string here.
+					 */
+					cur_selector->value =
+							parser_tok.get_string_or_default("").substr(1);
+					state = selector_process_state::selector_ident_consumed;
+					break;
+				default:
+					/* token_type_str() is a plain C string: %s, not %*s */
+					msg_debug_css("cannot consume more of a selector, invalid parser token: %s; expected start",
+							next_tok.token_type_str());
+					can_continue = false;
+					break;
+				}
+			}
+			else if (state == selector_process_state::selector_expect_ident) {
+				/*
+				 * We got something like a selector start, so we expect
+				 * a plain ident
+				 */
+				if (parser_tok.type == css_parser_token::token_type::ident_token && cur_selector) {
+					cur_selector->value = parser_tok.get_string_or_default("");
+					state = selector_process_state::selector_ident_consumed;
+				}
+				else {
+					/* token_type_str() is a plain C string: %s, not %*s */
+					msg_debug_css("cannot consume more of a selector, invalid parser token: %s; expected ident",
+							next_tok.token_type_str());
+					can_continue = false;
+				}
+			}
+			else if (state == selector_process_state::selector_ident_consumed) {
+				if (parser_tok.type == css_parser_token::token_type::comma_token) {
+					/* Got full selector, attach it to the vector and go further */
+					msg_debug_css("attached selector: %s", cur_selector->debug_str().c_str());
+					ret.push_back(std::move(cur_selector));
+					state = selector_process_state::selector_parse_start;
+				}
+				else if (parser_tok.type == css_parser_token::token_type::semicolon_token) {
+					/*
+					 * TODO: implement adjustments
+					 * NOTE(review): confirm that a semicolon (and not a
+					 * colon) token is the intended trigger here.
+					 */
+					state = selector_process_state::selector_ignore_function;
+				}
+				else if (parser_tok.type == css_parser_token::token_type::osqbrace_token) {
+					/* TODO: implement attributes checks */
+					state = selector_process_state::selector_ignore_attribute;
+				}
+				else {
+					/* TODO: implement selectors combinations */
+					state = selector_process_state::selector_ignore_combination;
+				}
+			}
+			else {
+				/* Ignore state; ignore all till ',' token or eof token */
+				if (parser_tok.type == css_parser_token::token_type::comma_token) {
+					/* Got full selector, attach it to the vector and go further */
+					ret.push_back(std::move(cur_selector));
+					state = selector_process_state::selector_parse_start;
+				}
+				else {
+					auto debug_str = parser_tok.get_string_or_default("");
+					msg_debug_css("ignore token %*s", (int)debug_str.size(),
+							debug_str.data());
+				}
+			}
+		}
+		else {
+			/*
+			 * End of parsing: only a selector that has been read completely
+			 * is attached; one left in any other state is dropped.
+			 */
+			if (state == selector_process_state::selector_ident_consumed && cur_selector) {
+				msg_debug_css("attached selector: %s", cur_selector->debug_str().c_str());
+				ret.push_back(std::move(cur_selector));
+			}
+			can_continue = false;
+		}
+
+	}
return ret; /* copy elision */
}
+/*
+ * Builds a printable form of the selector for debug logs: "*" for the
+ * universal selector, otherwise an optional "#" (id) or "." (class) prefix
+ * followed by the stored value; a tag id is printed as "tag: <number>".
+ */
+auto
+css_selector::debug_str() const -> std::string
+{
+	/* The universal selector is printed on its own, its value is not used */
+	if (type == selector_type::SELECTOR_ALL) {
+		return "*";
+	}
+
+	std::string out;
+
+	switch (type) {
+	case selector_type::SELECTOR_ID:
+		out += "#";
+		break;
+	case selector_type::SELECTOR_CLASS:
+		out += ".";
+		break;
+	default:
+		/* Other selector types carry no prefix */
+		break;
+	}
+
+	std::visit([&out](const auto &held) -> void {
+		using held_t = std::decay_t<decltype(held)>;
+
+		if constexpr (std::is_same_v<held_t, tag_id_t>) {
+			out += fmt::format("tag: {}", static_cast<int>(held));
+		}
+		else {
+			out += held;
+		}
+	}, value);
+
+	return out;
+}
+
}
diff --git a/src/libserver/css/css_selector.hxx b/src/libserver/css/css_selector.hxx
index a701e20f6..5ed4d54b8 100644
--- a/src/libserver/css/css_selector.hxx
+++ b/src/libserver/css/css_selector.hxx
@@ -37,13 +37,27 @@ namespace rspamd::css {
*/
struct css_selector {
enum class selector_type {
- SELECTOR_ELEMENT, /* e.g. .tr, for this value we use tag_id_t */
- SELECTOR_CLASS, /* generic class */
- SELECTOR_ID /* e.g. #id */
+ SELECTOR_ELEMENT, /* e.g. tr, for this value we use tag_id_t */
+ SELECTOR_CLASS, /* generic class, e.g. .class */
+ SELECTOR_ID, /* e.g. #id */
+ SELECTOR_ALL /* * selector */
};
selector_type type;
- std::variant<tag_id_t, std::string> value;
+ std::variant<tag_id_t, std::string_view> value;
+
+ /* Conditions for the css selector */
+ /* Dependency on attributes */
+ /*
+  * One condition on an attribute: the attribute name plus, presumably, a
+  * comparison operator and the value to compare with. `op` and `value`
+  * default to empty strings.
+  * NOTE(review): all fields are non-owning views; confirm that the text
+  * they refer to outlives the selector.
+  */
+ struct css_attribute_condition {
+ std::string_view attribute;
+ std::string_view op = "";
+ std::string_view value = "";
+ };
+
+ /* General dependency chain */
+ using css_selector_ptr = std::unique_ptr<css_selector>;
+ using css_selector_dep = std::variant<css_attribute_condition, css_selector_ptr>;
+ std::vector<css_selector_dep> dependencies;
auto to_tag(void) const -> std::optional<tag_id_t> {
if (type == selector_type::SELECTOR_ELEMENT) {
@@ -54,12 +68,17 @@ struct css_selector {
auto to_string(void) const -> std::optional<const std::string_view> {
if (type == selector_type::SELECTOR_ELEMENT) {
- return std::string_view(std::get<std::string>(value));
+ return std::string_view(std::get<std::string_view>(value));
}
return std::nullopt;
};
+
+ /* Creates a selector of the given type; `value` is left default constructed */
+ explicit css_selector(selector_type t) : type(t) {}
+
+ /* Returns a printable representation of this selector for debug logging */
+ auto debug_str(void) const -> std::string;
};
+
using selectors_vec = std::vector<std::unique_ptr<css_selector>>;
/*
diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx
index 53ec4f2db..4a484ecd6 100644
--- a/src/libserver/css/css_tokeniser.hxx
+++ b/src/libserver/css/css_tokeniser.hxx
@@ -111,6 +111,14 @@ struct css_parser_token {
return def;
}
+	/*
+	 * Returns the stored character when the token holds one,
+	 * or (char)-1 when it holds anything else.
+	 */
+	auto get_delim() const -> char {
+		if (const auto *delim = std::get_if<char>(&value)) {
+			return *delim;
+		}
+
+		return static_cast<char>(-1);
+	}
+
auto get_number_or_default(double def) const -> double {
if (std::holds_alternative<double>(value)) {
auto dbl = std::get<double>(value);
More information about the Commits
mailing list