commit fcfaab4: [Project] Css: Rework tokens structure

Mon Jan 25 16:42:13 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-01-25 16:35:23 +0000
URL: https://github.com/rspamd/rspamd/commit/fcfaab40b8ea772ce9d72773930c329a6277da6d (HEAD -> master)

[Project] Css: Rework tokens structure

---
 src/libserver/css/css_parser.cxx    | 12 +++----
 src/libserver/css/css_tokeniser.cxx | 72 ++++++++++++++++++++++++++-----------
 src/libserver/css/css_tokeniser.hxx | 72 +++++++++++++++++++++++++------------
 3 files changed, 106 insertions(+), 50 deletions(-)

diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 207cfcb9d..68f03cdfa 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -94,16 +94,16 @@ bool css_parser::consume_input(const std::string_view &sv)
 	css_tokeniser css_tokeniser(pool, sv);
 
 	while (!eof) {
-		auto token_pair = css_tokeniser.next_token();
+		auto next_token = css_tokeniser.next_token();
 
 		/* Top level parser */
-		switch (token_pair.first) {
-		case css_parser_token::eof_token:
+		switch (next_token.type) {
+		case css_parser_token::token_type::eof_token:
 			eof = true;
 			break;
-		case css_parser_token::whitespace_token:
-		case css_parser_token::cdc_token:
-		case css_parser_token::cdo_token:
+		case css_parser_token::token_type::whitespace_token:
+		case css_parser_token::token_type::cdc_token:
+		case css_parser_token::token_type::cdo_token:
 			/* Ignore tokens */
 			break;
 		}
diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx
index 40f202b01..058f7504e 100644
--- a/src/libserver/css/css_tokeniser.cxx
+++ b/src/libserver/css/css_tokeniser.cxx
@@ -19,8 +19,46 @@
 
 namespace rspamd::css {
 
+/* Helpers to create tokens */
 
-auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string_view>
+/*
+ * This helper is intended to create tokens either with a tag and value
+ * or with just a tag. The primary template is only declared, not defined.
+ */
+template<css_parser_token::token_type T, typename ...Args>
+auto make_token(const Args&... args) -> css_parser_token; /* T is the token tag, args the optional value */
+
+template<> /* explicit specialisation: string_token with a std::string_view value */
+auto make_token<css_parser_token::token_type::string_token, std::string_view>(const std::string_view &s)
+        -> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::string_token, s}; /* the view itself is stored, characters are not copied */
+}
+
+template<> /* explicit specialisation: whitespace_token with a std::string_view value */
+auto make_token<css_parser_token::token_type::whitespace_token, std::string_view>(const std::string_view &s)
+        -> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::whitespace_token, s}; /* the view itself is stored, characters are not copied */
+}
+
+template<> /* explicit specialisation: delim_token with a single char value */
+auto make_token<css_parser_token::token_type::delim_token, char>(const char &c)
+        -> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::delim_token, c}; /* c becomes the token value */
+}
+
+/*
+ * Generic tokens with no value (non-terminals): the value is an empty placeholder
+ */
+template<css_parser_token::token_type T>
+auto make_token(void) -> css_parser_token
+{
+	return css_parser_token{T, css_parser_token_placeholder()}; /* only the tag T carries information */
+}
+
+auto css_tokeniser::next_token(void) -> struct css_parser_token
 {
 	/* Helpers */
 
@@ -29,7 +67,7 @@ auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string
 	 * offset is set to the next character after a comment (or eof)
 	 * Nothing is returned
 	 */
-	auto consume_comment = [this] () {
+	auto consume_comment = [this]() {
 		auto i = offset;
 		auto nested = 0;
 
@@ -64,7 +102,7 @@ auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string
 	 * is set one character after the string. Css unescaping is done automatically
 	 * Accepts a quote char to find end of string
 	 */
-	auto consume_string = [this] (auto quote_char) -> auto {
+	auto consume_string = [this](auto quote_char) -> auto {
 		auto i = offset;
 		bool need_unescape = false;
 
@@ -122,8 +160,7 @@ auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string
 			}
 			else {
 				offset = i + 1;
-				return std::make_pair (css_parser_token::delim_token,
-						std::string_view (&input[offset - 1], 1));
+				return make_token<css_parser_token::token_type::delim_token>(c);
 			}
 			break;
 		case ' ':
@@ -136,48 +173,41 @@ auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string
 				c = input[++i];
 			} while (i < input.size () && g_ascii_isspace (c));
 
-			auto ret = std::make_pair (css_parser_token::whitespace_token,
-					std::string_view (&input[offset], i - offset));
+			auto ret = make_token<css_parser_token::token_type::whitespace_token>(
+					std::string_view(&input[offset], i - offset));
 			offset = i;
 			return ret;
 		}
 		case '"':
 		case '\'':
 			offset = i + 1;
-			return std::make_pair (css_parser_token::string_token,
-					consume_string (c));
+			return make_token<css_parser_token::token_type::string_token>(consume_string(c));
 		case '(':
 			offset = i + 1;
-			return std::make_pair (css_parser_token::obrace_token,
-					std::string_view (&input[offset - 1], 1));
+			return make_token<css_parser_token::token_type::obrace_token>();
 		case ')':
 			offset = i + 1;
-			return std::make_pair (css_parser_token::ebrace_token,
-					std::string_view (&input[offset - 1], 1));
+			return make_token<css_parser_token::token_type::ebrace_token>();
 		case ',':
-			offset = i + 1;
-			return std::make_pair (css_parser_token::comma_token,
-					std::string_view (&input[offset - 1], 1));
+			return make_token<css_parser_token::token_type::comma_token>();
 		case '<':
 			/* Maybe an xml like comment */
 			if (i + 3 < input.size () && input[i + 1] == '!'
 				&& input[i + 2] == '-' && input[i + 3] == '-') {
 				offset += 3;
 
-				return std::make_pair (css_parser_token::cdo_token,
-						std::string_view (&input[offset - 3], 3));
+				return make_token<css_parser_token::token_type::cdo_token>();
 			}
 			else {
 				offset = i + 1;
-				return std::make_pair (css_parser_token::delim_token,
-						std::string_view (&input[offset - 1], 1));
+				return make_token<css_parser_token::token_type::delim_token>(c);
 			}
 			break;
 		}
 
 	}
 
-	return std::make_pair (css_parser_token::eof_token, std::string_view ());
+	return make_token<css_parser_token::token_type::eof_token>();
 }
 
 }
\ No newline at end of file
diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx
index 4c6824389..cff5877c2 100644
--- a/src/libserver/css/css_tokeniser.hxx
+++ b/src/libserver/css/css_tokeniser.hxx
@@ -21,41 +21,67 @@
 
 #include <string_view>
 #include <utility>
+#include <variant>
 #include "mem_pool.h"
 
 namespace rspamd::css {
 
-enum class css_parser_token {
-	whitespace_token,
-	ident_token,
-	function_token,
-	at_keyword_token,
-	hash_token,
-	string_token,
-	number_token,
-	url_token,
-	dimension_token,
-	percentage_token,
-	cdo_token, /* xml open comment */
-	cdc_token, /* xml close comment */
-	delim_token,
-	obrace_token, /* ( */
-	ebrace_token, /* ) */
-	osqbrace_token, /* [ */
-	esqbrace_token, /* ] */
-	comma_token,
-	colon_token,
-	semicolon_token,
-	eof_token,
+struct css_parser_token_placeholder {}; /* Empty type used as the value of tokens that carry no data */
+
+struct css_parser_token { /* One token: a type tag, a typed value and a flags byte */
+	enum class token_type : std::uint8_t { /* kind of the token; underlying type is std::uint8_t */
+		whitespace_token,
+		ident_token,
+		function_token,
+		at_keyword_token,
+		hash_token,
+		string_token,
+		number_token,
+		url_token,
+		dimension_token,
+		percentage_token,
+		cdo_token, /* xml open comment */
+		cdc_token, /* xml close comment */
+		delim_token,
+		obrace_token, /* ( */
+		ebrace_token, /* ) */
+		osqbrace_token, /* [ */
+		esqbrace_token, /* ] */
+		comma_token,
+		colon_token,
+		semicolon_token,
+		eof_token,
+	};
+
+	static const std::uint8_t default_flags = 0; /* no flags set */
+	static const std::uint8_t flag_bad_string = (1u << 0u); /* bit 0; NOTE(review): presumably marks a malformed string token - confirm */
+	using value_type = std::variant<std::string_view, /* For strings and string like tokens */
+			char, /* For delimiters (might need to move to unicode point) */
+			double, /* For numeric stuff */
+			css_parser_token_placeholder /* For tokens that carry no value */
+	>;
+
+	/* Typed storage: which alternative of value is active depends on how the token was built */
+	value_type value;
+	token_type type;
+	std::uint8_t flags = default_flags;
+
+	css_parser_token() = delete; /* a token cannot exist without a type and a value */
+	explicit css_parser_token(token_type type, const value_type &value) :
+			value(value), type(type) {} /* flags is left at default_flags */
 };
 
+/* Ensure that parser tokens are simple enough: they must remain trivially copyable */
+static_assert(std::is_trivially_copyable_v<css_parser_token>); /* checked at compile time */
+
 class css_tokeniser {
 public:
 	css_tokeniser() = delete;
 	css_tokeniser(rspamd_mempool_t *pool, const std::string_view &sv) :
 			input(sv), offset(0), pool(pool) {}
 
-	auto next_token(void) -> std::pair<css_parser_token, std::string_view>;
+	auto next_token(void) -> struct css_parser_token;
+	auto get_offset(void) const { return offset; }
 private:
 	std::string_view input;
 	std::size_t offset;