commit 50e3e98: [Project] Css: rework tokeniser

Mon Jan 25 16:42:12 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-01-25 14:35:41 +0000
URL: https://github.com/rspamd/rspamd/commit/50e3e98a741cf2946ec0b3e4cf396d53cc9e4ae4

[Project] Css: rework tokeniser

---
 src/libserver/css/CMakeLists.txt                   |   2 +
 src/libserver/css/css.hxx                          |   2 +
 src/libserver/css/css_parser.cxx                   | 160 ++----------------
 src/libserver/css/css_parser.hxx                   |   2 +
 src/libserver/css/css_property.hxx                 |   1 +
 src/libserver/css/css_rule.hxx                     |   1 +
 src/libserver/css/css_selector.hxx                 |   1 +
 src/libserver/css/css_style.hxx                    |   2 +
 src/libserver/css/css_tokeniser.cxx                | 183 +++++++++++++++++++++
 src/libserver/css/css_tokeniser.hxx                |  68 ++++++++
 src/libserver/css/{css_parser.cxx => css_util.cxx} | 118 +------------
 src/libserver/css/{css_parser.hxx => css_util.hxx} |  25 +--
 src/libserver/css/css_value.hxx                    |   2 +
 src/libserver/css/parse_error.hxx                  |   1 +
 14 files changed, 304 insertions(+), 264 deletions(-)

diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt
index c8f7921b1..84ed2cf8b 100644
--- a/src/libserver/css/CMakeLists.txt
+++ b/src/libserver/css/CMakeLists.txt
@@ -14,6 +14,8 @@ SET(LIBCSSSRC    "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx"
+                 "${CMAKE_CURRENT_SOURCE_DIR}/css_tokeniser.cxx"
+                 "${CMAKE_CURRENT_SOURCE_DIR}/css_util.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx"
                  "${RAGEL_ragel_css_selector_parser_OUTPUTS}"
                  "${RAGEL_ragel_css_rule_parser_OUTPUTS}"
diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx
index 8f2550d7b..1a511dcfd 100644
--- a/src/libserver/css/css.hxx
+++ b/src/libserver/css/css.hxx
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
+
 #ifndef RSPAMD_CSS_HXX
 #define RSPAMD_CSS_HXX
 
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 4134b933c..207cfcb9d 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -15,6 +15,7 @@
  */
 
 #include "css_parser.hxx"
+#include "css_tokeniser.hxx"
 #include <unicode/utf8.h>
 
 
@@ -36,9 +37,6 @@ public:
 		return tl::make_unexpected (error);
 	}
 
-	/* Public for unit tests */
-	std::string_view unescape_css(const std::string_view &sv);
-
 private:
 	enum class parser_state {
 		initial_state,
@@ -49,6 +47,7 @@ private:
 	};
 	parser_state state = parser_state::initial_state;
 	std::unique_ptr<css_style_sheet> style_object;
+
 	css_parse_error error;
 	rspamd_mempool_t *pool;
 
@@ -88,136 +87,26 @@ css_parser::need_unescape(const std::string_view &sv)
 	return false;
 }
 
-/*
- * Unescape css escapes
- * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
- * \0020AC : must be 6 digits long, no space needed (but can be included)
- */
-std::string_view
-css_parser::unescape_css(const std::string_view &sv)
-{
-	auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
-	auto *d = nspace;
-	auto nleft = sv.length ();
 
-	enum {
-		normal = 0,
-		quoted,
-		escape,
-		skip_spaces,
-	} state = normal;
-
-	char quote_char, prev_c = 0;
-	auto escape_offset = 0, i = 0;
-
-#define MAYBE_CONSUME_CHAR(c) do { \
-    if (c == '"' || c == '\'') { \
-        state = quoted; \
-        quote_char = c; \
-        nleft--; \
-        *d++ = c; \
-    } \
-    else if (c == '\\') { \
-        escape_offset = i; \
-        state = escape; \
-    } \
-    else { \
-        state = normal; \
-        nleft--; \
-        *d++ = c; \
-    } \
-} while (0)
-
-	for (const auto c : sv) {
-		if (nleft == 0) {
-			msg_err_css("cannot unescape css: truncated buffer of size %d",
-					(int)sv.length());
-			break;
-		}
-		switch (state) {
-		case normal:
-			MAYBE_CONSUME_CHAR(c);
-			break;
-		case quoted:
-			if (c == quote_char) {
-				if (prev_c != '\\') {
-					state = normal;
-				}
-			}
-			prev_c = c;
-			nleft --;
-			*d++ = c;
-			break;
-		case escape:
-			if (!g_ascii_isxdigit(c)) {
-				if (i > escape_offset + 1) {
-					/* Try to decode an escape */
-					const auto *escape_start = &sv[escape_offset + 1];
-					unsigned long val;
+bool css_parser::consume_input(const std::string_view &sv)
+{
+	bool eof = false;
+	css_tokeniser css_tokeniser(pool, sv);
 
-					if (!rspamd_xstrtoul(escape_start, i - escape_offset - 1, &val)) {
-						msg_debug_css("invalid broken escape found at pos %d",
-								escape_offset);
-					}
-					else {
-						if (val < 0x80) {
-							/* Trivial case: ascii character */
-							*d++ = (unsigned char)val;
-							nleft --;
-						}
-						else {
-							UChar32 uc = val;
-							auto off = 0;
-							UTF8_APPEND_CHAR_SAFE((uint8_t *) d, off,
-									sv.length (), uc);
-							d += off;
-							nleft -= off;
-						}
-					}
-				}
-				else {
-					/* Empty escape, ignore it */
-					msg_debug_css("invalid empty escape found at pos %d",
-							escape_offset);
-				}
+	while (!eof) {
+		auto token_pair = css_tokeniser.next_token();
 
-				if (nleft <= 0) {
-					msg_err_css("cannot unescape css: truncated buffer of size %d",
-							(int)sv.length());
-				}
-				else {
-					/* Escape is done, advance forward */
-					if (g_ascii_isspace (c)) {
-						state = skip_spaces;
-					}
-					else {
-						MAYBE_CONSUME_CHAR(c);
-					}
-				}
-			}
+		/* Top level parser */
+		switch (token_pair.first) {
+		case css_parser_token::eof_token:
+			eof = true;
 			break;
-		case skip_spaces:
-			if (!g_ascii_isspace(c)) {
-				MAYBE_CONSUME_CHAR(c);
-			}
-			/* Ignore spaces */
+		case css_parser_token::whitespace_token:
+		case css_parser_token::cdc_token:
+		case css_parser_token::cdo_token:
+			/* Ignore tokens */
 			break;
 		}
-
-		i ++;
-	}
-
-	return std::string_view{nspace, sv.size() - nleft};
-};
-
-bool css_parser::consume_input(const std::string_view &sv)
-{
-	auto our_sv = sv;
-
-	if (need_unescape(sv)) {
-		our_sv = unescape_css(sv);
-		msg_debug_css("unescaped css: input size %d, unescaped size %d",
-				(int)sv.size(), (int)our_sv.size());
 	}
 
 	return true;
@@ -237,20 +126,3 @@ auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
 }
 
 }
-
-/* C API */
-const gchar *rspamd_css_unescape (rspamd_mempool_t *pool,
-							const guchar *begin,
-							gsize len,
-							gsize *outlen)
-{
-	rspamd::css::css_parser parser(pool);
-	auto sv = parser.unescape_css({(const char*)begin, len});
-	const auto *v = sv.begin();
-
-	if (outlen) {
-		*outlen = sv.size();
-	}
-
-	return v;
-}
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
index 8d1468a01..e009fef70 100644
--- a/src/libserver/css/css_parser.hxx
+++ b/src/libserver/css/css_parser.hxx
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #ifndef RSPAMD_CSS_PARSER_HXX
 #define RSPAMD_CSS_PARSER_HXX
 
diff --git a/src/libserver/css/css_property.hxx b/src/libserver/css/css_property.hxx
index 06a345ad4..2e668c640 100644
--- a/src/libserver/css/css_property.hxx
+++ b/src/libserver/css/css_property.hxx
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
 
 #ifndef RSPAMD_CSS_PROPERTY_HXX
 #define RSPAMD_CSS_PROPERTY_HXX
diff --git a/src/libserver/css/css_rule.hxx b/src/libserver/css/css_rule.hxx
index 878322f78..6afaa8bc6 100644
--- a/src/libserver/css/css_rule.hxx
+++ b/src/libserver/css/css_rule.hxx
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
 
 #ifndef RSPAMD_CSS_RULE_HXX
 #define RSPAMD_CSS_RULE_HXX
diff --git a/src/libserver/css/css_selector.hxx b/src/libserver/css/css_selector.hxx
index 4c12b3b41..c9f3046d5 100644
--- a/src/libserver/css/css_selector.hxx
+++ b/src/libserver/css/css_selector.hxx
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#pragma once
 
 #ifndef RSPAMD_CSS_SELECTOR_HXX
 #define RSPAMD_CSS_SELECTOR_HXX
diff --git a/src/libserver/css/css_style.hxx b/src/libserver/css/css_style.hxx
index f3d1e664d..2a97f8f0e 100644
--- a/src/libserver/css/css_style.hxx
+++ b/src/libserver/css/css_style.hxx
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #ifndef RSPAMD_CSS_STYLE_HXX
 #define RSPAMD_CSS_STYLE_HXX
 
diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx
new file mode 100644
index 000000000..40f202b01
--- /dev/null
+++ b/src/libserver/css/css_tokeniser.cxx
@@ -0,0 +1,183 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_tokeniser.hxx"
+#include "css_util.hxx"
+
+namespace rspamd::css {
+
+
+/*
+ * Extracts the next token from the input, starting at `offset`, and moves
+ * `offset` past the consumed text.
+ *
+ * Returns a pair of the token type and a view of the token text. For string
+ * tokens the view holds the content between the quotes (passed through
+ * unescape_css when it contains a backslash escape). Comments are consumed
+ * silently. Characters that no case below recognises are skipped without
+ * producing a token. When nothing is left, eof_token with an empty view
+ * is returned.
+ */
+auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string_view>
+{
+	/* Helpers */
+
+	/*
+	 * This lambda eats a comment, handling nested comments;
+	 * it must be called with offset pointing just after the opening marker.
+	 * Offset is set to the next character after the comment, or to the end
+	 * of the input if the comment is never closed.
+	 * Nothing is returned
+	 */
+	auto consume_comment = [this] () {
+		auto i = offset;
+		auto nested = 0;
+
+		/* We handle nested comments just because they can exist... */
+		while (i + 1 < input.size ()) {
+			const auto c = input[i];
+
+			if (c == '*' && input[i + 1] == '/') {
+				if (nested == 0) {
+					offset = i + 2;
+					return;
+				}
+
+				nested--;
+				i += 2;
+			}
+			else if (c == '/' && input[i + 1] == '*') {
+				nested++;
+				i += 2;
+			}
+			else {
+				i++;
+			}
+		}
+
+		/*
+		 * Unterminated comment: it swallows the rest of the input, so the
+		 * trailing character must not be tokenised again
+		 */
+		offset = input.size ();
+	};
+
+	/*
+	 * Consume quoted string, returns a string_view over the string content,
+	 * offset is set one character after the closing quote (or to the end of
+	 * the input if the quote is missing). Css unescaping is done automatically.
+	 * Accepts a quote char to find end of string
+	 */
+	auto consume_string = [this] (char quote_char) -> std::string_view {
+		const auto start = offset;
+		auto i = start;
+		bool need_unescape = false;
+		bool terminated = false;
+
+		while (i < input.size ()) {
+			const auto c = input[i];
+
+			if (c == '\\') {
+				if (i + 1 < input.size ()) {
+					need_unescape = true;
+					/* The escaped character (e.g. a quote) never ends the string */
+					i++;
+				}
+				/* \ at the end -> ignore */
+			}
+			else if (c == quote_char) {
+				/* End of string */
+				terminated = true;
+				break;
+			}
+			/* A raw newline should be an error, but we ignore it for now */
+
+			i++;
+		}
+
+		/*
+		 * substr is used instead of &input[start] as start may be equal to
+		 * input.size () when the opening quote is the last character.
+		 * EOF with no quote character is considered fine
+		 */
+		auto res = input.substr (start, i - start);
+
+		if (need_unescape) {
+			res = rspamd::css::unescape_css (pool, res);
+		}
+
+		offset = terminated ? i + 1 : i;
+
+		return res;
+	};
+
+	/* Main tokenisation loop */
+	auto i = offset;
+
+	while (i < input.size ()) {
+		const auto c = input[i];
+
+		switch (c) {
+		case '/':
+			if (i + 1 < input.size () && input[i + 1] == '*') {
+				offset = i + 2;
+				consume_comment ();
+				/*
+				 * Go on right after the comment; a loop is used instead of
+				 * recursion so a long run of comments cannot exhaust the stack
+				 */
+				i = offset;
+				continue;
+			}
+
+			offset = i + 1;
+			return std::make_pair (css_parser_token::delim_token,
+					input.substr (i, 1));
+		case ' ':
+		case '\t':
+		case '\n':
+		case '\r':
+		case '\v': {
+			/* Consume as much space as we can, checking bounds before reading */
+			const auto start = i;
+
+			do {
+				++i;
+			} while (i < input.size () && g_ascii_isspace (input[i]));
+
+			offset = i;
+			return std::make_pair (css_parser_token::whitespace_token,
+					input.substr (start, i - start));
+		}
+		case '"':
+		case '\'':
+			offset = i + 1;
+			return std::make_pair (css_parser_token::string_token,
+					consume_string (c));
+		case '(':
+			offset = i + 1;
+			return std::make_pair (css_parser_token::obrace_token,
+					input.substr (i, 1));
+		case ')':
+			offset = i + 1;
+			return std::make_pair (css_parser_token::ebrace_token,
+					input.substr (i, 1));
+		case ',':
+			offset = i + 1;
+			return std::make_pair (css_parser_token::comma_token,
+					input.substr (i, 1));
+		case '<':
+			/* Maybe an xml like comment: the marker is 4 characters long */
+			if (i + 3 < input.size () && input[i + 1] == '!'
+				&& input[i + 2] == '-' && input[i + 3] == '-') {
+				offset = i + 4;
+
+				return std::make_pair (css_parser_token::cdo_token,
+						input.substr (i, 4));
+			}
+
+			offset = i + 1;
+			return std::make_pair (css_parser_token::delim_token,
+					input.substr (i, 1));
+		default:
+			/* Not recognised (yet): skip the character */
+			break;
+		}
+
+		++i;
+	}
+
+	/* Everything left was skipped, so later calls need not rescan it */
+	offset = input.size ();
+
+	return std::make_pair (css_parser_token::eof_token, std::string_view ());
+}
+
+}
\ No newline at end of file
diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx
new file mode 100644
index 000000000..4c6824389
--- /dev/null
+++ b/src/libserver/css/css_tokeniser.hxx
@@ -0,0 +1,68 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_CSS_TOKENISER_HXX
+#define RSPAMD_CSS_TOKENISER_HXX
+
+#include <string_view>
+#include <utility>
+#include "mem_pool.h"
+
+namespace rspamd::css {
+
+/*
+ * Kinds of tokens produced by css_tokeniser::next_token.
+ * NOTE(review): the names appear to follow the token names of the CSS Syntax
+ * specification - confirm before relying on that mapping.
+ */
+enum class css_parser_token {
+	whitespace_token,
+	ident_token,
+	function_token,
+	at_keyword_token,
+	hash_token,
+	string_token,
+	number_token,
+	url_token,
+	dimension_token,
+	percentage_token,
+	cdo_token, /* xml open comment */
+	cdc_token, /* xml close comment */
+	delim_token,
+	obrace_token, /* ( */
+	ebrace_token, /* ) */
+	osqbrace_token, /* [ */
+	esqbrace_token, /* ] */
+	comma_token,
+	colon_token,
+	semicolon_token,
+	eof_token, /* no more input */
+};
+
+/*
+ * Splits a css text into tokens, one token per next_token call.
+ * The input is held as a string_view, so the tokeniser does not own the text.
+ */
+class css_tokeniser {
+public:
+	/* Both a memory pool and an input are required */
+	css_tokeniser() = delete;
+	css_tokeniser(rspamd_mempool_t *pool, const std::string_view &sv) :
+			input{sv}, pool{pool} {}
+
+	/* Returns the type and the text of the next token, advancing the position */
+	auto next_token() -> std::pair<css_parser_token, std::string_view>;
+private:
+	std::string_view input; /* text being tokenised */
+	std::size_t offset = 0; /* position of the next unread character */
+	rspamd_mempool_t *pool; /* handed to unescape_css for string tokens */
+};
+
+}
+
+
+#endif //RSPAMD_CSS_TOKENISER_HXX
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_util.cxx
similarity index 56%
copy from src/libserver/css/css_parser.cxx
copy to src/libserver/css/css_util.cxx
index 4134b933c..7388e49fd 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_util.cxx
@@ -14,87 +14,14 @@
  * limitations under the License.
  */
 
-#include "css_parser.hxx"
+#include "css_util.hxx"
+#include "css.hxx"
 #include <unicode/utf8.h>
 
-
 namespace rspamd::css {
 
-class css_parser {
-public:
-	css_parser(void) = delete; /* Require mempool to be set for logging */
-	explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
-
-	bool consume_input(const std::string_view &sv);
-
-	auto get_object_maybe(void) -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> {
-		if (state == parser_state::parse_done) {
-			state = parser_state::initial_state;
-			return std::move (style_object);
-		}
-
-		return tl::make_unexpected (error);
-	}
-
-	/* Public for unit tests */
-	std::string_view unescape_css(const std::string_view &sv);
-
-private:
-	enum class parser_state {
-		initial_state,
-		skip_spaces,
-		parse_selector,
-		ignore_selector, /* e.g. media or namespace */
-		parse_done,
-	};
-	parser_state state = parser_state::initial_state;
-	std::unique_ptr<css_style_sheet> style_object;
-	css_parse_error error;
-	rspamd_mempool_t *pool;
-
-	/* Helper parser methods */
-	bool need_unescape(const std::string_view &sv);
-};
-
-/*
- * Find if we need to unescape css
- */
-bool
-css_parser::need_unescape(const std::string_view &sv)
-{
-	bool in_quote = false;
-	char quote_char, prev_c = 0;
-
-	for (const auto c : sv) {
-		if (!in_quote) {
-			if (c == '"' || c == '\'') {
-				in_quote = true;
-				quote_char = c;
-			}
-			else if (c == '\\') {
-				return true;
-			}
-		}
-		else {
-			if (c == quote_char) {
-				if (prev_c != '\\') {
-					in_quote = false;
-				}
-			}
-			prev_c = c;
-		}
-	}
-
-	return false;
-}
-
-/*
- * Unescape css escapes
- * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
- * \0020AC : must be 6 digits long, no space needed (but can be included)
- */
-std::string_view
-css_parser::unescape_css(const std::string_view &sv)
+std::string_view unescape_css(rspamd_mempool_t *pool,
+							  const std::string_view &sv)
 {
 	auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
 	auto *d = nspace;
@@ -208,44 +135,17 @@ css_parser::unescape_css(const std::string_view &sv)
 	}
 
 	return std::string_view{nspace, sv.size() - nleft};
-};
-
-bool css_parser::consume_input(const std::string_view &sv)
-{
-	auto our_sv = sv;
-
-	if (need_unescape(sv)) {
-		our_sv = unescape_css(sv);
-		msg_debug_css("unescaped css: input size %d, unescaped size %d",
-				(int)sv.size(), (int)our_sv.size());
-	}
-
-	return true;
-}
-
-/*
- * Wrapper for the parser
- */
-auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
-	tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>
*** OUTPUT TRUNCATED, 100 LINES SKIPPED ***