commit 427f887: [Project] Add some methods for css parser

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Jan 22 16:00:30 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-01-21 15:45:21 +0000
URL: https://github.com/rspamd/rspamd/commit/427f8879360595ff48b77400b6b02b5a6968c4d1

[Project] Add some methods for css parser

---
 src/libserver/css/CMakeLists.txt              |   1 +
 src/libserver/css/css.cxx                     |   2 +
 src/libserver/css/css.h                       |   5 +-
 src/libserver/css/css.hxx                     |  12 ++
 src/libserver/css/css_parser.cxx              | 238 ++++++++++++++++++++++++++
 src/libserver/css/{css.cxx => css_parser.hxx} |  27 ++-
 src/libserver/css/parse_error.hxx             |   3 +-
 7 files changed, 270 insertions(+), 18 deletions(-)

diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt
index f5d5affdb..c8f7921b1 100644
--- a/src/libserver/css/CMakeLists.txt
+++ b/src/libserver/css/CMakeLists.txt
@@ -14,6 +14,7 @@ SET(LIBCSSSRC    "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx"
+                 "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx"
                  "${RAGEL_ragel_css_selector_parser_OUTPUTS}"
                  "${RAGEL_ragel_css_rule_parser_OUTPUTS}"
                  PARENT_SCOPE)
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 68ebfeefa..bd148cecd 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -29,6 +29,8 @@ rspamd_css_parse_style (const guchar *begin, gsize len, GError **err)
 
 namespace rspamd::css {
 
+INIT_LOG_MODULE_PUBLIC(css);
+
 class css_style_sheet::impl {
 
 };
diff --git a/src/libserver/css/css.h b/src/libserver/css/css.h
index a87f4424d..169bcf58c 100644
--- a/src/libserver/css/css.h
+++ b/src/libserver/css/css.h
@@ -18,13 +18,16 @@
 #define RSPAMD_CSS_H
 
 #include "config.h"
+#include "mem_pool.h"
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 typedef void * rspamd_css;
 
-rspamd_css rspamd_css_parse_style (const guchar *begin, gsize len, GError **err);
+rspamd_css rspamd_css_parse_style (rspamd_mempool_t *pool,
+								   const guchar *begin,
+								   gsize len, GError **err);
 #ifdef  __cplusplus
 }
 #endif
diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx
index 78e0d0f73..d258b35c9 100644
--- a/src/libserver/css/css.hxx
+++ b/src/libserver/css/css.hxx
@@ -18,9 +18,21 @@
 
 #include <string>
 #include <memory>
+#include "logger.h"
 
 namespace rspamd::css {
 
+extern unsigned int rspamd_css_log_id; /* log module id referenced by msg_debug_css below */
+
+#define msg_debug_css(...)  rspamd_conditional_debug_fast (NULL, NULL, \
+        rspamd_css_log_id, "css", pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__) /* debug message tagged "css"; expects a variable named pool to be visible at the call site */
+#define msg_err_css(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
+        "css", pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__) /* critical-level message tagged "css"; also expects a variable named pool at the call site */
+
 class css_style_sheet {
 public:
 	css_style_sheet();
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
new file mode 100644
index 000000000..9f2023e50
--- /dev/null
+++ b/src/libserver/css/css_parser.cxx
@@ -0,0 +1,238 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_parser.hxx"
+#include <unicode/utf8.h>
+
+
+namespace rspamd::css {
+
+/*
+ * CSS parser state machine. Every instance is bound to a memory pool, which
+ * supplies the log tag and the storage for unescaped input.
+ */
+class css_parser {
+public:
+	/* No default construction: the memory pool is required for logging */
+	css_parser() = delete;
+	explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
+
+	/* Feeds a piece of css text into the parser */
+	bool consume_input(const std::string_view &sv);
+
+	/*
+	 * Gives away the parsed style sheet once parsing is done and rewinds the
+	 * parser to its initial state; in any other state the stored error is
+	 * returned instead.
+	 */
+	auto get_object_maybe(void) -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> {
+		if (state != parser_state::parse_done) {
+			return tl::make_unexpected (error);
+		}
+
+		state = parser_state::initial_state;
+
+		return std::move (style_object);
+	}
+
+private:
+	/* States of the top level parser */
+	enum class parser_state {
+		initial_state,
+		skip_spaces,
+		parse_selector,
+		ignore_selector, /* e.g. media or namespace */
+		parse_done,
+	};
+	parser_state state = parser_state::initial_state;
+	/* Result that is handed over by get_object_maybe */
+	std::unique_ptr<css_style_sheet> style_object;
+	/* Error reported while the result is not available */
+	css_parse_error error;
+	/* Pool used by the logging macros and by unescape_css */
+	rspamd_mempool_t *pool;
+
+	/* Helper parser methods */
+
+	/* Tells whether the input has a backslash outside of quoted strings */
+	bool need_unescape(const std::string_view &sv);
+
+	/* Produces a copy of the input with css escapes decoded */
+	std::string_view unescape_css(const std::string_view &sv);
+};
+
+/*
+ * Find if we need to unescape css
+ *
+ * Returns true as soon as a backslash is met outside of a quoted string.
+ * Inside a quoted string a backslash escapes exactly the next character, so
+ * an escaped backslash followed by the closing quote (e.g. "a\\") still
+ * terminates the string.
+ */
+bool
+css_parser::need_unescape(const std::string_view &sv)
+{
+	bool in_quote = false;
+	/* True when the previous character inside a quote was an unescaped backslash */
+	bool escaped = false;
+	char quote_char = 0;
+
+	for (const auto c : sv) {
+		if (!in_quote) {
+			if (c == '"' || c == '\'') {
+				in_quote = true;
+				escaped = false;
+				quote_char = c;
+			}
+			else if (c == '\\') {
+				return true;
+			}
+		}
+		else if (escaped) {
+			/* This character is escaped whatever it is, including a backslash */
+			escaped = false;
+		}
+		else if (c == '\\') {
+			escaped = true;
+		}
+		else if (c == quote_char) {
+			in_quote = false;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Unescape css escapes
+ * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
+ * \0020AC : must be 6 digits long, no space needed (but can be included)
+ *
+ * Quoted strings are copied verbatim. Whitespace that follows an escape is
+ * skipped. Escapes that are empty, cannot be parsed or do not denote a valid
+ * code point are dropped with a debug message.
+ *
+ * The result points to a buffer of sv.length () bytes allocated from the
+ * parser's memory pool; the returned view never exceeds that size.
+ */
+std::string_view
+css_parser::unescape_css(const std::string_view &sv)
+{
+	const std::size_t capacity = sv.length ();
+
+	if (capacity == 0) {
+		/* Nothing to decode, avoid a zero sized allocation */
+		return sv;
+	}
+
+	auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, capacity));
+	auto *d = nspace;
+	auto nleft = capacity;
+
+	enum {
+		normal = 0,
+		quoted,
+		escape,
+		skip_spaces,
+	} state = normal;
+
+	/* A css escape has at most that many hex digits */
+	constexpr std::size_t max_escape_digits = 6;
+
+	char quote_char = 0;
+	/* True when the previous character of a quoted string was an unescaped backslash */
+	bool quoted_escape = false;
+	/* escape_offset is the position of the backslash that has started the current escape */
+	std::size_t escape_offset = 0, i = 0;
+
+	/* Handles a character that is not a part of a quoted string or an escape */
+	auto maybe_consume_char = [&](char c) {
+		if (c == '"' || c == '\'') {
+			state = quoted;
+			quote_char = c;
+			quoted_escape = false;
+			nleft--;
+			*d++ = c;
+		}
+		else if (c == '\\') {
+			escape_offset = i;
+			state = escape;
+		}
+		else {
+			state = normal;
+			nleft--;
+			*d++ = c;
+		}
+	};
+
+	/* Decodes hex digits in [escape_offset + 1, end) and appends the result as utf8 */
+	auto flush_escape = [&](std::size_t end) {
+		if (end <= escape_offset + 1) {
+			/* Empty escape, ignore it */
+			msg_debug_css("invalid empty escape found at pos %d",
+					(int)escape_offset);
+			return;
+		}
+
+		const auto *escape_start = sv.data () + escape_offset + 1;
+		unsigned long val;
+
+		if (!rspamd_xstrtoul (escape_start, end - escape_offset - 1, &val)) {
+			msg_debug_css("invalid broken escape found at pos %d",
+					(int)escape_offset);
+			return;
+		}
+
+		if (val < 0x80) {
+			/* Trivial case: ascii character */
+			*d++ = (char)val;
+			nleft --;
+			return;
+		}
+
+		if (val > 0x10FFFF) {
+			msg_debug_css("invalid code point in escape found at pos %d",
+					(int)escape_offset);
+			return;
+		}
+
+		/* The offset is relative to the start of the buffer, not to d */
+		int32_t off = (int32_t)(d - nspace);
+		UBool is_error = false;
+
+		U8_APPEND(reinterpret_cast<uint8_t *>(nspace), off,
+				(int32_t)capacity, (UChar32)val, is_error);
+
+		if (is_error) {
+			msg_debug_css("invalid code point in escape found at pos %d",
+					(int)escape_offset);
+		}
+
+		d = nspace + off;
+		nleft = capacity - (std::size_t)off;
+	};
+
+	for (const auto c : sv) {
+		if (nleft == 0) {
+			msg_err_css("cannot unescape css: truncated buffer of size %d",
+					(int)sv.length());
+			break;
+		}
+		switch (state) {
+		case normal:
+			maybe_consume_char (c);
+			break;
+		case quoted:
+			if (quoted_escape) {
+				/* Escaped character, it cannot close the string */
+				quoted_escape = false;
+			}
+			else if (c == '\\') {
+				quoted_escape = true;
+			}
+			else if (c == quote_char) {
+				state = normal;
+			}
+			nleft --;
+			*d++ = c;
+			break;
+		case escape:
+			if (!g_ascii_isxdigit (c) ||
+					i - escape_offset - 1 >= max_escape_digits) {
+				/* Escape is terminated by this character */
+				flush_escape (i);
+
+				if (nleft == 0) {
+					msg_err_css("cannot unescape css: truncated buffer of size %d",
+							(int)sv.length());
+				}
+				else {
+					/* Escape is done, advance forward */
+					if (g_ascii_isspace (c)) {
+						state = skip_spaces;
+					}
+					else {
+						maybe_consume_char (c);
+					}
+				}
+			}
+			break;
+		case skip_spaces:
+			if (!g_ascii_isspace (c)) {
+				maybe_consume_char (c);
+			}
+			/* Ignore spaces */
+			break;
+		}
+
+		i ++;
+	}
+
+	if (state == escape && nleft > 0) {
+		/* Input has ended inside an escape, do not lose it */
+		flush_escape (i);
+	}
+
+	return std::string_view{nspace, capacity - nleft};
+}
+
+/*
+ * Accepts css text. If need_unescape reports escapes, an unescaped copy is
+ * made first and its size is logged. Always returns true for now.
+ */
+bool css_parser::consume_input(const std::string_view &sv)
+{
+	const bool has_escapes = need_unescape(sv);
+	auto our_sv = has_escapes ? unescape_css(sv) : sv;
+
+	if (has_escapes) {
+		msg_debug_css("unescaped css: input size %d, unescaped size %d",
+				(int)sv.size(), (int)our_sv.size());
+	}
+
+	return true;
+}
+
+/*
+ * Wrapper for the parser: creates a parser bound to the pool, feeds the whole
+ * input to it and returns either the resulting style sheet or the parser error
+ */
+auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
+	tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>
+{
+	css_parser one_shot_parser{pool};
+
+	one_shot_parser.consume_input(st);
+	auto parse_result = one_shot_parser.get_object_maybe();
+
+	return parse_result;
+}
+
+}
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css_parser.hxx
similarity index 64%
copy from src/libserver/css/css.cxx
copy to src/libserver/css/css_parser.hxx
index 68ebfeefa..8d1468a01 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css_parser.hxx
@@ -14,26 +14,21 @@
  * limitations under the License.
  */
 
-#include "css.h"
-#include "css.hxx"
-#include "css_style.hxx"
-
-rspamd_css
-rspamd_css_parse_style (const guchar *begin, gsize len, GError **err)
-{
-	rspamd::css::css_style_sheet *style = nullptr;
-
+#ifndef RSPAMD_CSS_PARSER_HXX
+#define RSPAMD_CSS_PARSER_HXX
 
-	return reinterpret_cast<rspamd_css>(style);
-}
+#include "css.hxx"
+#include "parse_error.hxx"
+#include "contrib/expected/expected.hpp"
+#include "logger.h"
 
 namespace rspamd::css {
 
-class css_style_sheet::impl {
+/* No log module is defined here: the css one is set up in css.cxx and declared in css.hxx */
 
-};
+auto parse_css (rspamd_mempool_t *pool, const std::string_view &st) ->
+		tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>;
 
-css_style_sheet::css_style_sheet () : pimpl(new impl) {}
-css_style_sheet::~css_style_sheet () {}
+}
 
-}
\ No newline at end of file
+#endif //RSPAMD_CSS_PARSER_HXX
diff --git a/src/libserver/css/parse_error.hxx b/src/libserver/css/parse_error.hxx
index 60b229181..12ad697eb 100644
--- a/src/libserver/css/parse_error.hxx
+++ b/src/libserver/css/parse_error.hxx
@@ -34,13 +34,14 @@ enum class css_parse_error_type {
 };
 
 struct css_parse_error {
-	css_parse_error_type type;
+	css_parse_error_type type = css_parse_error_type::PARSE_ERROR_UNKNOWN_ERROR;
 	std::optional<std::string> description;
 
 	explicit css_parse_error (css_parse_error_type type, const std::string &description) :
 		type(type), description(description) {}
 	explicit css_parse_error (css_parse_error_type type) :
 			type(type) {}
+	css_parse_error() = default;
 };
 
 }


More information about the Commits mailing list