commit 427f887: [Project] Add some methods for css parser
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Jan 22 16:00:30 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-01-21 15:45:21 +0000
URL: https://github.com/rspamd/rspamd/commit/427f8879360595ff48b77400b6b02b5a6968c4d1
[Project] Add some methods for css parser
---
src/libserver/css/CMakeLists.txt | 1 +
src/libserver/css/css.cxx | 2 +
src/libserver/css/css.h | 5 +-
src/libserver/css/css.hxx | 12 ++
src/libserver/css/css_parser.cxx | 238 ++++++++++++++++++++++++++
src/libserver/css/{css.cxx => css_parser.hxx} | 27 ++-
src/libserver/css/parse_error.hxx | 3 +-
7 files changed, 270 insertions(+), 18 deletions(-)
diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt
index f5d5affdb..c8f7921b1 100644
--- a/src/libserver/css/CMakeLists.txt
+++ b/src/libserver/css/CMakeLists.txt
@@ -14,6 +14,7 @@ SET(LIBCSSSRC "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx"
+ "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx"
"${RAGEL_ragel_css_selector_parser_OUTPUTS}"
"${RAGEL_ragel_css_rule_parser_OUTPUTS}"
PARENT_SCOPE)
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 68ebfeefa..bd148cecd 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -29,6 +29,8 @@ rspamd_css_parse_style (const guchar *begin, gsize len, GError **err)
namespace rspamd::css {
+INIT_LOG_MODULE_PUBLIC(css);
+
class css_style_sheet::impl {
};
diff --git a/src/libserver/css/css.h b/src/libserver/css/css.h
index a87f4424d..169bcf58c 100644
--- a/src/libserver/css/css.h
+++ b/src/libserver/css/css.h
@@ -18,13 +18,16 @@
#define RSPAMD_CSS_H
#include "config.h"
+#include "mem_pool.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef void * rspamd_css;
-rspamd_css rspamd_css_parse_style (const guchar *begin, gsize len, GError **err);
+rspamd_css rspamd_css_parse_style (rspamd_mempool_t *pool,
+ const guchar *begin,
+ gsize len, GError **err);
#ifdef __cplusplus
}
#endif
diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx
index 78e0d0f73..d258b35c9 100644
--- a/src/libserver/css/css.hxx
+++ b/src/libserver/css/css.hxx
@@ -18,9 +18,21 @@
#include <string>
#include <memory>
+#include "logger.h"
namespace rspamd::css {
+extern unsigned int rspamd_css_log_id;
+
+#define msg_debug_css(...) rspamd_conditional_debug_fast (NULL, NULL, \
+ rspamd_css_log_id, "css", pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
+#define msg_err_css(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
+ "css", pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
+
class css_style_sheet {
public:
css_style_sheet();
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
new file mode 100644
index 000000000..9f2023e50
--- /dev/null
+++ b/src/libserver/css/css_parser.cxx
@@ -0,0 +1,238 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_parser.hxx"
+#include <unicode/utf8.h>
+
+
+namespace rspamd::css {
+
+/*
+ * CSS parser object. It cannot exist without a memory pool: the pool is
+ * what the logging macros and the unescape buffer allocation rely on.
+ */
+class css_parser {
+public:
+	/* Default construction is forbidden: a mempool is required for logging */
+	css_parser() = delete;
+	explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
+
+	/* Feeds a piece of css text to the parser */
+	bool consume_input(const std::string_view &sv);
+
+	/*
+	 * Gives away the parsed style sheet and rewinds the parser to its
+	 * initial state when parsing is done; otherwise returns the stored
+	 * parse error.
+	 */
+	auto get_object_maybe() -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> {
+		if (state != parser_state::parse_done) {
+			return tl::make_unexpected (error);
+		}
+
+		state = parser_state::initial_state;
+
+		return std::move (style_object);
+	}
+
+private:
+	enum class parser_state {
+		initial_state,
+		skip_spaces,
+		parse_selector,
+		ignore_selector, /* e.g. media or namespace */
+		parse_done,
+	};
+
+	/* Current position in the state machine */
+	parser_state state = parser_state::initial_state;
+	/* Result handed out by get_object_maybe() */
+	std::unique_ptr<css_style_sheet> style_object;
+	/* Error handed out by get_object_maybe() when parsing is not done */
+	css_parse_error error;
+	/* Memory pool used for logging and for temporary buffers */
+	rspamd_mempool_t *pool;
+
+	/* Helper parser methods */
+	bool need_unescape(const std::string_view &sv);
+
+	std::string_view unescape_css(const std::string_view &sv);
+};
+
+/*
+ * Tells whether the input has a backslash outside of a quoted string,
+ * i.e. whether unescape_css has anything to do.
+ * Quotes open on either " or ' and close on the same character unless the
+ * character seen just before it inside the quotes was a backslash.
+ */
+bool
+css_parser::need_unescape(const std::string_view &sv)
+{
+	char opening_quote = 0, last_quoted = 0;
+	bool inside_quotes = false;
+
+	for (auto it = sv.begin (); it != sv.end (); ++it) {
+		const char ch = *it;
+
+		if (inside_quotes) {
+			/* Only an unescaped matching quote terminates the string */
+			if (ch == opening_quote && last_quoted != '\\') {
+				inside_quotes = false;
+			}
+
+			last_quoted = ch;
+			continue;
+		}
+
+		if (ch == '"' || ch == '\'') {
+			opening_quote = ch;
+			inside_quotes = true;
+		}
+		else if (ch == '\\') {
+			/* The first escape outside of quotes is enough */
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Unescape css escapes
+ * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
+ * \0020AC : must be 6 digits long, no space needed (but can be included)
+ *
+ * Quoted strings are copied verbatim. The output is written to a buffer of
+ * sv.length () bytes allocated from the memory pool; the returned view
+ * points into that buffer and is never longer than the input.
+ */
+std::string_view
+css_parser::unescape_css(const std::string_view &sv)
+{
+	auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
+	auto *d = nspace;
+	auto nleft = sv.length ();
+
+	enum {
+		normal = 0,
+		quoted,
+		escape,
+		skip_spaces,
+	} state = normal;
+
+	char quote_char = 0, prev_c = 0;
+	int escape_offset = 0, i = 0;
+
+	/*
+	 * Handles a character seen outside of quotes and escapes: a quote opens
+	 * a quoted run, a backslash opens an escape (and emits nothing), any
+	 * other character is copied as is.
+	 */
+	const auto maybe_consume_char = [&](char ch) {
+		if (ch == '\\') {
+			escape_offset = i;
+			state = escape;
+			return;
+		}
+
+		if (ch == '"' || ch == '\'') {
+			state = quoted;
+			quote_char = ch;
+			prev_c = 0;
+		}
+		else {
+			state = normal;
+		}
+
+		nleft--;
+		*d++ = ch;
+	};
+
+	/*
+	 * Decodes the hex digits in sv[escape_offset + 1, end) and appends the
+	 * code point to the output. Requires nleft > 0.
+	 * Returns nullptr on success or the kind of failure ("empty" / "broken");
+	 * logging is left to the caller so that it is attributed to this function.
+	 */
+	const auto decode_escape = [&](int end) -> const char * {
+		if (end <= escape_offset + 1) {
+			/* Backslash that is not followed by any hex digit */
+			return "empty";
+		}
+
+		unsigned long val = 0;
+
+		if (!rspamd_xstrtoul (&sv[escape_offset + 1], end - escape_offset - 1, &val)) {
+			return "broken";
+		}
+
+		if (val < 0x80) {
+			/* Trivial case: ascii character */
+			*d++ = static_cast<char>(val);
+			nleft --;
+
+			return nullptr;
+		}
+
+		if (val > 0x10ffff) {
+			/* Not a Unicode code point */
+			return "broken";
+		}
+
+		/* The offset is relative to the buffer start, so that is the base */
+		auto *out = reinterpret_cast<uint8_t *>(nspace);
+		auto off = static_cast<int32_t>(d - nspace);
+		const auto capacity = static_cast<int32_t>(sv.length ());
+		const auto uc = static_cast<UChar32>(val);
+		UBool is_error = 0;
+
+		U8_APPEND(out, off, capacity, uc, is_error);
+
+		if (is_error) {
+			/* Surrogate code point or no room left: nothing has been written */
+			return "broken";
+		}
+
+		d = nspace + off;
+		nleft = sv.length () - off;
+
+		return nullptr;
+	};
+
+	for (const auto c : sv) {
+		if (nleft == 0) {
+			msg_err_css("cannot unescape css: truncated buffer of size %d",
+					(int)sv.length());
+			break;
+		}
+
+		switch (state) {
+		case normal:
+			maybe_consume_char(c);
+			break;
+		case quoted:
+			if (c == quote_char && prev_c != '\\') {
+				state = normal;
+			}
+			prev_c = c;
+			nleft --;
+			*d++ = c;
+			break;
+		case escape:
+			/* Hex digits are only collected; the first other char ends the escape */
+			if (!g_ascii_isxdigit(c)) {
+				if (const auto *problem = decode_escape(i); problem != nullptr) {
+					msg_debug_css("invalid %s escape found at pos %d",
+							problem, escape_offset);
+				}
+
+				if (nleft == 0) {
+					/* No room for the terminating character itself */
+					msg_err_css("cannot unescape css: truncated buffer of size %d",
+							(int)sv.length());
+					state = normal;
+				}
+				else if (g_ascii_isspace (c)) {
+					/* Escape is done, whitespace after it is a separator */
+					state = skip_spaces;
+				}
+				else {
+					/* Escape is done, advance forward */
+					maybe_consume_char(c);
+				}
+			}
+			break;
+		case skip_spaces:
+			/* Ignore spaces */
+			if (!g_ascii_isspace(c)) {
+				maybe_consume_char(c);
+			}
+			break;
+		}
+
+		i ++;
+	}
+
+	if (state == escape && nleft > 0) {
+		/* Input ended right after the hex digits: do not lose that escape */
+		if (const auto *problem = decode_escape(i); problem != nullptr) {
+			msg_debug_css("invalid %s escape found at pos %d",
+					problem, escape_offset);
+		}
+	}
+
+	return std::string_view{nspace, sv.size() - nleft};
+}
+
+/*
+ * Accepts a piece of css text: when it has escapes outside of quoted
+ * strings, an unescaped copy is produced and both sizes are logged.
+ * Always returns true.
+ */
+bool css_parser::consume_input(const std::string_view &sv)
+{
+	std::string_view our_sv{sv};
+	const bool has_escapes = need_unescape(sv);
+
+	if (has_escapes) {
+		our_sv = unescape_css(sv);
+		msg_debug_css("unescaped css: input size %d, unescaped size %d",
+				(int)sv.size(), (int)our_sv.size());
+	}
+
+	return true;
+}
+
+/*
+ * Convenience entry point: creates a parser bound to the given pool, feeds
+ * it the whole input and returns whatever the parser has to offer (a style
+ * sheet or a parse error).
+ */
+auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
+	tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>
+{
+	css_parser one_shot_parser{pool};
+
+	one_shot_parser.consume_input(st);
+
+	return one_shot_parser.get_object_maybe();
+}
+
+}
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css_parser.hxx
similarity index 64%
copy from src/libserver/css/css.cxx
copy to src/libserver/css/css_parser.hxx
index 68ebfeefa..8d1468a01 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css_parser.hxx
@@ -14,26 +14,21 @@
* limitations under the License.
*/
-#include "css.h"
-#include "css.hxx"
-#include "css_style.hxx"
-
-rspamd_css
-rspamd_css_parse_style (const guchar *begin, gsize len, GError **err)
-{
- rspamd::css::css_style_sheet *style = nullptr;
-
+#ifndef RSPAMD_CSS_PARSER_HXX
+#define RSPAMD_CSS_PARSER_HXX
- return reinterpret_cast<rspamd_css>(style);
-}
+#include "css.hxx"
+#include "parse_error.hxx"
+#include "contrib/expected/expected.hpp"
+#include "logger.h"
namespace rspamd::css {
-class css_style_sheet::impl {
+INIT_LOG_MODULE(chartable)
-};
+auto parse_css (rspamd_mempool_t *pool, const std::string_view &st) ->
+ tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>;
-css_style_sheet::css_style_sheet () : pimpl(new impl) {}
-css_style_sheet::~css_style_sheet () {}
+}
-}
\ No newline at end of file
+#endif //RSPAMD_CSS_PARSER_HXX
diff --git a/src/libserver/css/parse_error.hxx b/src/libserver/css/parse_error.hxx
index 60b229181..12ad697eb 100644
--- a/src/libserver/css/parse_error.hxx
+++ b/src/libserver/css/parse_error.hxx
@@ -34,13 +34,14 @@ enum class css_parse_error_type {
};
struct css_parse_error {
- css_parse_error_type type;
+ css_parse_error_type type = css_parse_error_type::PARSE_ERROR_UNKNOWN_ERROR;
std::optional<std::string> description;
explicit css_parse_error (css_parse_error_type type, const std::string &description) :
type(type), description(description) {}
explicit css_parse_error (css_parse_error_type type) :
type(type) {}
+ css_parse_error() = default;
};
}
More information about the Commits mailing list