commit 50e3e98: [Project] Css: rework tokeniser
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Jan 25 16:42:12 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-01-25 14:35:41 +0000
URL: https://github.com/rspamd/rspamd/commit/50e3e98a741cf2946ec0b3e4cf396d53cc9e4ae4
[Project] Css: rework tokeniser
---
src/libserver/css/CMakeLists.txt | 2 +
src/libserver/css/css.hxx | 2 +
src/libserver/css/css_parser.cxx | 160 ++----------------
src/libserver/css/css_parser.hxx | 2 +
src/libserver/css/css_property.hxx | 1 +
src/libserver/css/css_rule.hxx | 1 +
src/libserver/css/css_selector.hxx | 1 +
src/libserver/css/css_style.hxx | 2 +
src/libserver/css/css_tokeniser.cxx | 183 +++++++++++++++++++++
src/libserver/css/css_tokeniser.hxx | 68 ++++++++
src/libserver/css/{css_parser.cxx => css_util.cxx} | 118 +------------
src/libserver/css/{css_parser.hxx => css_util.hxx} | 25 +--
src/libserver/css/css_value.hxx | 2 +
src/libserver/css/parse_error.hxx | 1 +
14 files changed, 304 insertions(+), 264 deletions(-)
diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt
index c8f7921b1..84ed2cf8b 100644
--- a/src/libserver/css/CMakeLists.txt
+++ b/src/libserver/css/CMakeLists.txt
@@ -14,6 +14,8 @@ SET(LIBCSSSRC "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx"
+ "${CMAKE_CURRENT_SOURCE_DIR}/css_tokeniser.cxx"
+ "${CMAKE_CURRENT_SOURCE_DIR}/css_util.cxx"
"${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx"
"${RAGEL_ragel_css_selector_parser_OUTPUTS}"
"${RAGEL_ragel_css_rule_parser_OUTPUTS}"
diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx
index 8f2550d7b..1a511dcfd 100644
--- a/src/libserver/css/css.hxx
+++ b/src/libserver/css/css.hxx
@@ -13,6 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
+
#ifndef RSPAMD_CSS_HXX
#define RSPAMD_CSS_HXX
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
index 4134b933c..207cfcb9d 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_parser.cxx
@@ -15,6 +15,7 @@
*/
#include "css_parser.hxx"
+#include "css_tokeniser.hxx"
#include <unicode/utf8.h>
@@ -36,9 +37,6 @@ public:
return tl::make_unexpected (error);
}
- /* Public for unit tests */
- std::string_view unescape_css(const std::string_view &sv);
-
private:
enum class parser_state {
initial_state,
@@ -49,6 +47,7 @@ private:
};
parser_state state = parser_state::initial_state;
std::unique_ptr<css_style_sheet> style_object;
+
css_parse_error error;
rspamd_mempool_t *pool;
@@ -88,136 +87,26 @@ css_parser::need_unescape(const std::string_view &sv)
return false;
}
-/*
- * Unescape css escapes
- * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
- * \0020AC : must be 6 digits long, no space needed (but can be included)
- */
-std::string_view
-css_parser::unescape_css(const std::string_view &sv)
-{
- auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
- auto *d = nspace;
- auto nleft = sv.length ();
- enum {
- normal = 0,
- quoted,
- escape,
- skip_spaces,
- } state = normal;
-
- char quote_char, prev_c = 0;
- auto escape_offset = 0, i = 0;
-
-#define MAYBE_CONSUME_CHAR(c) do { \
- if (c == '"' || c == '\'') { \
- state = quoted; \
- quote_char = c; \
- nleft--; \
- *d++ = c; \
- } \
- else if (c == '\\') { \
- escape_offset = i; \
- state = escape; \
- } \
- else { \
- state = normal; \
- nleft--; \
- *d++ = c; \
- } \
-} while (0)
-
- for (const auto c : sv) {
- if (nleft == 0) {
- msg_err_css("cannot unescape css: truncated buffer of size %d",
- (int)sv.length());
- break;
- }
- switch (state) {
- case normal:
- MAYBE_CONSUME_CHAR(c);
- break;
- case quoted:
- if (c == quote_char) {
- if (prev_c != '\\') {
- state = normal;
- }
- }
- prev_c = c;
- nleft --;
- *d++ = c;
- break;
- case escape:
- if (!g_ascii_isxdigit(c)) {
- if (i > escape_offset + 1) {
- /* Try to decode an escape */
- const auto *escape_start = &sv[escape_offset + 1];
- unsigned long val;
+bool css_parser::consume_input(const std::string_view &sv)
+{
+ bool eof = false;
+ css_tokeniser css_tokeniser(pool, sv);
- if (!rspamd_xstrtoul(escape_start, i - escape_offset - 1, &val)) {
- msg_debug_css("invalid broken escape found at pos %d",
- escape_offset);
- }
- else {
- if (val < 0x80) {
- /* Trivial case: ascii character */
- *d++ = (unsigned char)val;
- nleft --;
- }
- else {
- UChar32 uc = val;
- auto off = 0;
- UTF8_APPEND_CHAR_SAFE((uint8_t *) d, off,
- sv.length (), uc);
- d += off;
- nleft -= off;
- }
- }
- }
- else {
- /* Empty escape, ignore it */
- msg_debug_css("invalid empty escape found at pos %d",
- escape_offset);
- }
+ while (!eof) {
+ auto token_pair = css_tokeniser.next_token();
- if (nleft <= 0) {
- msg_err_css("cannot unescape css: truncated buffer of size %d",
- (int)sv.length());
- }
- else {
- /* Escape is done, advance forward */
- if (g_ascii_isspace (c)) {
- state = skip_spaces;
- }
- else {
- MAYBE_CONSUME_CHAR(c);
- }
- }
- }
+ /* Top level parser */
+ switch (token_pair.first) {
+ case css_parser_token::eof_token:
+ eof = true;
break;
- case skip_spaces:
- if (!g_ascii_isspace(c)) {
- MAYBE_CONSUME_CHAR(c);
- }
- /* Ignore spaces */
+ case css_parser_token::whitespace_token:
+ case css_parser_token::cdc_token:
+ case css_parser_token::cdo_token:
+ /* Ignore tokens */
break;
}
-
- i ++;
- }
-
- return std::string_view{nspace, sv.size() - nleft};
-};
-
-bool css_parser::consume_input(const std::string_view &sv)
-{
- auto our_sv = sv;
-
- if (need_unescape(sv)) {
- our_sv = unescape_css(sv);
- msg_debug_css("unescaped css: input size %d, unescaped size %d",
- (int)sv.size(), (int)our_sv.size());
}
return true;
@@ -237,20 +126,3 @@ auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
}
}
-
-/* C API */
-const gchar *rspamd_css_unescape (rspamd_mempool_t *pool,
- const guchar *begin,
- gsize len,
- gsize *outlen)
-{
- rspamd::css::css_parser parser(pool);
- auto sv = parser.unescape_css({(const char*)begin, len});
- const auto *v = sv.begin();
-
- if (outlen) {
- *outlen = sv.size();
- }
-
- return v;
-}
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
index 8d1468a01..e009fef70 100644
--- a/src/libserver/css/css_parser.hxx
+++ b/src/libserver/css/css_parser.hxx
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
#ifndef RSPAMD_CSS_PARSER_HXX
#define RSPAMD_CSS_PARSER_HXX
diff --git a/src/libserver/css/css_property.hxx b/src/libserver/css/css_property.hxx
index 06a345ad4..2e668c640 100644
--- a/src/libserver/css/css_property.hxx
+++ b/src/libserver/css/css_property.hxx
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
#ifndef RSPAMD_CSS_PROPERTY_HXX
#define RSPAMD_CSS_PROPERTY_HXX
diff --git a/src/libserver/css/css_rule.hxx b/src/libserver/css/css_rule.hxx
index 878322f78..6afaa8bc6 100644
--- a/src/libserver/css/css_rule.hxx
+++ b/src/libserver/css/css_rule.hxx
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
#ifndef RSPAMD_CSS_RULE_HXX
#define RSPAMD_CSS_RULE_HXX
diff --git a/src/libserver/css/css_selector.hxx b/src/libserver/css/css_selector.hxx
index 4c12b3b41..c9f3046d5 100644
--- a/src/libserver/css/css_selector.hxx
+++ b/src/libserver/css/css_selector.hxx
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+#pragma once
#ifndef RSPAMD_CSS_SELECTOR_HXX
#define RSPAMD_CSS_SELECTOR_HXX
diff --git a/src/libserver/css/css_style.hxx b/src/libserver/css/css_style.hxx
index f3d1e664d..2a97f8f0e 100644
--- a/src/libserver/css/css_style.hxx
+++ b/src/libserver/css/css_style.hxx
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
#ifndef RSPAMD_CSS_STYLE_HXX
#define RSPAMD_CSS_STYLE_HXX
diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx
new file mode 100644
index 000000000..40f202b01
--- /dev/null
+++ b/src/libserver/css/css_tokeniser.cxx
@@ -0,0 +1,183 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_tokeniser.hxx"
+#include "css_util.hxx"
+
+namespace rspamd::css {
+
+
+/*
+ * Returns the next token found at or after the current offset together with
+ * a view over its text, and advances the offset just past that token.
+ *
+ * Comments are skipped silently. Characters that have no dedicated handling
+ * yet are skipped as well. When the input is exhausted an eof_token with an
+ * empty view is returned (repeated calls keep returning eof_token).
+ *
+ * Returned views point either into the original input or, for strings that
+ * contained escapes, into the buffer produced by unescape_css using the pool.
+ */
+auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string_view>
+{
+	/* Helpers */
+
+	/*
+	 * This lambda eats a comment, handling nested comments;
+	 * offset must point just after the opening marker on entry and is set
+	 * to the next character after the comment (or to the end of input if
+	 * the comment is never closed).
+	 * Nothing is returned
+	 */
+	auto consume_comment = [this] () {
+		auto i = offset;
+		auto nested = 0;
+
+		/*
+		 * We handle nested comments just because they can exist...
+		 * `i + 1 < size` (rather than `i < size - 1`) cannot wrap around
+		 * for an empty input.
+		 */
+		while (i + 1 < input.size ()) {
+			if (input[i] == '*' && input[i + 1] == '/') {
+				if (nested == 0) {
+					offset = i + 2;
+					return;
+				}
+
+				nested--;
+				i += 2;
+			}
+			else if (input[i] == '/' && input[i + 1] == '*') {
+				nested++;
+				i += 2;
+			}
+			else {
+				i++;
+			}
+		}
+
+		/*
+		 * Unterminated comment: swallow everything that is left, otherwise
+		 * the trailing character would be tokenised as ordinary input
+		 */
+		offset = input.size ();
+	};
+
+	/*
+	 * Consume quoted string, returns a string_view over the string content
+	 * (without quotes); offset must point just after the opening quote on
+	 * entry and is set one character after the closing quote (or to the end
+	 * of input if the string is never closed, which is tolerated).
+	 * Css unescaping is done automatically.
+	 * Accepts a quote char to find end of string
+	 */
+	auto consume_string = [this] (char quote_char) -> std::string_view {
+		auto i = offset;
+		bool need_unescape = false;
+		bool terminated = false;
+
+		while (i < input.size ()) {
+			const auto c = input[i];
+
+			if (c == '\\') {
+				if (i + 1 < input.size ()) {
+					need_unescape = true;
+					/* Skip escaped character, so \" does not end the string */
+					i++;
+				}
+				/* else: \ at the end -> ignore */
+			}
+			else if (c == quote_char) {
+				/* End of string */
+				terminated = true;
+				break;
+			}
+			/* A raw '\n' should be an error, but we ignore it for now */
+
+			i++;
+		}
+
+		/* substr is well defined even when offset == input.size () */
+		auto res = input.substr (offset, i - offset);
+
+		if (need_unescape) {
+			res = rspamd::css::unescape_css (pool, res);
+		}
+
+		offset = terminated ? i + 1 : i;
+
+		return res;
+	};
+
+	/* Emits a one character token located at pos and moves offset past it */
+	auto single_char_token = [this] (css_parser_token tok, std::size_t pos) {
+		offset = pos + 1;
+
+		return std::make_pair (tok, input.substr (pos, 1));
+	};
+
+	/* Main tokenisation loop */
+	auto i = offset;
+
+	while (i < input.size ()) {
+		const auto c = input[i];
+
+		switch (c) {
+		case '/':
+			if (i + 1 < input.size () && input[i + 1] == '*') {
+				offset = i + 2;
+				consume_comment ();
+				/*
+				 * Resume scanning after the comment; looping instead of
+				 * recursing keeps stack usage constant for inputs made of
+				 * many consecutive comments
+				 */
+				i = offset;
+				continue;
+			}
+
+			return single_char_token (css_parser_token::delim_token, i);
+		case ' ':
+		case '\t':
+		case '\n':
+		case '\r':
+		case '\f':
+		case '\v': {
+			/*
+			 * Consume as much space as we can; the bound is checked before
+			 * each read and the token starts at the first space character,
+			 * not at the (possibly older) offset
+			 */
+			const auto start = i;
+
+			do {
+				++i;
+			} while (i < input.size () && g_ascii_isspace (input[i]));
+
+			offset = i;
+
+			return std::make_pair (css_parser_token::whitespace_token,
+					input.substr (start, i - start));
+		}
+		case '"':
+		case '\'':
+			offset = i + 1;
+			return std::make_pair (css_parser_token::string_token,
+					consume_string (c));
+		case '(':
+			return single_char_token (css_parser_token::obrace_token, i);
+		case ')':
+			return single_char_token (css_parser_token::ebrace_token, i);
+		case ',':
+			return single_char_token (css_parser_token::comma_token, i);
+		case '<':
+			/* Maybe an xml like comment opener: <!-- (4 characters) */
+			if (i + 3 < input.size () && input[i + 1] == '!'
+				&& input[i + 2] == '-' && input[i + 3] == '-') {
+				offset = i + 4;
+
+				return std::make_pair (css_parser_token::cdo_token,
+						input.substr (i, 4));
+			}
+
+			return single_char_token (css_parser_token::delim_token, i);
+		default:
+			/* Not handled yet: skip the character */
+			break;
+		}
+
+		++i;
+	}
+
+	/* Remember that everything was consumed, so further calls do not rescan */
+	offset = input.size ();
+
+	return std::make_pair (css_parser_token::eof_token, std::string_view ());
+}
+
+}
\ No newline at end of file
diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx
new file mode 100644
index 000000000..4c6824389
--- /dev/null
+++ b/src/libserver/css/css_tokeniser.hxx
@@ -0,0 +1,68 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_CSS_TOKENISER_HXX
+#define RSPAMD_CSS_TOKENISER_HXX
+
+#include <string_view>
+#include <utility>
+#include "mem_pool.h"
+
+namespace rspamd::css {
+
+/*
+ * Kinds of tokens reported by css_tokeniser::next_token as the first element
+ * of the returned pair.
+ * NOTE(review): the names look like the token types of the CSS Syntax
+ * specification - confirm before relying on exact spec semantics.
+ */
+enum class css_parser_token {
+	whitespace_token, /* run of whitespace characters */
+	ident_token,
+	function_token,
+	at_keyword_token,
+	hash_token,
+	string_token, /* quoted string, the view excludes the quotes */
+	number_token,
+	url_token,
+	dimension_token,
+	percentage_token,
+	cdo_token, /* xml open comment */
+	cdc_token, /* xml close comment */
+	delim_token, /* single character without a dedicated token type */
+	obrace_token, /* ( */
+	ebrace_token, /* ) */
+	osqbrace_token, /* [ */
+	esqbrace_token, /* ] */
+	comma_token, /* , */
+	colon_token,
+	semicolon_token,
+	eof_token, /* input is exhausted, the view is empty */
+};
+
+/*
+ * Splits a css text into tokens, one token per next_token call.
+ * The object only keeps a view over the text, so the underlying buffer has
+ * to stay alive for as long as the tokeniser (and the returned views) are used.
+ */
+class css_tokeniser {
+public:
+	/* Both the memory pool and the input are mandatory */
+	css_tokeniser() = delete;
+
+	css_tokeniser(rspamd_mempool_t *pool, const std::string_view &sv) :
+			input{sv}, pool{pool} {}
+
+	/* Returns the kind of the next token and a view over its text */
+	auto next_token(void) -> std::pair<css_parser_token, std::string_view>;
+
+private:
+	/* Text being tokenised (not owned) */
+	std::string_view input;
+	/* Position in input where the next token search starts */
+	std::size_t offset = 0;
+	/* Pool handed over to unescape_css when a string contains escapes */
+	rspamd_mempool_t *pool;
+};
+
+}
+
+
+#endif //RSPAMD_CSS_TOKENISER_HXX
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_util.cxx
similarity index 56%
copy from src/libserver/css/css_parser.cxx
copy to src/libserver/css/css_util.cxx
index 4134b933c..7388e49fd 100644
--- a/src/libserver/css/css_parser.cxx
+++ b/src/libserver/css/css_util.cxx
@@ -14,87 +14,14 @@
* limitations under the License.
*/
-#include "css_parser.hxx"
+#include "css_util.hxx"
+#include "css.hxx"
#include <unicode/utf8.h>
-
namespace rspamd::css {
-class css_parser {
-public:
- css_parser(void) = delete; /* Require mempool to be set for logging */
- explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
-
- bool consume_input(const std::string_view &sv);
-
- auto get_object_maybe(void) -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> {
- if (state == parser_state::parse_done) {
- state = parser_state::initial_state;
- return std::move (style_object);
- }
-
- return tl::make_unexpected (error);
- }
-
- /* Public for unit tests */
- std::string_view unescape_css(const std::string_view &sv);
-
-private:
- enum class parser_state {
- initial_state,
- skip_spaces,
- parse_selector,
- ignore_selector, /* e.g. media or namespace */
- parse_done,
- };
- parser_state state = parser_state::initial_state;
- std::unique_ptr<css_style_sheet> style_object;
- css_parse_error error;
- rspamd_mempool_t *pool;
-
- /* Helper parser methods */
- bool need_unescape(const std::string_view &sv);
-};
-
-/*
- * Find if we need to unescape css
- */
-bool
-css_parser::need_unescape(const std::string_view &sv)
-{
- bool in_quote = false;
- char quote_char, prev_c = 0;
-
- for (const auto c : sv) {
- if (!in_quote) {
- if (c == '"' || c == '\'') {
- in_quote = true;
- quote_char = c;
- }
- else if (c == '\\') {
- return true;
- }
- }
- else {
- if (c == quote_char) {
- if (prev_c != '\\') {
- in_quote = false;
- }
- }
- prev_c = c;
- }
- }
-
- return false;
-}
-
-/*
- * Unescape css escapes
- * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
- * \0020AC : must be 6 digits long, no space needed (but can be included)
- */
-std::string_view
-css_parser::unescape_css(const std::string_view &sv)
+std::string_view unescape_css(rspamd_mempool_t *pool,
+ const std::string_view &sv)
{
auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
auto *d = nspace;
@@ -208,44 +135,17 @@ css_parser::unescape_css(const std::string_view &sv)
}
return std::string_view{nspace, sv.size() - nleft};
-};
-
-bool css_parser::consume_input(const std::string_view &sv)
-{
- auto our_sv = sv;
-
- if (need_unescape(sv)) {
- our_sv = unescape_css(sv);
- msg_debug_css("unescaped css: input size %d, unescaped size %d",
- (int)sv.size(), (int)our_sv.size());
- }
-
- return true;
-}
-
-/*
- * Wrapper for the parser
- */
-auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
- tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>
*** OUTPUT TRUNCATED, 100 LINES SKIPPED ***
More information about the Commits
mailing list