commit 52cb3f8: [Project] Css: Implement numbers and ident parsers
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Jan 26 15:49:06 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-01-26 14:44:01 +0000
URL: https://github.com/rspamd/rspamd/commit/52cb3f8d019522aee0c049d091772d082a8502f1
[Project] Css: Implement numbers and ident parsers
---
src/libserver/css/css_tokeniser.cxx | 367 +++++++++++++++++++++++++++++++++++-
src/libserver/css/css_tokeniser.hxx | 27 +++
2 files changed, 385 insertions(+), 9 deletions(-)
diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx
index 058f7504e..f545af47a 100644
--- a/src/libserver/css/css_tokeniser.cxx
+++ b/src/libserver/css/css_tokeniser.cxx
@@ -16,6 +16,9 @@
#include "css_tokeniser.hxx"
#include "css_util.hxx"
+#include "css.hxx"
+#include <charconv>
+#include <cmath>
+#include <string>
namespace rspamd::css {
@@ -35,6 +38,13 @@ auto make_token<css_parser_token::token_type::string_token, std::string_view>(co
return css_parser_token{css_parser_token::token_type::string_token, s};
}
+/* Builds an ident token that carries the given string view as its value */
+template<>
+auto make_token<css_parser_token::token_type::ident_token, std::string_view>(const std::string_view &s)
+-> css_parser_token
+{
+	constexpr auto kind = css_parser_token::token_type::ident_token;
+
+	return css_parser_token{kind, s};
+}
+
template<>
auto make_token<css_parser_token::token_type::whitespace_token, std::string_view>(const std::string_view &s)
-> css_parser_token
@@ -49,6 +59,13 @@ auto make_token<css_parser_token::token_type::delim_token, char>(const char &c)
return css_parser_token{css_parser_token::token_type::delim_token, c};
}
+/* Builds a number token that carries the given double as its value */
+template<>
+auto make_token<css_parser_token::token_type::number_token, double>(const double &d)
+-> css_parser_token
+{
+	constexpr auto kind = css_parser_token::token_type::number_token;
+
+	return css_parser_token{kind, d};
+}
+
/*
* Generic tokens with no value (non-terminals)
*/
@@ -58,6 +75,287 @@ auto make_token(void) -> css_parser_token
return css_parser_token{T, css_parser_token_placeholder()};
}
+/*
+ * Tells whether `c` can be taken as a plain (unescaped) ident character:
+ * an ASCII letter, an underscore, or any byte with the high bit set.
+ */
+static constexpr inline auto is_plain_ident(char c) -> bool
+{
+	const bool high_bit = (c & 0x80) != 0;
+	const bool lower = 'a' <= c && c <= 'z';
+	const bool upper = 'A' <= c && c <= 'Z';
+
+	return high_bit || lower || upper || c == '_';
+}
+
+/*
+ * Turns this numeric token into a dimension, using the unit named by
+ * `dim_token`, and rescales the stored number to (approximate) pixels.
+ *
+ * Returns false and leaves the value untouched when this token does not hold
+ * a number, when `dim_token` does not hold a string, or when the unit is not
+ * recognised (in that case `flag_bad_dimension` is raised as well).
+ * On success `dim_type` is set, `number_dimension` is raised, the value is
+ * replaced by the converted number and true is returned.
+ *
+ * Font- and viewport-relative units are truncated towards zero with
+ * std::trunc(); unlike a cast to unsigned this is well defined for negative
+ * and very large values.
+ */
+auto
+css_parser_token::adjust_dim(const css_parser_token &dim_token) -> bool
+{
+	if (!std::holds_alternative<double>(value) ||
+		!std::holds_alternative<std::string_view>(dim_token.value)) {
+		/* Invalid tokens */
+		return false;
+	}
+
+	auto num = std::get<double>(value);
+	const auto sv = std::get<std::string_view>(dim_token.value);
+
+	if (sv == "px") {
+		dim_type = css_parser_token::dim_type::dim_px;
+		num = std::trunc(num); /* Whole pixels only */
+	}
+	else if (sv == "em") {
+		/* EM is assumed to be 16 px */
+		dim_type = css_parser_token::dim_type::dim_em;
+		num = std::trunc(num * 16.0);
+	}
+	else if (sv == "rem") {
+		/* Equal to EM in our case */
+		dim_type = css_parser_token::dim_type::dim_rem;
+		num = std::trunc(num * 16.0);
+	}
+	else if (sv == "ex") {
+		/*
+		 * Represents the x-height of the element's font.
+		 * On fonts with the "x" letter, this is generally the height
+		 * of lowercase letters in the font; 1ex = 0.5em in many fonts.
+		 */
+		dim_type = css_parser_token::dim_type::dim_ex;
+		num = std::trunc(num * 8.0);
+	}
+	else if (sv == "vw" || sv == "wv") {
+		/*
+		 * Viewport width in percentages: we assume 1% of the viewport
+		 * width to be 8px. The CSS unit is spelled `vw`; the historical
+		 * `wv` spelling is still accepted.
+		 */
+		dim_type = css_parser_token::dim_type::dim_wv;
+		num = std::trunc(num * 8.0);
+	}
+	else if (sv == "vh" || sv == "wh") {
+		/*
+		 * Viewport height in percentages: we assume 1% of the viewport
+		 * height to be 6px. The CSS unit is spelled `vh`; the historical
+		 * `wh` spelling is still accepted.
+		 */
+		dim_type = css_parser_token::dim_type::dim_wh;
+		num = std::trunc(num * 6.0);
+	}
+	else if (sv == "vmax") {
+		/* Larger viewport side: scaled like the viewport width (8px per 1%) */
+		dim_type = css_parser_token::dim_type::dim_vmax;
+		num = std::trunc(num * 8.0);
+	}
+	else if (sv == "vmin") {
+		/* Smaller viewport side: scaled like the viewport height (6px per 1%) */
+		dim_type = css_parser_token::dim_type::dim_vmin;
+		num = std::trunc(num * 6.0);
+	}
+	else if (sv == "pt") {
+		dim_type = css_parser_token::dim_type::dim_pt;
+		num = (num * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
+	}
+	else if (sv == "cm") {
+		dim_type = css_parser_token::dim_type::dim_cm;
+		num = (num * 96.0 / 2.54); /* 96px/2.54 */
+	}
+	else if (sv == "mm") {
+		dim_type = css_parser_token::dim_type::dim_mm;
+		num = (num * 9.6 / 2.54); /* 9.6px/2.54 */
+	}
+	else if (sv == "in") {
+		dim_type = css_parser_token::dim_type::dim_in;
+		num = (num * 96.0); /* 96px */
+	}
+	else if (sv == "pc") {
+		dim_type = css_parser_token::dim_type::dim_pc;
+		num = (num * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
+	}
+	else {
+		flags |= css_parser_token::flag_bad_dimension;
+
+		return false;
+	}
+
+	/* Every recognised unit ends up here */
+	flags |= css_parser_token::number_dimension;
+	value = num;
+
+	return true;
+}
+
+
+/*
+ * Consume functions: return a token and advance lexer offset
+ */
+/*
+ * Consumes an ident token that starts at `offset` and moves `offset` right
+ * after it.
+ *
+ * An ident may start with `-` or `--`, continues with plain ident characters
+ * (see is_plain_ident) and, once it has started, with digits and `-` as well.
+ * Backslash escapes are accepted anywhere: either `\` followed by up to six
+ * hex digits and one optional terminating space, or `\` followed by a single
+ * character. When at least one escape was seen, the token value is the
+ * result of unescape_css() applied to the raw text; otherwise it is a view
+ * into the input.
+ *
+ * If no ident character is found at `offset`, an empty ident token is
+ * returned and `offset` is left unchanged.
+ */
+auto css_tokeniser::consume_ident() -> struct css_parser_token
+{
+	const auto len = input.size();
+	auto i = offset;
+	auto need_escape = false;
+	/* Digits and `-` belong to a name but must not be its first character */
+	auto seen_start = false;
+
+	/* Ident token can start from `-` or `--` */
+	if (i < len && input[i] == '-') {
+		i++;
+
+		if (i < len && input[i] == '-') {
+			i++;
+			/* After `--` any name character is allowed */
+			seen_start = true;
+		}
+	}
+
+	while (i < len) {
+		const auto c = input[i];
+
+		if (is_plain_ident(c) ||
+			(seen_start && (g_ascii_isdigit(c) || c == '-'))) {
+			i++;
+		}
+		else if (c == '\\' && i + 1 < len) {
+			need_escape = true;
+			i++; /* Now at the first escaped character, which is in bounds */
+
+			if (g_ascii_isxdigit(input[i])) {
+				/* \[hex]{1,6} followed by an optional single space */
+				auto nhex = 0;
+
+				while (i < len && nhex < 6 && g_ascii_isxdigit(input[i])) {
+					i++;
+					nhex++;
+				}
+
+				if (i < len && input[i] == ' ') {
+					i++; /* The terminating space belongs to the escape */
+				}
+			}
+			else {
+				/* Single \ + char */
+				i++;
+			}
+		}
+		else {
+			/* Not a part of the ident: leave it for the next token */
+			break;
+		}
+
+		seen_start = true;
+	}
+
+	if (need_escape) {
+		auto escaped = rspamd::css::unescape_css(pool, {input.data() + offset, i - offset});
+		offset = i;
+
+		return make_token<css_parser_token::token_type::ident_token>(escaped);
+	}
+
+	const auto result = std::string_view{input.data() + offset, i - offset};
+	offset = i;
+
+	return make_token<css_parser_token::token_type::ident_token>(result);
+}
+
+/*
+ * Consumes a numeric token that starts at `offset` and moves `offset` right
+ * after it, including a `%` sign or a dimension suffix if one follows.
+ *
+ * Accepted form: an optional `+`/`-` sign, digits, an optional fraction
+ * (`.` followed by at least one digit) and an optional exponent (`e`/`E`,
+ * optional sign, at least one digit). An `e` that is not followed by digits
+ * is not treated as an exponent, so `1em` is the number 1 with the `em` unit.
+ *
+ * A trailing `%` raises `number_percent`; a trailing ident is handed to
+ * adjust_dim() on the number token. If the unit is unknown the number is
+ * still returned and a debug message is logged.
+ *
+ * If no digits are found, an error is logged and the character at `offset`
+ * is consumed and returned as a delimiter token (or an EOF token when the
+ * input is exhausted).
+ */
+auto css_tokeniser::consume_number() -> struct css_parser_token
+{
+	const auto len = input.size();
+	auto i = offset;
+	auto seen_digits = false;
+
+	/* Optional sign */
+	if (i < len && (input[i] == '-' || input[i] == '+')) {
+		i++;
+	}
+
+	/* Integer part */
+	while (i < len && g_ascii_isdigit(input[i])) {
+		seen_digits = true;
+		i++;
+	}
+
+	/* Fraction: the dot is consumed only if a digit follows it */
+	if (i + 1 < len && input[i] == '.' && g_ascii_isdigit(input[i + 1])) {
+		seen_digits = true;
+		i += 2;
+
+		while (i < len && g_ascii_isdigit(input[i])) {
+			i++;
+		}
+	}
+
+	/* Exponent: consumed only if it is complete, e.g. 1e3, 1e-3 or 1E+3 */
+	if (seen_digits && i < len && (input[i] == 'e' || input[i] == 'E')) {
+		auto j = i + 1;
+
+		if (j < len && (input[j] == '+' || input[j] == '-')) {
+			j++;
+		}
+
+		if (j < len && g_ascii_isdigit(input[j])) {
+			while (j < len && g_ascii_isdigit(input[j])) {
+				j++;
+			}
+
+			i = j;
+		}
+	}
+
+	if (!seen_digits) {
+		/* Should not happen */
+		msg_err_css("internal error: invalid number, empty token");
+
+		if (offset < len) {
+			const char c = input[offset];
+			offset++;
+
+			return make_token<css_parser_token::token_type::delim_token>(c);
+		}
+
+		return make_token<css_parser_token::token_type::eof_token>();
+	}
+
+	/*
+	 * The scanned span is always a well formed number, so the conversion
+	 * cannot fail; g_ascii_strtod neither throws nor depends on the locale.
+	 */
+	const std::string numbuf(input.data() + offset, i - offset);
+	const double num = g_ascii_strtod(numbuf.c_str(), nullptr);
+
+	auto ret = make_token<css_parser_token::token_type::number_token>(num);
+
+	/* Commit the number first: consume_ident() starts reading at `offset` */
+	offset = i;
+
+	if (offset < len) {
+		if (input[offset] == '%') {
+			ret.flags |= css_parser_token::number_percent;
+			offset++;
+		}
+		else if (is_plain_ident(input[offset])) {
+			auto dim_token = consume_ident();
+
+			if (dim_token.type == css_parser_token::token_type::ident_token) {
+				/* The unit is applied to the number, not to the unit token */
+				if (!ret.adjust_dim(dim_token)) {
+					auto sv = std::get<std::string_view>(dim_token.value);
+					msg_debug_css("cannot apply dimension from the token %*s; number value = %.1f",
+							(int)sv.size(), sv.data(), num);
+				}
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Main routine to produce lexer tokens
+ */
auto css_tokeniser::next_token(void) -> struct css_parser_token
{
/* Helpers */
@@ -72,7 +370,7 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
auto nested = 0;
/* We handle nested comments just because they can exist... */
- while (i < input.size () - 1) {
+ while (i < input.size() - 1) {
auto c = input[i];
if (c == '*' && input[i + 1] == '/') {
if (nested == 0) {
@@ -106,16 +404,15 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
auto i = offset;
bool need_unescape = false;
- while (i < input.size ()) {
+ while (i < input.size()) {
auto c = input[i];
if (c == '\\') {
- if (i + 1 < input.size ()) {
+ if (i + 1 < input.size()) {
need_unescape = true;
}
else {
/* \ at the end -> ignore */
-
}
}
else if (c == quote_char) {
@@ -133,6 +430,8 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
else if (c == '\n') {
/* Should be an error, but we ignore it for now */
}
+
+ i ++;
}
/* EOF with no quote character, consider it fine */
@@ -148,15 +447,15 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
};
/* Main tokenisation loop */
- for (auto i = offset; i < input.size (); ++i) {
+ for (auto i = offset; i < input.size(); ++i) {
auto c = input[i];
switch (c) {
case '/':
- if (i + 1 < input.size () && input[i + 1] == '*') {
+ if (i + 1 < input.size() && input[i + 1] == '*') {
offset = i + 2;
- consume_comment (); /* Consume comment and go forward */
- return next_token (); /* Tail call */
+ consume_comment(); /* Consume comment and go forward */
+ return next_token(); /* Tail call */
}
else {
offset = i + 1;
@@ -171,7 +470,7 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
/* Consume as much space as we can */
do {
c = input[++i];
- } while (i < input.size () && g_ascii_isspace (c));
+ } while (i < input.size() && g_ascii_isspace(c));
auto ret = make_token<css_parser_token::token_type::whitespace_token>(
std::string_view(&input[offset], i - offset));
@@ -188,8 +487,22 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
case ')':
offset = i + 1;
return make_token<css_parser_token::token_type::ebrace_token>();
+ case '[':
+ offset = i + 1;
+ return make_token<css_parser_token::token_type::osqbrace_token>();
+ case ']':
+ offset = i + 1;
+ return make_token<css_parser_token::token_type::esqbrace_token>();
+ case '{':
+ offset = i + 1;
+ return make_token<css_parser_token::token_type::ocurlbrace_token>();
+ case '}':
+ offset = i + 1;
+ return make_token<css_parser_token::token_type::ecurlbrace_token>();
case ',':
return make_token<css_parser_token::token_type::comma_token>();
+ case ';':
+ return make_token<css_parser_token::token_type::semicolon_token>();
case '<':
/* Maybe an xml like comment */
if (i + 3 < input.size () && input[i + 1] == '!'
@@ -202,6 +515,42 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
offset = i + 1;
return make_token<css_parser_token::token_type::delim_token>(c);
}
+ break;
+ case '-':
+ if (i + 1 < input.size()) {
+ auto next_c = input[i + 1];
+
+ if (g_ascii_isdigit(next_c)) {
+ /* negative number */
+ return consume_number();
+ }
+ else if (next_c == '-') {
+ if (i + 2 < input.size() && input[i + 2] == '>') {
+ /* XML like comment */
+ return make_token<css_parser_token::token_type::cdc_token>();
+ }
+ }
+ }
+ /* No other options, a delimiter - */
+ offset = i + 1;
+ return make_token<css_parser_token::token_type::delim_token>(c);
+
+ break;
+ case '+':
+ case '.':
+ /* Maybe number */
+ if (i + 1 < input.size()) {
+ auto next_c = input[i + 1];
+
+ if (g_ascii_isdigit(next_c)) {
+ /* Numeric token */
+ return consume_number();
+ }
+ }
+ /* Not a number: emit the `+` or `.` itself as a delimiter */
+ offset = i + 1;
+ return make_token<css_parser_token::token_type::delim_token>(c);
+
break;
}
diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx
index cff5877c2..5880241c1 100644
--- a/src/libserver/css/css_tokeniser.hxx
+++ b/src/libserver/css/css_tokeniser.hxx
@@ -47,14 +47,36 @@ struct css_parser_token {
ebrace_token, /* ) */
osqbrace_token, /* [ */
esqbrace_token, /* ] */
+ ocurlbrace_token, /* { */
+ ecurlbrace_token, /* } */
comma_token,
colon_token,
semicolon_token,
eof_token,
};
+ /*
+  * Unit of a dimension token, set by adjust_dim() from the ident that
+  * follows a number. Each enumerator matches one unit suffix.
+  */
+ enum class dim_type : std::uint8_t {
+ dim_px, /* "px" */
+ dim_em, /* "em" */
+ dim_rem, /* "rem" */
+ dim_ex, /* "ex" */
+ dim_wv, /* "wv": viewport width */
+ dim_wh, /* "wh": viewport height */
+ dim_vmax, /* "vmax" */
+ dim_vmin, /* "vmin" */
+ dim_pt, /* "pt" */
+ dim_cm, /* "cm" */
+ dim_mm, /* "mm" */
+ dim_in, /* "in" */
+ dim_pc, /* "pc" */
+ };
+
static const std::uint8_t default_flags = 0;
static const std::uint8_t flag_bad_string = (1u << 0u);
+ static const std::uint8_t number_dimension = (1u << 1u);
+ static const std::uint8_t number_percent = (1u << 2u);
+ static const std::uint8_t flag_bad_dimension = (1u << 3u);
+
using value_type = std::variant<std::string_view, /* For strings and string like tokens */
char, /* For delimiters (might need to move to unicode point) */
double, /* For numeric stuff */
@@ -65,10 +87,12 @@ struct css_parser_token {
value_type value;
token_type type;
std::uint8_t flags = default_flags;
+ dim_type dim_type;
css_parser_token() = delete;
explicit css_parser_token(token_type type, const value_type &value) :
value(value), type(type) {}
+ auto adjust_dim(const css_parser_token &dim_token) -> bool;
};
/* Ensure that parser tokens are simple enough */
@@ -86,6 +110,9 @@ private:
std::string_view input;
std::size_t offset;
rspamd_mempool_t *pool;
+
+ auto consume_number() -> struct css_parser_token;
+ auto consume_ident() -> struct css_parser_token;
};
}
More information about the Commits
mailing list