commit 515ea80: [Rework] Rework tags parsing machine
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Jul 9 13:28:05 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-07-09 14:27:06 +0100
URL: https://github.com/rspamd/rspamd/commit/515ea8079302c7fde5301a0a6cb3fd853ad5fd09 (HEAD -> master)
[Rework] Rework tags parsing machine
---
src/libserver/html/html.cxx | 293 +++++++++++++++++---------------------------
1 file changed, 111 insertions(+), 182 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index b8a5e1d32..b02cd0e1d 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -216,40 +216,15 @@ html_component_from_string(const std::string_view &st) -> std::optional<html_com
}
}
-static auto
-find_tag_component_name(rspamd_mempool_t *pool,
- const gchar *begin,
- const gchar *end) -> std::optional<html_component_type>
-{
- if (end <= begin) {
- return std::nullopt;
- }
-
- auto *p = rspamd_mempool_alloc_buffer(pool, end - begin);
- memcpy(p, begin, end - begin);
- auto len = decode_html_entitles_inplace(p, end - begin);
- len = rspamd_str_lc(p, len);
- auto known_component_it = html_components_map.find({p, len});
-
- if (known_component_it != html_components_map.end()) {
- return known_component_it->second;
- }
- else {
- return std::nullopt;
- }
-}
-
struct tag_content_parser_state {
int cur_state = 0;
- const char *saved_p = nullptr;
- const char *tag_name_start = nullptr;
+ std::string buf;
std::optional<html_component_type> cur_component;
void reset()
{
cur_state = 0;
- saved_p = nullptr;
- tag_name_start = nullptr;
+ buf.clear();
cur_component = std::nullopt;
}
};
@@ -273,13 +248,12 @@ html_parse_tag_content(rspamd_mempool_t *pool,
parse_sqvalue,
parse_end_squote,
parse_value,
- spaces_after_name,
spaces_before_eq,
spaces_after_eq,
spaces_after_param,
ignore_bad_tag,
+ tag_end,
} state;
- gboolean store = FALSE;
state = static_cast<enum tag_parser_state>(parser_env.cur_state);
@@ -289,23 +263,56 @@ html_parse_tag_content(rspamd_mempool_t *pool,
* Parser env is set to clear the current html attribute fields (saved_p and
* cur_component)
*/
- auto store_tag_component = [&]() -> void {
- if (parser_env.saved_p != nullptr && parser_env.cur_component &&
- in > parser_env.saved_p) {
-
- /* We ignore repeated attributes */
- auto sz = (std::size_t)(in - parser_env.saved_p);
- auto *s = rspamd_mempool_alloc_buffer(pool, sz);
- memcpy(s, parser_env.saved_p, sz);
- sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p);
+ auto store_component_value = [&]() -> void {
+ if (parser_env.cur_component) {
+
+ if (parser_env.buf.empty()) {
+ tag->components.emplace_back(parser_env.cur_component.value(),
+ std::string_view{});
+ }
+ else {
+ /* We need to copy buf to a persistent storage */
+ auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
+ memcpy(s, parser_env.buf.data(), parser_env.buf.size());
+ auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
tag->components.emplace_back(parser_env.cur_component.value(),
std::string_view{s, sz});
+ }
}
- parser_env.saved_p = nullptr;
+ parser_env.buf.clear();
parser_env.cur_component = std::nullopt;
};
+ auto store_component_name = [&]() -> bool {
+ decode_html_entitles_inplace(parser_env.buf);
+ auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
+ parser_env.buf.clear();
+
+ if (known_component_it != html_components_map.end()) {
+ parser_env.cur_component = known_component_it->second;
+
+ return true;
+ }
+ else {
+ parser_env.cur_component = std::nullopt;
+ }
+
+ return false;
+ };
+
+ auto store_value_character = [&](bool lc) -> void {
+ auto c = lc ? g_ascii_tolower(*in) : *in;
+
+ if (c == '\0') {
+ /* Replace with u0FFD */
+ parser_env.buf.append(u8"\uFFFD");
+ }
+ else {
+ parser_env.buf.push_back(c);
+ }
+ };
+
switch (state) {
case parse_start:
if (!g_ascii_isalpha (*in) && !g_ascii_isspace (*in)) {
@@ -316,46 +323,30 @@ html_parse_tag_content(rspamd_mempool_t *pool,
}
else if (g_ascii_isalpha (*in)) {
state = parse_name;
- parser_env.tag_name_start = in;
+ store_value_character(true);
}
break;
case parse_name:
- if ((g_ascii_isspace (*in) || *in == '>' || *in == '/') && parser_env.tag_name_start) {
- const auto *start = parser_env.tag_name_start;
- g_assert (in >= start);
-
+ if ((g_ascii_isspace (*in) || *in == '>' || *in == '/')) {
if (*in == '/') {
tag->flags |= FL_CLOSED;
}
- const auto tag_name_len = in - start;
-
- if (tag_name_len== 0) {
+ if (parser_env.buf.empty()) {
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
tag->id = N_TAGS;
tag->flags |= FL_BROKEN;
state = ignore_bad_tag;
}
else {
- /*
- * Copy tag name to the temporary buffer for modifications.
- * We use static buffer as legit tag names are usually short enough
- * to save some space in memory pool.
- */
- char s[32];
-
- auto nsize = rspamd_strlcpy(s, parser_env.tag_name_start,
- MIN(sizeof(s), tag_name_len + 1));
- nsize = rspamd_html_decode_entitles_inplace(s, nsize);
- nsize = rspamd_str_lc_utf8(s, nsize);
-
- const auto *tag_def = rspamd::html::html_tags_defs.by_name({s, nsize});
+ decode_html_entitles_inplace(parser_env.buf);
+ const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);
if (tag_def == nullptr) {
hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
/* Assign -hash to match closing tag if needed */
- auto nhash = static_cast<std::int32_t>(std::hash<std::string_view>{}({s, nsize}));
+ auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
/* Always negative */
tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
}
@@ -364,92 +355,46 @@ html_parse_tag_content(rspamd_mempool_t *pool,
tag->flags = tag_def->flags;
}
- state = spaces_after_name;
+ parser_env.buf.clear();
+
+ state = spaces_after_param;
}
}
+ else {
+ store_value_character(true);
+ }
break;
case parse_attr_name:
- if (parser_env.saved_p == nullptr) {
- state = ignore_bad_tag;
+ if (*in == '=') {
+ store_component_name();
+ state = parse_equal;
+ }
+ else if (g_ascii_isspace(*in)) {
+ store_component_name();
+ state = spaces_before_eq;
+ }
+ else if (*in == '/') {
+ store_component_name();
+ tag->flags |= FL_CLOSED;
+ state = spaces_before_eq;
+ }
+ else if (*in == '>') {
+ store_component_name();
+ state = tag_end;
}
else {
- const auto *attr_name_end = in;
-
- if (*in == '=') {
- state = parse_equal;
- }
- else if (*in == '"') {
- /* No equal or something sane but we have quote character */
- state = parse_start_dquote;
- attr_name_end = in - 1;
-
- while (attr_name_end > parser_env.saved_p) {
- if (!g_ascii_isalnum (*attr_name_end)) {
- attr_name_end--;
- }
- else {
- break;
- }
- }
-
- /* One character forward to obtain length */
- attr_name_end++;
- }
- else if (g_ascii_isspace (*in)) {
- state = spaces_before_eq;
- }
- else if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
- else if (!g_ascii_isgraph (*in)) {
- state = parse_value;
- attr_name_end = in - 1;
-
- while (attr_name_end > parser_env.saved_p) {
- if (!g_ascii_isalnum (*attr_name_end)) {
- attr_name_end--;
- }
- else {
- break;
- }
- }
-
- /* One character forward to obtain length */
- attr_name_end++;
+ if (*in == '"' || *in == '\'' || *in == '<') {
+ /* Should never be in attribute names but ignored */
+ tag->flags |= FL_BROKEN;
}
else {
- return;
- }
-
- parser_env.cur_component = find_tag_component_name(pool,
- parser_env.saved_p,
- attr_name_end);
-
- if (!parser_env.cur_component) {
- /* Ignore unknown params */
- parser_env.saved_p = nullptr;
- }
- else if (state == parse_value) {
- parser_env.saved_p = in + 1;
+ store_value_character(true);
}
}
break;
- case spaces_after_name:
- if (!g_ascii_isspace (*in)) {
- parser_env.saved_p = in;
-
- if (*in == '/') {
- tag->flags |= FL_CLOSED;
- }
- else if (*in != '>') {
- state = parse_attr_name;
- }
- }
- break;
-
case spaces_before_eq:
if (*in == '=') {
state = parse_equal;
@@ -468,21 +413,19 @@ html_parse_tag_content(rspamd_mempool_t *pool,
* Should be okay (empty attribute). The rest is handled outside
* this automata.
*/
-
+ state = tag_end;
}
- else if (*in == '"' || *in == '\'') {
+ else if (*in == '"' || *in == '\'' || *in == '<') {
/* Attribute followed by quote... Missing '=' ? Dunno, need to test */
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
tag->flags |= FL_BROKEN;
- state = ignore_bad_tag;
+ store_component_value();
+ state = spaces_after_param;
}
else {
- /*
- * Just start another attribute ignoring an empty attributes for
- * now. We don't use them in fact...
- */
- state = parse_attr_name;
- parser_env.saved_p = in;
+ /* Empty attribute */
+ store_component_value();
+ state = spaces_after_param;
}
}
break;
@@ -495,10 +438,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_start_squote;
}
else if (!g_ascii_isspace (*in)) {
- if (parser_env.saved_p != nullptr) {
- /* We need to save this param */
- parser_env.saved_p = in;
- }
+ store_value_character(true);
state = parse_value;
}
break;
@@ -514,81 +454,63 @@ html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_start_squote;
}
else {
- if (parser_env.saved_p != nullptr) {
- /* We need to save this param */
- parser_env.saved_p = in;
- }
+ store_value_character(true);
state = parse_value;
}
break;
case parse_start_dquote:
if (*in == '"') {
- if (parser_env.saved_p != nullptr) {
- /* We have an empty attribute value */
- parser_env.saved_p = nullptr;
- }
state = spaces_after_param;
}
else {
- if (parser_env.saved_p != nullptr) {
- /* We need to save this param */
- parser_env.saved_p = in;
- }
+ store_value_character(false);
state = parse_dqvalue;
}
break;
case parse_start_squote:
if (*in == '\'') {
- if (parser_env.saved_p != nullptr) {
- /* We have an empty attribute value */
- parser_env.saved_p = nullptr;
- }
state = spaces_after_param;
}
else {
- if (parser_env.saved_p != nullptr) {
- /* We need to save this param */
- parser_env.saved_p = in;
- }
+ store_value_character(false);
state = parse_sqvalue;
}
break;
case parse_dqvalue:
if (*in == '"') {
- store = TRUE;
+ store_component_value();
state = parse_end_dquote;
}
-
- if (store) {
- store_tag_component();
+ else {
+ store_value_character(false);
}
break;
case parse_sqvalue:
if (*in == '\'') {
- store = TRUE;
+ store_component_value();
state = parse_end_squote;
}
- if (store) {
- store_tag_component();
+ else {
+ store_value_character(false);
}
+
break;
case parse_value:
if (*in == '/' && *(in + 1) == '>') {
tag->flags |= FL_CLOSED;
- store = TRUE;
+ store_component_value();
}
else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
- store = TRUE;
+ store_component_value();
state = spaces_after_param;
}
-
- if (store) {
- store_tag_component();
+ else {
+ store_value_character(false);
}
break;
@@ -603,7 +525,8 @@ html_parse_tag_content(rspamd_mempool_t *pool,
else {
/* No space, proceed immediately to the attribute name */
state = parse_attr_name;
- parser_env.saved_p = in;
+ store_component_value();
+ store_value_character(true);
}
break;
@@ -612,13 +535,19 @@ html_parse_tag_content(rspamd_mempool_t *pool,
if (*in == '/' && *(in + 1) == '>') {
tag->flags |= FL_CLOSED;
}
-
- state = parse_attr_name;
- parser_env.saved_p = in;
+ else if (*in == '=') {
+ /* Attributes cannot start with '=' */
+ tag->flags |= FL_BROKEN;
+ }
+ else {
+ store_value_character(true);
+ state = parse_attr_name;
+ }
}
break;
case ignore_bad_tag:
+ case tag_end:
break;
}
More information about the Commits
mailing list