commit 4a51df3: [Project] Html: Implement rawtext state machine
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Jul 13 15:56:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-07-13 14:57:07 +0100
URL: https://github.com/rspamd/rspamd/commit/4a51df3cc2e822a1401137698adc94bfa49d229a
[Project] Html: Implement rawtext state machine
---
src/libserver/html/html.cxx | 74 ++++++++++++++++++++++-----------------------
1 file changed, 37 insertions(+), 37 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 5d2479ab4..bde7c0117 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1223,7 +1223,8 @@ html_process_input(rspamd_mempool_t *pool,
tag_end_closing,
html_text_content,
xml_tag_end,
- content_style,
+ tag_raw_text,
+ tag_raw_text_less_than,
tags_limit_overflow,
} state = parse_start;
@@ -1643,44 +1644,24 @@ html_process_input(rspamd_mempool_t *pool,
}
break;
- case content_style: {
-
- /*
- * We just search for the first </style> substring and then pass
- * the content to the parser (if needed)
- *
- * TODO: Handle other stuff, we actually need an FSM here to find
- * the ending tag...
- */
- auto end_style = rspamd_substring_search_caseless(p, end - p,
- "</style>", 8);
- if (end_style == -1) {
- /* Invalid style */
- state = html_text_content;
+ case tag_raw_text:
+ if (t == '<') {
+ c = p;
+ state = tag_raw_text_less_than;
}
- else {
-
- if (allow_css) {
- auto ret_maybe = rspamd::css::parse_css(pool, {p, std::size_t(end_style)},
- std::move(hc->css_style));
-
- if (!ret_maybe.has_value()) {
- auto err_str = fmt::format("cannot parse css (error code: {}): {}",
- static_cast<int>(ret_maybe.error().type),
- ret_maybe.error().description.value_or("unknown error"));
- msg_info_pool ("cannot parse css: %*s",
- (int) err_str.size(), err_str.data());
- }
- else {
- hc->css_style = ret_maybe.value();
- }
- }
-
- p += end_style;
+ p ++;
+ break;
+ case tag_raw_text_less_than:
+ if (t == '/') {
+ /* Shift back */
+ p = c;
state = tag_begin;
}
+ else {
+ p ++;
+ state = tag_raw_text;
+ }
break;
- }
case sgml_content:
/* TODO: parse DOCTYPE here */
if (t == '>') {
@@ -1719,8 +1700,8 @@ html_process_input(rspamd_mempool_t *pool,
state = html_text_content;
if (cur_tag) {
- if (cur_tag->id == Tag_STYLE) {
- state = content_style;
+ if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
+ state = tag_raw_text;
}
if (html_document_state == html_document_state::doctype) {
if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
@@ -1806,6 +1787,25 @@ html_process_input(rspamd_mempool_t *pool,
parent_tag = cur_tag->parent;
g_assert(cur_tag->parent != &cur_closing_tag);
}
+
+ if (cur_tag->id == Tag_STYLE && cur_tag->closing.start > cur_tag->content_offset) {
+ if (allow_css) {
+ auto ret_maybe = rspamd::css::parse_css(pool,
+ {start + cur_tag->content_offset, cur_tag->closing.start - cur_tag->content_offset},
+ std::move(hc->css_style));
+
+ if (!ret_maybe.has_value()) {
+ auto err_str = fmt::format("cannot parse css (error code: {}): {}",
+ static_cast<int>(ret_maybe.error().type),
+ ret_maybe.error().description.value_or("unknown error"));
+ msg_info_pool ("cannot parse css: %*s",
+ (int) err_str.size(), err_str.data());
+ }
+ else {
+ hc->css_style = ret_maybe.value();
+ }
+ }
+ }
} /* if cur_tag != nullptr */
state = html_text_content;
p++;
More information about the Commits
mailing list