commit 909a594: [Rework] Save invisible content to a separate buffer
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Sep 7 14:07:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-09-07 15:00:38 +0100
URL: https://github.com/rspamd/rspamd/commit/909a594f8d003dbe89462ad02d1634371f42bec3
[Rework] Save invisible content to a separate buffer
---
src/libserver/html/html.cxx | 115 ++++++++++++++++++++++++++++------------
src/libserver/html/html.hxx | 1 +
src/libserver/html/html_tag.hxx | 16 ++----
src/lua/lua_html.cxx | 6 +--
4 files changed, 89 insertions(+), 49 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e4cc137b4..97009749f 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -985,12 +985,15 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
}
static inline auto
-html_append_parsed(struct html_content *hc, std::string_view data, bool transparent,
- std::size_t input_len) -> std::size_t
+html_append_parsed(struct html_content *hc,
+ std::string_view data,
+ bool transparent,
+ std::size_t input_len,
+ std::string &dest) -> std::size_t
{
- auto cur_offset = hc->parsed.size();
+ auto cur_offset = dest.size();
- if (hc->parsed.size() > input_len) {
+ if (dest.size() > input_len) {
/* Impossible case, refuse to append */
return 0;
}
@@ -999,9 +1002,9 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar
/* Handle multiple spaces at the begin */
if (cur_offset > 0) {
- auto last = hc->parsed.back();
+ auto last = dest.back();
if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
- hc->parsed.append(" ");
+ dest.append(" ");
data = {data.data() + 1, data.size() - 1};
cur_offset++;
}
@@ -1020,24 +1023,24 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar
}
};
- hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD"));
- replace_zero_func(data, hc->parsed);
+ dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
+ replace_zero_func(data, dest);
hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
}
else {
- hc->parsed.append(data);
+ dest.append(data);
}
}
- auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset,
- hc->parsed.size() - cur_offset, true);
+ auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
+ dest.size() - cur_offset, true);
- hc->parsed.resize(nlen + cur_offset);
+ dest.resize(nlen + cur_offset);
if (transparent) {
/* Replace all visible characters with spaces */
- auto start = std::next(hc->parsed.begin(), cur_offset);
- std::replace_if(start, std::end(hc->parsed), [](const auto c) {
+ auto start = std::next(dest.begin(), cur_offset);
+ std::replace_if(start, std::end(dest), [](const auto c) {
return !g_ascii_isspace(c);
}, ' ');
}
@@ -1076,11 +1079,18 @@ html_append_tag_content(rspamd_mempool_t *pool,
{
auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
goffset next_tag_offset = tag->closing.end,
- initial_dest_offset = hc->parsed.size();
+ initial_parsed_offset = hc->parsed.size(),
+ initial_invisible_offset = hc->invisible.size();
- auto calculate_final_tag_offsets = [&tag, initial_dest_offset, hc]() -> void {
- tag->content_offset = initial_dest_offset;
- tag->closing.start = hc->parsed.size();
+ auto calculate_final_tag_offsets = [&]() -> void {
+ if (is_visible) {
+ tag->content_offset = initial_parsed_offset;
+ tag->closing.start = hc->parsed.size();
+ }
+ else {
+ tag->content_offset = initial_invisible_offset;
+ tag->closing.start = hc->invisible.size();
+ }
};
if (tag->closing.end == -1) {
@@ -1098,17 +1108,18 @@ html_append_tag_content(rspamd_mempool_t *pool,
}
auto append_margin = [&](char c) -> void {
+ /* We do care about visible margins only */
if (is_visible) {
if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
if (hc->parsed.back() == ' ') {
/* We also strip extra spaces at the end, but limiting the start */
- auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_dest_offset);
+ auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
auto first = std::find_if(hc->parsed.rbegin(), last,
[](auto ch) -> auto {
return ch != ' ';
});
hc->parsed.erase(first.base(), hc->parsed.end());
- g_assert(hc->parsed.size() >= initial_dest_offset);
+ g_assert(hc->parsed.size() >= initial_parsed_offset);
}
hc->parsed.push_back(c);
}
@@ -1177,10 +1188,17 @@ html_append_tag_content(rspamd_mempool_t *pool,
auto enclosed_start = cld->tag_start;
goffset initial_part_len = enclosed_start - cur_offset;
- if (is_visible && initial_part_len > 0) {
- html_append_parsed(hc,
- {start + cur_offset, std::size_t(initial_part_len)},
- is_transparent, len);
+ if (initial_part_len > 0) {
+ if (is_visible) {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(initial_part_len)},
+ is_transparent, len, hc->parsed);
+ }
+ else {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(initial_part_len)},
+ is_transparent, len, hc->invisible);
+ }
}
auto next_offset = html_append_tag_content(pool, start, len,
@@ -1195,11 +1213,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
if (cur_offset < tag->closing.start) {
goffset final_part_len = tag->closing.start - cur_offset;
- if (is_visible && final_part_len > 0) {
- html_append_parsed(hc,
- {start + cur_offset, std::size_t(final_part_len)},
- is_transparent,
- len);
+ if (final_part_len > 0) {
+ if (is_visible) {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(final_part_len)},
+ is_transparent,
+ len,
+ hc->parsed);
+ }
+ else {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(final_part_len)},
+ is_transparent,
+ len,
+ hc->invisible);
+ }
}
}
if (is_block) {
@@ -1211,11 +1239,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
if (is_visible) {
if (tag->id == Tag_A) {
- auto written_len = hc->parsed.size() - initial_dest_offset;
+ auto written_len = hc->parsed.size() - initial_parsed_offset;
html_process_displayed_href_tag(pool, hc,
- {hc->parsed.data() + initial_dest_offset, written_len},
+ {hc->parsed.data() + initial_parsed_offset, written_len},
tag, exceptions,
- url_set, initial_dest_offset);
+ url_set, initial_parsed_offset);
}
else if (tag->id == Tag_IMG) {
/* Process ALT if presented */
@@ -1997,7 +2025,7 @@ html_process_input(rspamd_mempool_t *pool,
break;
case tags_limit_overflow:
html_append_parsed(hc, {c, (std::size_t) (end - c)},
- false, end - start);
+ false, end - start, hc->parsed);
break;
default:
/* Do nothing */
@@ -2084,6 +2112,27 @@ auto html_tag_by_name(const std::string_view &name)
return std::nullopt;
}
+auto
+html_tag::get_content(const struct html_content *hc) const -> std::string_view
+{ /*
+ * Returns a non-owning view of this tag's content taken from one of the
+ * two text buffers of `hc`: `hc->invisible` when `block` is set and
+ * reports !is_visible(), `hc->parsed` in every other case.
+ * The view starts at `content_offset` and is at most get_content_length()
+ * bytes long; it is shortened when the buffer ends earlier and is empty
+ * when `content_offset` does not point inside the selected buffer.
+ * NOTE(review): html_append_tag_content picks the buffer for the offsets
+ * from its local `is_visible` flag - confirm that it always agrees with
+ * block->is_visible() as tested here.
+ */
+const std::string *dest = &hc->parsed; /* default: the visible text buffer */
+
+if (block && !block->is_visible()) {
+dest = &hc->invisible; /* this tag's own block is not visible */
+}
+const auto clen = get_content_length();
+if (content_offset < dest->size()) { /* the offset must point inside the buffer */
+if (dest->size() - content_offset >= clen) {
+return std::string_view{*dest}.substr(content_offset, clen); /* the whole content fits */
+}
+else {
+return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset); /* cut at the end of the buffer */
+}
+}
+
+return std::string_view{}; /* offset is out of range: no content */
+}
+
}
void *
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index 7e63bedce..5c16d085a 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -47,6 +47,7 @@ struct html_content {
std::vector<html_image *> images;
std::vector<std::unique_ptr<struct html_tag>> all_tags;
std::string parsed;
+ std::string invisible;
std::shared_ptr<css::css_style_sheet> css_style;
/* Preallocate and reserve all internal structures */
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index b6fc73120..5971ca179 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -31,6 +31,8 @@ struct html_image;
namespace rspamd::html {
+struct html_content; /* Forward declaration */
+
enum class html_component_type : std::uint8_t {
RSPAMD_HTML_COMPONENT_NAME = 0,
RSPAMD_HTML_COMPONENT_HREF,
@@ -141,19 +143,7 @@ struct html_tag {
return 0;
}
- constexpr auto get_content(std::string_view parsed) const -> std::string_view {
- const auto clen = get_content_length();
- if (content_offset < parsed.size()) {
- if (parsed.size() - content_offset >= clen) {
- return parsed.substr(content_offset, clen);
- }
- else {
- return parsed.substr(content_offset, parsed.size() - content_offset);
- }
- }
-
- return std::string_view{};
- }
+ auto get_content(const struct html_content *hc) const -> std::string_view;
};
static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY);
diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx
index 848bfbdf3..8767d1a11 100644
--- a/src/lua/lua_html.cxx
+++ b/src/lua/lua_html.cxx
@@ -448,7 +448,7 @@ lua_html_foreach_tag (lua_State *L)
auto *ltag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag)));
ltag->tag = tag;
ltag->html = hc;
- auto ct = ltag->tag->get_content(hc->parsed);
+ auto ct = ltag->tag->get_content(hc);
rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
lua_pushinteger (L, ct.size());
@@ -582,7 +582,7 @@ lua_html_tag_get_content (lua_State *L)
if (ltag) {
if (ltag->html) {
- auto ct = ltag->tag->get_content(ltag->html->parsed);
+ auto ct = ltag->tag->get_content(ltag->html);
if (ct.size() > 0) {
t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
rspamd_lua_setclass(L, "rspamd{text}", -1);
@@ -613,7 +613,7 @@ lua_html_tag_get_content_length (lua_State *L)
if (ltag) {
if (ltag->html) {
- auto ct = ltag->tag->get_content(ltag->html->parsed);
+ auto ct = ltag->tag->get_content(ltag->html);
lua_pushinteger (L, ct.size());
}
else {
More information about the Commits
mailing list