commit 56f52fc: [Minor] Another set of fixes in the spaces normalisation
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Jun 23 11:14:05 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-23 12:04:05 +0100
URL: https://github.com/rspamd/rspamd/commit/56f52fcf7a180a3fb8a0b803142ad13c9478e5f6
[Minor] Another set of fixes in the spaces normalisation
---
src/libserver/html/html.cxx | 24 ++++++++++++++++--------
src/libserver/html/html_entities.cxx | 23 +++++++++++++++++++++++
2 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 32862ed20..20a38ee09 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1090,9 +1090,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
initial_dest_offset = hc->parsed.size();
if (tag->id == Tag_BR || tag->id == Tag_HR) {
- if (!hc->parsed.empty()) {
- hc->parsed.append("\n");
- }
+ hc->parsed.append("\n");
return tag->content_offset;
}
@@ -1163,17 +1161,19 @@ html_append_tag_content(rspamd_mempool_t *pool,
cur_offset = html_append_tag_content(pool, start, len, hc, next_enclosed,
nested_stack, exceptions, url_set);
- initial_part_len = next_tag_offset - cur_offset;
- if (is_visible && initial_part_len > 0) {
- html_append_content(hc, {start + cur_offset,
- std::size_t(initial_part_len)});
+ if (enclosed_tags.empty()) {
+ initial_part_len = next_tag_offset - cur_offset;
+ if (is_visible && initial_part_len > 0) {
+ html_append_content(hc, {start + cur_offset,
+ std::size_t(initial_part_len)});
+ }
}
}
} while (!enclosed_tags.empty());
if (is_block && is_visible) {
- if (!hc->parsed.empty()) {
+ if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
hc->parsed.append("\n");
}
}
@@ -1817,6 +1817,14 @@ TEST_CASE("html text extraction")
{"<div>foo</div><div>bar</div>", "foo\nbar\n"},
{"<a href=https://example.com>test</a>", "test"},
{"<img alt=test>", "test"},
+ {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
+ " <body>\n"
+ " <p><br>\n"
+ " </p>\n"
+ " <div class=\"moz-forward-container\"><br>\n"
+ " <br>\n"
+ " test</div>"
+ "</body>", "\ntest\n"},
};
rspamd_url_init(NULL);
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 2cc3c11f5..97c84f64e 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2236,6 +2236,7 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
normal_content,
ampersand,
skip_multi_spaces,
+ skip_start_spaces,
} state = parser_state::normal_content;
end = s + len;
@@ -2441,6 +2442,10 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
return false;
};
+ if (norm_spaces && g_ascii_isspace(*h)) {
+ state = parser_state::skip_start_spaces;
+ }
+
while (h - s < len && t <= h) {
switch (state) {
case parser_state::normal_content:
@@ -2516,6 +2521,14 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
state = parser_state::normal_content;
}
break;
+ case parser_state::skip_start_spaces:
+ if (g_ascii_isspace(*h)) {
+ h ++;
+ }
+ else {
+ state = parser_state::normal_content;
+ }
+ break;
}
}
@@ -2537,6 +2550,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
}
}
+ if (norm_spaces && g_ascii_isspace(*t)) {
+ do {
+ t --;
+ } while (t > s && g_ascii_isspace(*t));
+
+ if (!g_ascii_isspace(*t)) {
+ t++; /* Preserve last space character */
+ }
+ }
+
return (t - s);
}
More information about the Commits mailing list