commit 570007a: [Minor] Html: Fix some more mess with bad closing tags
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Jul 2 10:42:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-07-02 11:33:52 +0100
URL: https://github.com/rspamd/rspamd/commit/570007aa24ad1125ebc4b6ab98d383cf9b16ca3e
[Minor] Html: Fix some more mess with bad closing tags
---
src/libserver/html/html.cxx | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 82c5d213c..967411b2a 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1062,7 +1062,10 @@ html_append_tag_content(rspamd_mempool_t *pool,
};
if (tag->id == Tag_BR || tag->id == Tag_HR) {
- hc->parsed.append("\n");
+
+ if (!(tag->flags & FL_IGNORE)) {
+ hc->parsed.append("\n");
+ }
return tag->content_offset;
}
@@ -1331,6 +1334,10 @@ html_process_input(rspamd_mempool_t *pool,
if (!(cur_tag->flags & CM_EMPTY)) {
html_process_block_tag(pool, cur_tag, hc);
}
+ else {
+ /* Implicitly close */
+ cur_tag->flags |= FL_CLOSED;
+ }
if (cur_tag->flags & FL_CLOSED) {
cur_tag->closing.end = cur_tag->content_offset;
@@ -1660,6 +1667,11 @@ html_process_input(rspamd_mempool_t *pool,
break;
case tag_end_closing: {
if (cur_tag) {
+
+ if (cur_tag->flags & CM_EMPTY) {
+ /* Ignore closing empty tags */
+ cur_tag->flags |= FL_IGNORE;
+ }
/* cur_tag here is a closing tag */
auto *next_cur_tag = html_check_balance(hc, cur_tag,
c - start, p - start + 1);
@@ -1687,7 +1699,7 @@ html_process_input(rspamd_mempool_t *pool,
auto &&vtag = std::make_unique<html_tag>();
vtag->id = cur_tag->id;
- vtag->flags = FL_VIRTUAL | FL_CLOSED;
+ vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
vtag->tag_start = cur_tag->closing.start;
vtag->content_offset = p - start + 1;
vtag->closing = cur_tag->closing;
@@ -1918,6 +1930,13 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
+ {" <body>\n"
+ " <!-- page content -->\n"
+ " Hello, world!<br>test</br><br>content</hr>more content<br>\n"
+ " <div>\n"
+ " content inside div\n"
+ " </div>\n"
+ " </body>", "Hello, world!\ntest\ncontent\nmore content\ncontent inside div\n"},
{" <body>\n"
" <!-- escape content -->\n"
" a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
@@ -1981,6 +2000,7 @@ TEST_CASE("html text extraction")
" <td>data2</td>\n"
" </tr>\n"
" </table>", "heada headb\ndata1 data2\n"},
+
};
rspamd_url_init(NULL);
More information about the Commits mailing list