commit f8351d7: [Project] Html: More spaces logic fixes
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Jun 24 16:42:05 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-24 17:38:20 +0100
URL: https://github.com/rspamd/rspamd/commit/f8351d7aa84a526dfb8ac92fec9894f51e2a6359 (HEAD -> master)
[Project] Html: More spaces logic fixes
---
src/libserver/html/html.cxx | 20 ++++++++++++++++++++
src/libserver/html/html_entities.cxx | 7 +++++++
2 files changed, 27 insertions(+)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 45094e7f8..894b1ee45 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1849,6 +1849,26 @@ TEST_CASE("html text extraction")
"</body>", "\n\n\ntest\n"},
{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
"sh<span style=\"FONT-SIZE: 0px\">aring </span></div>", "fish\n"},
+ /* FIXME: broken until rework */
+ //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
+ // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
+ {"<p><!--comment-->test", "test"},
+ {"<!DOCTYPE html>\n"
+ "<html lang=\"en\">\n"
+ " <head>\n"
+ " <meta charset=\"utf-8\">\n"
+ " <title>title</title>\n"
+ " <link rel=\"stylesheet\" href=\"style.css\">\n"
+ " <script src=\"script.js\"></script>\n"
+ " </head>\n"
+ " <body>\n"
+ " <!-- page content -->\n"
+ " Hello, world! <b>test</b>\n"
+ " <p>data<>\n"
+ " </P>\n"
+ " <b>stuff</p>?\n"
+ " </body>\n"
+ "</html>", "Hello, world! test\ndata<> \nstuff?"}
};
rspamd_url_init(NULL);
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 573872f43..4cbdf02bf 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2552,9 +2552,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
}
if (norm_spaces) {
+ bool seen_spaces = false;
+
while (t > s && g_ascii_isspace(*(t - 1))) {
+ seen_spaces = true;
t --;
}
+
+ if (seen_spaces) {
+ *t++ = ' ';
+ }
}
return (t - s);
More information about the Commits mailing list