commit ddda200: [Minor] Fix xml tags and comments processing
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Jun 28 13:07:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-28 11:51:31 +0100
URL: https://github.com/rspamd/rspamd/commit/ddda2007228617f8689f815e6a5f944b284ec5b0
[Minor] Fix xml tags and comments processing
---
src/libserver/html/html.cxx | 31 +++++++++++++++++++------------
1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 8d312b733..c5d35105c 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -244,7 +244,7 @@ html_process_tag(rspamd_mempool_t *pool,
if (!(tag->flags & (CM_EMPTY))) {
/* Block tag */
- if ((tag->flags & (FL_CLOSING | FL_CLOSED))) {
+ if (tag->flags & FL_CLOSING) {
/* Closed block tag */
if (parent == nullptr) {
msg_debug_html ("bad parent node");
@@ -1178,21 +1178,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
return tag->content_offset;
}
- if (!tag->block) {
- if ((tag->flags & (FL_COMMENT|FL_XML))) {
+ if ((tag->flags & (FL_COMMENT|FL_XML))) {
+ is_visible = false;
+ }
+ else {
+ if (!tag->block) {
+ is_visible = true;
+ }
+ else if (!tag->block->is_visible()) {
is_visible = false;
}
else {
- is_visible = true;
+ is_block = tag->block->has_display() &&
+ tag->block->display == css::css_display_value::DISPLAY_BLOCK;
}
}
- else if (!tag->block->is_visible()) {
- is_visible = false;
- }
- else {
- is_block = tag->block->has_display() &&
- tag->block->display == css::css_display_value::DISPLAY_BLOCK;
- }
if (is_block) {
if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
@@ -1913,6 +1913,12 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
+ /* XML tags */
+ {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
+ " <!DOCTYPE html\n"
+ " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
+ " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
+ "<body>test</body>", "test"},
{"test", "test"},
{"test ", "test"},
{"test foo, bar", "test foo, bar"},
@@ -1938,6 +1944,7 @@ TEST_CASE("html text extraction")
//{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
// "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
{"<p><!--comment-->test", "test"},
+ /* Complex html with bad tags */
{"<!DOCTYPE html>\n"
"<html lang=\"en\">\n"
" <head>\n"
@@ -1953,7 +1960,7 @@ TEST_CASE("html text extraction")
" </P>\n"
" <b>stuff</p>?\n"
" </body>\n"
- "</html>", "Hello, world! test\ndata<> \nstuff?"}
+ "</html>", "Hello, world! test\ndata<> \nstuff?"},
};
rspamd_url_init(NULL);
More information about the Commits mailing list