commit ddda200: [Minor] Fix xml tags and comments processing

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jun 28 13:07:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-28 11:51:31 +0100
URL: https://github.com/rspamd/rspamd/commit/ddda2007228617f8689f815e6a5f944b284ec5b0

[Minor] Fix xml tags and comments processing

---
 src/libserver/html/html.cxx | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 8d312b733..c5d35105c 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -244,7 +244,7 @@ html_process_tag(rspamd_mempool_t *pool,
 
 	if (!(tag->flags & (CM_EMPTY))) {
 		/* Block tag */
-		if ((tag->flags & (FL_CLOSING | FL_CLOSED))) {
+		if (tag->flags & FL_CLOSING) {
 			/* Closed block tag */
 			if (parent == nullptr) {
 				msg_debug_html ("bad parent node");
@@ -1178,21 +1178,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		return tag->content_offset;
 	}
 
-	if (!tag->block) {
-		if ((tag->flags & (FL_COMMENT|FL_XML))) {
+	if ((tag->flags & (FL_COMMENT|FL_XML))) {
+		is_visible = false;
+	}
+	else {
+		if (!tag->block) {
+			is_visible = true;
+		}
+		else if (!tag->block->is_visible()) {
 			is_visible = false;
 		}
 		else {
-			is_visible = true;
+			is_block = tag->block->has_display() &&
+					   tag->block->display == css::css_display_value::DISPLAY_BLOCK;
 		}
 	}
-	else if (!tag->block->is_visible()) {
-		is_visible = false;
-	}
-	else {
-		is_block = tag->block->has_display() &&
-				   tag->block->display == css::css_display_value::DISPLAY_BLOCK;
-	}
 
 	if (is_block) {
 		if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
@@ -1913,6 +1913,12 @@ TEST_CASE("html text extraction")
 {
 
 	const std::vector<std::pair<std::string, std::string>> cases{
+			/* XML tags */
+			{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
+			 " <!DOCTYPE html\n"
+			 "   PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
+			 "   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
+			 "<body>test</body>", "test"},
 			{"test", "test"},
 			{"test   ", "test"},
 			{"test   foo,   bar", "test foo, bar"},
@@ -1938,6 +1944,7 @@ TEST_CASE("html text extraction")
 			//{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
 			// "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
 			{"<p><!--comment-->test", "test"},
+			/* Complex html with bad tags */
 			{"<!DOCTYPE html>\n"
 			 "<html lang=\"en\">\n"
 			 "  <head>\n"
@@ -1953,7 +1960,7 @@ TEST_CASE("html text extraction")
 			 "    </P>\n"
 			 "    <b>stuff</p>?\n"
 			 "  </body>\n"
-			 "</html>", "Hello, world! test\ndata<> \nstuff?"}
+			 "</html>", "Hello, world! test\ndata<> \nstuff?"},
 	};
 
 	rspamd_url_init(NULL);


More information about the Commits mailing list