commit 5497542: [Project] Html: Try to deal with bad unknown tags properly

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jul 5 16:42:06 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-05 17:40:59 +0100
URL: https://github.com/rspamd/rspamd/commit/549754288a5715cf278ecccd57c7ed5f9a23da8a

[Project] Html: Try to deal with bad unknown tags properly

---
 src/libserver/html/html.cxx     | 48 +++++++++++++++++++++++++++--------------
 src/libserver/html/html.hxx     |  2 +-
 src/libserver/html/html_tag.hxx |  4 ++--
 src/libserver/html/html_tags.h  |  5 +++--
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index d46d291d6..b29a7d37d 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -354,7 +354,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
 
 				if (tag_def == nullptr) {
 					hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
-					tag->id = N_TAGS;
+					/* Assign -hash to match closing tag if needed */
+					auto nhash = static_cast<std::int32_t>(std::hash<std::string_view>{}({s, nsize}));
+					/* Always negative */
+					tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
 				}
 				else {
 					tag->id = tag_def->id;
@@ -1040,7 +1043,7 @@ static auto
 html_append_tag_content(rspamd_mempool_t *pool,
 						const gchar *start, gsize len,
 						struct html_content *hc,
-						const html_tag *tag,
+						html_tag *tag,
 						GList **exceptions,
 						khash_t (rspamd_url_hash) *url_set) -> goffset
 {
@@ -1048,6 +1051,20 @@ html_append_tag_content(rspamd_mempool_t *pool,
 	goffset next_tag_offset = tag->closing.end,
 			initial_dest_offset = hc->parsed.size();
 
+	if (tag->closing.end == -1) {
+		if (tag->closing.start != -1) {
+			next_tag_offset = tag->closing.start;
+			tag->closing.end = tag->closing.start;
+		}
+		else {
+			next_tag_offset = len;
+			tag->closing.end = len;
+		}
+	}
+	if (tag->closing.start == -1) {
+		tag->closing.start = tag->closing.end;
+	}
+
 	auto append_margin = [&](char c) -> void {
 		if (is_visible) {
 			if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
@@ -1072,7 +1089,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 
 		return tag->content_offset;
 	}
-	else if (tag->id == Tag_HEAD || tag->id >= N_TAGS) {
+	else if (tag->id == Tag_HEAD) {
 		return tag->closing.end;
 	}
 
@@ -1235,7 +1252,7 @@ html_process_input(rspamd_mempool_t *pool,
 	};
 
 	auto process_opening_tag = [&]() {
-		if (cur_tag->id < N_TAGS) {
+		if (cur_tag->id > Tag_UNKNOWN) {
 			if (cur_tag->flags & CM_UNIQUE) {
 				if (!hc->tags_seen[cur_tag->id]) {
 					/* Duplicate tag has been found */
@@ -1796,12 +1813,6 @@ html_process_input(rspamd_mempool_t *pool,
 
 	/* Leftover after content */
 	switch (state) {
-	case html_text_content:
-	case content_before_start:
-		if (p > c) {
-			html_append_content(hc, {c, std::size_t(p - c)});
-		}
-		break;
 	case tag_end_opening:
 		if (cur_tag != nullptr) {
 			process_opening_tag();
@@ -1934,12 +1945,6 @@ TEST_CASE("html text extraction")
 {
 
 	const std::vector<std::pair<std::string, std::string>> cases{
-			{"</head>\n"
-			 "<body>\n"
-			 "<p> Hello. I have some bad news.\n"
-			 "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
-			 "</body>\n"
-			 "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
 			{"  <body>\n"
 			 "    <!-- escape content -->\n"
 			 "    a b a > b a < b a & b 'a "a"\n"
@@ -2003,6 +2008,7 @@ TEST_CASE("html text extraction")
 			 "        <td>data2</td>\n"
 			 "      </tr>\n"
 			 "    </table>", "heada headb\ndata1 data2\n"},
+			 /* Invalid closing br and hr + comment */
 			{"  <body>\n"
 			 "    <!-- page content -->\n"
 			 "    Hello, world!<br>test</br><br>content</hr>more content<br>\n"
@@ -2010,6 +2016,16 @@ TEST_CASE("html text extraction")
 			 "      content inside div\n"
 			 "    </div>\n"
 			 "  </body>", "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"},
+			 /* First closing tag */
+			{"</head>\n"
+			 "<body>\n"
+			 "<p> Hello. I have some bad news.\n"
+			 "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
+			 "</body>\n"
+			 "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
+			/* Invalid tags */
+			{"lol <sht> omg </sht> oh my!\n"
+			 "<name>words words</name> goodbye","lol omg oh my! words words goodbye"},
 	};
 
 	rspamd_url_init(NULL);
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index 67ef5a612..5b5d0ddc0 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -51,7 +51,7 @@ struct html_content {
 
 	/* Preallocate and reserve all internal structures */
 	html_content() {
-		tags_seen.resize(N_TAGS, false);
+		tags_seen.resize(Tag_MAX, false);
 		all_tags.reserve(128);
 		parsed.reserve(256);
 	}
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 17c15962a..a221a48ff 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -85,7 +85,7 @@ struct html_tag {
 	unsigned int tag_start = 0;
 	unsigned int content_offset = 0;
 	std::uint32_t flags = 0;
-	tag_id_t id = N_TAGS;
+	tag_id_t id = Tag_UNKNOWN;
 	html_closing_tag closing;
 
 	std::vector<html_tag_component> components;
@@ -116,7 +116,7 @@ struct html_tag {
 	}
 
 	auto clear(void) -> void {
-		id = N_TAGS;
+		id = Tag_UNKNOWN;
 		tag_start = content_offset = 0;
 		extra = std::monostate{};
 		components.clear();
diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h
index 3f209c08e..e94dd6a9a 100644
--- a/src/libserver/html/html_tags.h
+++ b/src/libserver/html/html_tags.h
@@ -22,7 +22,7 @@ extern "C" {
 
 /* Known HTML tags */
 typedef enum {
-	Tag_UNKNOWN, /**< Unknown tag! */
+	Tag_UNKNOWN = 0, /**< Unknown tag! */
 	Tag_A,      /**< A */
 	Tag_ABBR,   /**< ABBR */
 	Tag_ACRONYM, /**< ACRONYM */
@@ -143,8 +143,9 @@ typedef enum {
 	Tag_XMP,    /**< XMP */
 	Tag_XML,    /**< XML */
 	Tag_NEXTID, /**< NEXTID */
+	Tag_MAX,
 
-	N_TAGS      /**< Must be last */
+	N_TAGS  = -1 /**< Must be -1 */
 } tag_id_t;
 
 #define CM_UNKNOWN      0


More information about the Commits mailing list