commit 570007a: [Minor] Html: Fix some more mess with bad closing tags

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Jul 2 10:42:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-02 11:33:52 +0100
URL: https://github.com/rspamd/rspamd/commit/570007aa24ad1125ebc4b6ab98d383cf9b16ca3e

[Minor] Html: Fix some more mess with bad closing tags

---
 src/libserver/html/html.cxx | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 82c5d213c..967411b2a 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1062,7 +1062,10 @@ html_append_tag_content(rspamd_mempool_t *pool,
 	};
 
 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
-		hc->parsed.append("\n");
+
+		if (!(tag->flags & FL_IGNORE)) {
+			hc->parsed.append("\n");
+		}
 
 		return tag->content_offset;
 	}
@@ -1331,6 +1334,10 @@ html_process_input(rspamd_mempool_t *pool,
 		if (!(cur_tag->flags & CM_EMPTY)) {
 			html_process_block_tag(pool, cur_tag, hc);
 		}
+		else {
+			/* Implicitly close */
+			cur_tag->flags |= FL_CLOSED;
+		}
 
 		if (cur_tag->flags & FL_CLOSED) {
 			cur_tag->closing.end = cur_tag->content_offset;
@@ -1660,6 +1667,11 @@ html_process_input(rspamd_mempool_t *pool,
 			break;
 		case tag_end_closing: {
 			if (cur_tag) {
+
+				if (cur_tag->flags & CM_EMPTY) {
+					/* Ignore closing empty tags */
+					cur_tag->flags |= FL_IGNORE;
+				}
 				/* cur_tag here is a closing tag */
 				auto *next_cur_tag = html_check_balance(hc, cur_tag,
 						c - start, p - start + 1);
@@ -1687,7 +1699,7 @@ html_process_input(rspamd_mempool_t *pool,
 
 					auto &&vtag = std::make_unique<html_tag>();
 					vtag->id = cur_tag->id;
-					vtag->flags = FL_VIRTUAL | FL_CLOSED;
+					vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
 					vtag->tag_start = cur_tag->closing.start;
 					vtag->content_offset = p - start + 1;
 					vtag->closing = cur_tag->closing;
@@ -1918,6 +1930,13 @@ TEST_CASE("html text extraction")
 {
 
 	const std::vector<std::pair<std::string, std::string>> cases{
+			{"  <body>\n"
+			 "    <!-- page content -->\n"
+			 "    Hello, world!<br>test</br><br>content</hr>more content<br>\n"
+			 "    <div>\n"
+			 "      content inside div\n"
+			 "    </div>\n"
+			 "  </body>", "Hello, world!\ntest\ncontent\nmore content\ncontent inside div\n"},
			{"  <body>\n"
			 "    <!-- escape content -->\n"
			 "    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
@@ -1981,6 +2000,7 @@ TEST_CASE("html text extraction")
 			 "        <td>data2</td>\n"
 			 "      </tr>\n"
 			 "    </table>", "heada headb\ndata1 data2\n"},
+
 	};
 
 	rspamd_url_init(NULL);


More information about the Commits mailing list