commit 4ce4d61: [Project] Html: Replace \0 in html content

Mon Jul 12 15:56:05 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-07-12 16:50:07 +0100
URL: https://github.com/rspamd/rspamd/commit/4ce4d6163e3c28e548ddb306fc3f52a82394a02b (HEAD -> master)

[Project] Html: Replace \0 in html content

---
 src/libserver/html/html.cxx | 46 +++++++++++++++++++++++++++++++++------------
 src/libserver/html/html.h   |  1 +
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index e4a3097d0..d5a351341 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -968,7 +968,26 @@ html_append_content(struct html_content *hc, std::string_view data, bool transpa
 			}
 		}
 
-		hc->parsed.append(data);
+		if (data.find('\0') != data.npos) {
+			/* Append input to output, NUL -> U+FFFD; output is a reference so the caller's string is the one modified */
+			auto replace_zero_func = [](const auto &input, auto &output) {
+				for (const auto ch : input) {
+					if (ch == '\0') {
+						output.append(u8"\uFFFD");
+					}
+					else {
+						output.push_back(ch);
+					}
+				}
+			};
+
+			hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD"));
+			replace_zero_func(data, hc->parsed);
+			hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
+		}
+		else {
+			hc->parsed.append(data);
+		}
 	}
 
 	auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset,
@@ -2002,19 +2021,12 @@ TEST_CASE("html parsing")
 
 TEST_CASE("html text extraction")
 {
-
+	using namespace std::string_literals;
 	const std::vector<std::pair<std::string, std::string>> cases{
-			{"  <body>\n"
-			 "    <!-- escape content -->\n"
-			 "    a b a > b a < b a & b 'a "a"\n"
-			 "  </body>", R"|(a b a > b a < b a & b 'a "a")|"},
-			/* XML tags */
-			{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
-			 " <!DOCTYPE html\n"
-			 "   PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
-			 "   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
-			 "<body>test</body>", "test"},
 			{"test", "test"},
+			{"test\0"s, "test\uFFFD"},
+			{"test\0test"s, "test\uFFFDtest"},
+			{"test\0\0test"s, "test\uFFFD\uFFFDtest"},
 			{"test   ", "test"},
 			{"test   foo,   bar", "test foo, bar"},
 			{"<p>text</p>", "text\n"},
@@ -2025,6 +2037,16 @@ TEST_CASE("html text extraction")
 			{"foo<br>baz", "foo\nbaz"},
 			{"<a href=https://example.com>test</a>", "test"},
 			{"<img alt=test>", "test"},
+			{"  <body>\n"
+			 "    <!-- escape content -->\n"
+			 "    a b a > b a < b a & b 'a "a"\n"
+			 "  </body>", R"|(a b a > b a < b a & b 'a "a")|"},
+			/* XML tags */
+			{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
+			 " <!DOCTYPE html\n"
+			 "   PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
+			 "   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
+			 "<body>test</body>", "test"},
 			{"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
 			 "  <body>\n"
 			 "    <p><br>\n"
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index b6307f88f..cc8039c22 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -36,6 +36,7 @@ extern "C" {
 #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
 #define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6)
 #define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7)
+#define RSPAMD_HTML_FLAG_HAS_ZEROS (1 << 8) /* set when appended html content contained \0 characters */
 
 /*
  * Image flags