commit edf974f: [Minor] Switch from head state on meaningful tags

Tue Jul 13 15:56:05 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-07-13 16:45:46 +0100
URL: https://github.com/rspamd/rspamd/commit/edf974f4d07d08009fe51409d834cf4a0352e792

[Minor] Switch from head state on meaningful tags

---
 src/libserver/html/html.cxx          |  3 +++
 src/libserver/html/html_entities.cxx |  4 ++--
 src/libserver/html/html_tests.cxx    | 29 ++++++++++++++++++++---------
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index bde7c0117..cf12b0a01 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1724,6 +1724,9 @@ html_process_input(rspamd_mempool_t *pool,
 
 							html_document_state = html_document_state::body;
 						}
+						else if (cur_tag->id == Tag_BODY) {
+							html_document_state = html_document_state::body;
+						}
 					}
 				}
 
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index fa19463a0..95eb9f988 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2579,9 +2579,9 @@ decode_html_entitles_inplace(std::string &st) -> void
 	st.resize(nlen);
 }
 
-TEST_SUITE("html") {
+TEST_SUITE("html entities") {
 
-	TEST_CASE("html entities") {
+	TEST_CASE("html entities decode") {
 		std::vector<std::pair<std::string, std::string>> cases{
 				{"", ""},
 				{"abc", "abc"},
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
index a0c60b299..ac06a353b 100644
--- a/src/libserver/html/html_tests.cxx
+++ b/src/libserver/html/html_tests.cxx
@@ -220,9 +220,15 @@ TEST_CASE("html text extraction")
 TEST_CASE("html urls extraction")
 {
 	using namespace std::string_literals;
-	const std::vector<std::pair<std::string, std::vector<std::string>>> cases{
-			{"<a href=\"https://example.com\">test</a>", {"https://example.com"}},
-			{"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}},
+	const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{
+			{"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"},
+			{"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"},
+			{"<html>\n"
+			 "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n"
+			 "<body>\n"
+			 "<a href=\"https://www.example.com\">hello</a>\n"
+			 "</body>\n"
+			 "</html>", {"https://www.example.com"}, "hello"}
 	};
 
 	rspamd_url_init(NULL);
@@ -232,15 +238,20 @@ TEST_CASE("html urls extraction")
 	for (const auto &c : cases) {
 		SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) {
 			GPtrArray *purls = g_ptr_array_new();
-			GByteArray *tmp = g_byte_array_sized_new(c.first.size());
-			g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+			auto input = std::get<0>(c);
+			GByteArray *tmp = g_byte_array_sized_new(input.size());
+			g_byte_array_append(tmp, (const guint8 *)input.data(), input.size());
 			auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true);
 			CHECK(hc != nullptr);
-			auto expected = c.second;
-			CHECK(expected.size() == purls->len);
-			for (auto j = 0; j < expected.size(); ++j) {
+			auto &expected_text = std::get<2>(c);
+			if (expected_text.has_value()) {
+				CHECK(hc->parsed == expected_text.value());
+			}
+			const auto &expected_urls = std::get<1>(c);
+			CHECK(expected_urls.size() == purls->len);
+			for (auto j = 0; j < expected_urls.size(); ++j) {
 				auto *url = (rspamd_url *)g_ptr_array_index(purls, j);
-				CHECK(expected[j] == std::string{url->string, url->urllen});
+				CHECK(expected_urls[j] == std::string{url->string, url->urllen});
 			}
 			g_byte_array_free(tmp, TRUE);
 			g_ptr_array_free(purls, TRUE);