commit f8351d7: [Project] Html: More spaces logic fixes

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Jun 24 16:42:05 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-24 17:38:20 +0100
URL: https://github.com/rspamd/rspamd/commit/f8351d7aa84a526dfb8ac92fec9894f51e2a6359 (HEAD -> master)

[Project] Html: More spaces logic fixes

---
 src/libserver/html/html.cxx          | 20 ++++++++++++++++++++
 src/libserver/html/html_entities.cxx |  7 +++++++
 2 files changed, 27 insertions(+)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 45094e7f8..894b1ee45 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1849,6 +1849,26 @@ TEST_CASE("html text extraction")
 			 "</body>", "\n\n\ntest\n"},
 			{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
 			 "sh<span style=\"FONT-SIZE: 0px\">aring </span></div>", "fish\n"},
+			/* FIXME: broken until rework */
+			//{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
+			// "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
+			{"<p><!--comment-->test", "test"},
+			{"<!DOCTYPE html>\n"
+			 "<html lang=\"en\">\n"
+			 "  <head>\n"
+			 "    <meta charset=\"utf-8\">\n"
+			 "    <title>title</title>\n"
+			 "    <link rel=\"stylesheet\" href=\"style.css\">\n"
+			 "    <script src=\"script.js\"></script>\n"
+			 "  </head>\n"
+			 "  <body>\n"
+			 "    <!-- page content -->\n"
+			 "    Hello, world! <b>test</b>\n"
+			 "    <p>data<>\n"
+			 "    </P>\n"
+			 "    <b>stuff</p>?\n"
+			 "  </body>\n"
+			 "</html>", "Hello, world! test\ndata<> \nstuff?"}
 	};
 
 	rspamd_url_init(NULL);
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 573872f43..4cbdf02bf 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2552,9 +2552,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 	}
 
 	if (norm_spaces) {
+		bool seen_spaces = false;
+
 		while (t > s && g_ascii_isspace(*(t - 1))) {
+			seen_spaces = true;
 			t --;
 		}
+
+		if (seen_spaces) {
+			*t++ = ' ';
+		}
 	}
 
 	return (t - s);


More information about the Commits mailing list