commit 56f52fc: [Minor] Another set of fixes in the spaces normalisation

Wed Jun 23 11:14:05 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-06-23 12:04:05 +0100
URL: https://github.com/rspamd/rspamd/commit/56f52fcf7a180a3fb8a0b803142ad13c9478e5f6

[Minor] Another set of fixes in the spaces normalisation

---
 src/libserver/html/html.cxx          | 24 ++++++++++++++++--------
 src/libserver/html/html_entities.cxx | 23 +++++++++++++++++++++++
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 32862ed20..20a38ee09 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1090,9 +1090,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 			initial_dest_offset = hc->parsed.size();
 
 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
-		if (!hc->parsed.empty()) {
-			hc->parsed.append("\n");
-		}
+		hc->parsed.append("\n");
 
 		return tag->content_offset;
 	}
@@ -1163,17 +1161,19 @@ html_append_tag_content(rspamd_mempool_t *pool,
 			cur_offset = html_append_tag_content(pool, start, len, hc, next_enclosed,
 					nested_stack, exceptions, url_set);
 
-			initial_part_len = next_tag_offset - cur_offset;
-			if (is_visible && initial_part_len > 0) {
-				html_append_content(hc, {start + cur_offset,
-										 std::size_t(initial_part_len)});
+			if (enclosed_tags.empty()) {
+				initial_part_len = next_tag_offset - cur_offset;
+				if (is_visible && initial_part_len > 0) {
+					html_append_content(hc, {start + cur_offset,
+											 std::size_t(initial_part_len)});
+				}
 			}
 		}
 
 	} while (!enclosed_tags.empty());
 
 	if (is_block && is_visible) {
-		if (!hc->parsed.empty()) {
+		if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
 			hc->parsed.append("\n");
 		}
 	}
@@ -1817,6 +1817,14 @@ TEST_CASE("html text extraction")
 			{"<div>foo</div><div>bar</div>", "foo\nbar\n"},
 			{"<a href=https://example.com>test</a>", "test"},
 			{"<img alt=test>", "test"},
+			{"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
+			 "  <body>\n"
+			 "    <p><br>\n"
+			 "    </p>\n"
+			 "    <div class=\"moz-forward-container\"><br>\n"
+			 "      <br>\n"
+			 "      test</div>"
+			 "</body>", "\ntest\n"},
 	};
 
 	rspamd_url_init(NULL);
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 2cc3c11f5..97c84f64e 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2236,6 +2236,7 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 		normal_content,
 		ampersand,
 		skip_multi_spaces,
+		skip_start_spaces,
 	} state = parser_state::normal_content;
 
 	end = s + len;
@@ -2441,6 +2442,10 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 		return false;
 	};
 
+	if (norm_spaces && g_ascii_isspace(*h)) {
+		state = parser_state::skip_start_spaces;
+	}
+
 	while (h - s < len && t <= h) {
 		switch (state) {
 		case parser_state::normal_content:
@@ -2516,6 +2521,14 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 				state = parser_state::normal_content;
 			}
 			break;
+		case parser_state::skip_start_spaces:
+			if (g_ascii_isspace(*h)) {
+				h ++;
+			}
+			else {
+				state = parser_state::normal_content;
+			}
+			break;
 		}
 	}
 
@@ -2537,6 +2550,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 		}
 	}
 
+	if (norm_spaces && g_ascii_isspace(*t)) {
+		do {
+			t --;
+		} while (t > s && g_ascii_isspace(*t));
+
+		if (!g_ascii_isspace(*t)) {
+			t++; /* Preserve last space character */
+		}
+	}
+
 	return (t - s);
 }