commit f2f16de: [Project] Html: Add rows display type support

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Jul 1 16:49:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-01 17:46:31 +0100
URL: https://github.com/rspamd/rspamd/commit/f2f16de4ab5f5c2ad58d67704ff040ed96058823 (HEAD -> master)

[Project] Html: Add rows display type support

---
 src/libserver/css/css_value.cxx | 19 +++++++------
 src/libserver/css/css_value.hxx |  1 +
 src/libserver/html/html.cxx     | 63 ++++++++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/src/libserver/css/css_value.cxx b/src/libserver/css/css_value.cxx
index 6988ea993..ec44b86a6 100644
--- a/src/libserver/css/css_value.cxx
+++ b/src/libserver/css/css_value.cxx
@@ -310,14 +310,14 @@ constexpr const auto display_names_map = frozen::make_unordered_map<frozen::stri
 		{"list-item",          css_display_value::DISPLAY_BLOCK},
 		{"run-in",             css_display_value::DISPLAY_INLINE},
 		{"table",              css_display_value::DISPLAY_BLOCK},
-		{"table-caption",      css_display_value::DISPLAY_BLOCK},
-		{"table-column-group", css_display_value::DISPLAY_BLOCK},
-		{"table-header-group", css_display_value::DISPLAY_BLOCK},
-		{"table-footer-group", css_display_value::DISPLAY_BLOCK},
-		{"table-row-group",    css_display_value::DISPLAY_BLOCK},
-		{"table-cell",         css_display_value::DISPLAY_BLOCK},
-		{"table-column",       css_display_value::DISPLAY_BLOCK},
-		{"table-row",          css_display_value::DISPLAY_BLOCK},
+		{"table-caption",      css_display_value::DISPLAY_TABLE_ROW},
+		{"table-column-group", css_display_value::DISPLAY_TABLE_ROW},
+		{"table-header-group", css_display_value::DISPLAY_TABLE_ROW},
+		{"table-footer-group", css_display_value::DISPLAY_TABLE_ROW},
+		{"table-row-group",    css_display_value::DISPLAY_TABLE_ROW},
+		{"table-cell",         css_display_value::DISPLAY_TABLE_ROW},
+		{"table-column",       css_display_value::DISPLAY_TABLE_ROW},
+		{"table-row",          css_display_value::DISPLAY_TABLE_ROW},
 		{"initial",            css_display_value::DISPLAY_INLINE},
 });
 
@@ -364,6 +364,9 @@ auto css_value::debug_str() const -> std::string {
 			case css_display_value::DISPLAY_INLINE:
 				ret += "inline";
 				break;
+			case css_display_value::DISPLAY_TABLE_ROW:
+				ret += "table_row";
+				break;
 			}
 		}
 		else if constexpr (std::is_integral_v<T>) {
diff --git a/src/libserver/css/css_value.hxx b/src/libserver/css/css_value.hxx
index d3d06a544..8dcfa63da 100644
--- a/src/libserver/css/css_value.hxx
+++ b/src/libserver/css/css_value.hxx
@@ -75,6 +75,7 @@ struct css_dimension {
 enum class css_display_value : std::uint8_t {
 	DISPLAY_INLINE,
 	DISPLAY_BLOCK,
+	DISPLAY_TABLE_ROW,
 	DISPLAY_HIDDEN
 };
 
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 088202286..ae73b7413 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1041,10 +1041,26 @@ html_append_tag_content(rspamd_mempool_t *pool,
 						GList **exceptions,
 						khash_t (rspamd_url_hash) *url_set) -> goffset
 {
-	auto is_visible = true, is_block = false;
+	auto is_visible = true, is_block = false, is_spaces = false;
 	goffset next_tag_offset = tag->closing.end,
 			initial_dest_offset = hc->parsed.size();
 
+	auto append_margin = [&](char c) -> void {
+		if (is_visible) {
+			if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
+				if (hc->parsed.back() == ' ') {
+					/* We also strip extra spaces at the end */
+					hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
+							[](auto ch) -> auto {
+								return ch != ' ';
+							}).base(),
+							hc->parsed.end());
+				}
+				hc->parsed.push_back(c);
+			}
+		}
+	};
+
 	if (tag->id == Tag_BR || tag->id == Tag_HR) {
 		hc->parsed.append("\n");
 
@@ -1064,16 +1080,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		else if (!tag->block->is_visible()) {
 			is_visible = false;
 		}
-		else {
-			is_block = tag->block->has_display() &&
-					   tag->block->display == css::css_display_value::DISPLAY_BLOCK;
+		else if (tag->block->has_display()) {
+			if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+				is_block = true;
+			}
+			else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
+				is_spaces = true;
+			}
 		}
 	}
 
 	if (is_block) {
-		if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
-			hc->parsed.append("\n");
-		}
+		append_margin('\n');
+	}
+	else if (is_spaces) {
+		append_margin(' ');
 	}
 
 	goffset cur_offset = tag->content_offset;
@@ -1104,11 +1125,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
 									 std::size_t(final_part_len)});
 		}
 	}
-
-	if (is_block && is_visible) {
-		if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
-			hc->parsed.append("\n");
-		}
+	if (is_block) {
+		append_margin('\n');
+	}
+	else if (is_spaces) {
+		append_margin(' ');
 	}
 
 	if (is_visible) {
@@ -1707,12 +1728,15 @@ html_process_input(rspamd_mempool_t *pool,
 		if (tag->block) {
 			if (!tag->block->has_display()) {
 				/* If we have no display field, we can check it by tag */
-				if (tag->flags & CM_BLOCK) {
+				if (tag->flags & (CM_BLOCK|CM_TABLE)) {
 					tag->block->set_display(css::css_display_value::DISPLAY_BLOCK);
 				}
 				else if (tag->flags & CM_HEAD) {
 					tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN);
 				}
+				else if (tag->flags & CM_ROW) {
+					tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW);
+				}
 				else {
 					tag->block->set_display(css::css_display_value::DISPLAY_INLINE);
 				}
@@ -1892,6 +1916,17 @@ TEST_CASE("html text extraction")
 {
 
 	const std::vector<std::pair<std::string, std::string>> cases{
+			/* Tables */
+			{"<table>\n"
+			 "      <tr>\n"
+			 "        <th>heada</th>\n"
+			 "        <th>headb</th>\n"
+			 "      </tr>\n"
+			 "      <tr>\n"
+			 "        <td>data1</td>\n"
+			 "        <td>data2</td>\n"
+			 "      </tr>\n"
+			 "    </table>", "heada headb\ndata1 data2\n"},
 			/* XML tags */
 			{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
 			 " <!DOCTYPE html\n"
@@ -1938,7 +1973,7 @@ TEST_CASE("html text extraction")
 			 "    </P>\n"
 			 "    <b>stuff</p>?\n"
 			 "  </body>\n"
-			 "</html>", "Hello, world! test\ndata<> \nstuff?"},
+			 "</html>", "Hello, world! test\ndata<>\nstuff?"},
 			{"<p><!--comment-->test</br></hr><br>", "test\n"},
 
 	};


More information about the Commits mailing list