commit f2f16de: [Project] Html: Add rows display type support
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Jul 1 16:49:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-07-01 17:46:31 +0100
URL: https://github.com/rspamd/rspamd/commit/f2f16de4ab5f5c2ad58d67704ff040ed96058823 (HEAD -> master)
[Project] Html: Add rows display type support
---
src/libserver/css/css_value.cxx | 19 +++++++------
src/libserver/css/css_value.hxx | 1 +
src/libserver/html/html.cxx | 63 ++++++++++++++++++++++++++++++++---------
3 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/src/libserver/css/css_value.cxx b/src/libserver/css/css_value.cxx
index 6988ea993..ec44b86a6 100644
--- a/src/libserver/css/css_value.cxx
+++ b/src/libserver/css/css_value.cxx
@@ -310,14 +310,14 @@ constexpr const auto display_names_map = frozen::make_unordered_map<frozen::stri
{"list-item", css_display_value::DISPLAY_BLOCK},
{"run-in", css_display_value::DISPLAY_INLINE},
{"table", css_display_value::DISPLAY_BLOCK},
- {"table-caption", css_display_value::DISPLAY_BLOCK},
- {"table-column-group", css_display_value::DISPLAY_BLOCK},
- {"table-header-group", css_display_value::DISPLAY_BLOCK},
- {"table-footer-group", css_display_value::DISPLAY_BLOCK},
- {"table-row-group", css_display_value::DISPLAY_BLOCK},
- {"table-cell", css_display_value::DISPLAY_BLOCK},
- {"table-column", css_display_value::DISPLAY_BLOCK},
- {"table-row", css_display_value::DISPLAY_BLOCK},
+ {"table-caption", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-column-group", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-header-group", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-footer-group", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-row-group", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-cell", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-column", css_display_value::DISPLAY_TABLE_ROW},
+ {"table-row", css_display_value::DISPLAY_TABLE_ROW},
{"initial", css_display_value::DISPLAY_INLINE},
});
@@ -364,6 +364,9 @@ auto css_value::debug_str() const -> std::string {
case css_display_value::DISPLAY_INLINE:
ret += "inline";
break;
+ case css_display_value::DISPLAY_TABLE_ROW:
+ ret += "table_row";
+ break;
}
}
else if constexpr (std::is_integral_v<T>) {
diff --git a/src/libserver/css/css_value.hxx b/src/libserver/css/css_value.hxx
index d3d06a544..8dcfa63da 100644
--- a/src/libserver/css/css_value.hxx
+++ b/src/libserver/css/css_value.hxx
@@ -75,6 +75,7 @@ struct css_dimension {
enum class css_display_value : std::uint8_t {
DISPLAY_INLINE,
DISPLAY_BLOCK,
+ DISPLAY_TABLE_ROW,
DISPLAY_HIDDEN
};
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 088202286..ae73b7413 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1041,10 +1041,26 @@ html_append_tag_content(rspamd_mempool_t *pool,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set) -> goffset
{
- auto is_visible = true, is_block = false;
+ auto is_visible = true, is_block = false, is_spaces = false;
goffset next_tag_offset = tag->closing.end,
initial_dest_offset = hc->parsed.size();
+ auto append_margin = [&](char c) -> void {
+ if (is_visible) {
+ if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
+ if (hc->parsed.back() == ' ') {
+ /* We also strip extra spaces at the end */
+ hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
+ [](auto ch) -> auto {
+ return ch != ' ';
+ }).base(),
+ hc->parsed.end());
+ }
+ hc->parsed.push_back(c);
+ }
+ }
+ };
+
if (tag->id == Tag_BR || tag->id == Tag_HR) {
hc->parsed.append("\n");
@@ -1064,16 +1080,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
else if (!tag->block->is_visible()) {
is_visible = false;
}
- else {
- is_block = tag->block->has_display() &&
- tag->block->display == css::css_display_value::DISPLAY_BLOCK;
+ else if (tag->block->has_display()) {
+ if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+ is_block = true;
+ }
+ else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
+ is_spaces = true;
+ }
}
}
if (is_block) {
- if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
- hc->parsed.append("\n");
- }
+ append_margin('\n');
+ }
+ else if (is_spaces) {
+ append_margin(' ');
}
goffset cur_offset = tag->content_offset;
@@ -1104,11 +1125,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
std::size_t(final_part_len)});
}
}
-
- if (is_block && is_visible) {
- if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
- hc->parsed.append("\n");
- }
+ if (is_block) {
+ append_margin('\n');
+ }
+ else if (is_spaces) {
+ append_margin(' ');
}
if (is_visible) {
@@ -1707,12 +1728,15 @@ html_process_input(rspamd_mempool_t *pool,
if (tag->block) {
if (!tag->block->has_display()) {
/* If we have no display field, we can check it by tag */
- if (tag->flags & CM_BLOCK) {
+ if (tag->flags & (CM_BLOCK|CM_TABLE)) {
tag->block->set_display(css::css_display_value::DISPLAY_BLOCK);
}
else if (tag->flags & CM_HEAD) {
tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN);
}
+ else if (tag->flags & CM_ROW) {
+ tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW);
+ }
else {
tag->block->set_display(css::css_display_value::DISPLAY_INLINE);
}
@@ -1892,6 +1916,17 @@ TEST_CASE("html text extraction")
{
const std::vector<std::pair<std::string, std::string>> cases{
+ /* Tables */
+ {"<table>\n"
+ " <tr>\n"
+ " <th>heada</th>\n"
+ " <th>headb</th>\n"
+ " </tr>\n"
+ " <tr>\n"
+ " <td>data1</td>\n"
+ " <td>data2</td>\n"
+ " </tr>\n"
+ " </table>", "heada headb\ndata1 data2\n"},
/* XML tags */
{"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
" <!DOCTYPE html\n"
@@ -1938,7 +1973,7 @@ TEST_CASE("html text extraction")
" </P>\n"
" <b>stuff</p>?\n"
" </body>\n"
- "</html>", "Hello, world! test\ndata<> \nstuff?"},
+ "</html>", "Hello, world! test\ndata<>\nstuff?"},
{"<p><!--comment-->test</br></hr><br>", "test\n"},
};
More information about the Commits mailing list