commit 6f6f675: [Rework] Html: Start rework of the html content structure
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Jun 2 19:56:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-31 19:51:45 +0100
URL: https://github.com/rspamd/rspamd/commit/6f6f6758fb8414fdb93f18fb667f4919ab588f7c
[Rework] Html: Start rework of the html content structure
---
src/libserver/css/css.cxx | 2 +-
src/libserver/css/css.h | 2 +-
src/libserver/html/html.cxx | 165 +++++++++++++++++++-------------------------
src/libserver/html/html.h | 30 +++-----
4 files changed, 80 insertions(+), 119 deletions(-)
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 033ecdc22..9b0e02230 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -33,7 +33,7 @@ rspamd_css_dtor(void *p)
}
rspamd_css_ptr
-rspamd_css_parse_style(rspamd_mempool_t *pool, const guchar *begin, gsize len,
+rspamd_css_parse_style(rspamd_mempool_t *pool, const gchar *begin, gsize len,
rspamd_css_ptr existing_style,
GError **err)
{
diff --git a/src/libserver/css/css.h b/src/libserver/css/css.h
index 1dabf00b8..607f1fa2c 100644
--- a/src/libserver/css/css.h
+++ b/src/libserver/css/css.h
@@ -26,7 +26,7 @@ extern "C" {
typedef void * rspamd_css_ptr;
rspamd_css_ptr rspamd_css_parse_style (rspamd_mempool_t *pool,
- const guchar *begin,
+ const gchar *begin,
gsize len,
rspamd_css_ptr existing_style,
GError **err);
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 2f124c65f..973649791 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -19,6 +19,7 @@
#include "message.h"
#include "html.h"
#include "html_tags.h"
+#include "html.hxx"
#include "libserver/css/css_value.hxx"
#include "url.h"
@@ -112,15 +113,6 @@ html_process_tag(rspamd_mempool_t *pool,
GNode *nnode;
struct html_tag *parent;
- if (hc->html_tags == NULL) {
- nnode = g_node_new(NULL);
- *cur_level = nnode;
- hc->html_tags = nnode;
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_node_destroy,
- nnode);
- }
-
if (hc->total_tags > rspamd::html::max_tags) {
hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
}
@@ -131,6 +123,10 @@ html_process_tag(rspamd_mempool_t *pool,
return FALSE;
}
+ if (*cur_level == nullptr) {
+ *cur_level = hc->html_tags;
+ }
+
tag->parent = *cur_level;
if (!(tag->flags & (CM_INLINE | CM_EMPTY))) {
@@ -819,8 +815,7 @@ html_process_img_tag(rspamd_mempool_t *pool,
struct html_tag *tag,
struct html_content *hc,
khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls,
- GByteArray *dest)
+ GPtrArray *part_urls)
{
struct html_image *img;
@@ -1667,26 +1662,23 @@ tags_vector_ptr_dtor(void *ptr)
static auto
html_process_part_full (rspamd_mempool_t *pool,
- struct html_content *hc,
GByteArray *in,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
- bool allow_css) -> GByteArray*
+ bool allow_css) -> html_content *
{
- const guchar *p, *c, *end;
+ const gchar *p, *c, *end;
guchar t;
gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
balanced;
- GByteArray *dest;
guint obrace = 0, ebrace = 0;
GNode *cur_level = NULL;
struct rspamd_url *url = NULL;
gint len, href_offset = -1;
struct html_tag *cur_tag = NULL, *content_tag = NULL;
- std::vector<struct html_block *> styles_blocks;
+ std::vector<html_block *> blocks_stack;
struct tag_content_parser_state content_parser_env;
- tags_vector *all_tags;
enum {
parse_start = 0,
@@ -1707,25 +1699,12 @@ html_process_part_full (rspamd_mempool_t *pool,
} state = parse_start;
g_assert (in != NULL);
- g_assert (hc != NULL);
g_assert (pool != NULL);
- all_tags = new tags_vector(128);
- rspamd_mempool_add_destructor(pool, tags_vector_ptr_dtor, all_tags);
-
- hc->tags_seen = (guchar *)rspamd_mempool_alloc0 (pool, NBYTES (N_TAGS));
+ struct html_content *hc = new html_content;
+ rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
- /* Set white background color by default */
- hc->bgcolor.d.comp.alpha = 0;
- hc->bgcolor.d.comp.r = 255;
- hc->bgcolor.d.comp.g = 255;
- hc->bgcolor.d.comp.b = 255;
- hc->bgcolor.valid = TRUE;
-
- dest = g_byte_array_sized_new (in->len / 3 * 2);
- styles_blocks.reserve(32);
-
- p = in->data;
+ p = (const char *)in->data;
c = p;
end = p + in->len;
@@ -1772,8 +1751,8 @@ html_process_part_full (rspamd_mempool_t *pool,
state = tag_content;
content_parser_env.reset();
- all_tags->emplace_back(std::make_unique<html_tag>());
- cur_tag = all_tags->back().get();
+ hc->all_tags.emplace_back(std::make_unique<html_tag>());
+ cur_tag = hc->all_tags.back().get();
break;
}
@@ -1904,7 +1883,7 @@ html_process_part_full (rspamd_mempool_t *pool,
if (p > c) {
if (need_decode) {
- goffset old_offset = dest->len;
+ goffset old_offset = hc->parsed.size();
if (content_tag) {
if (content_tag->content_length == 0) {
@@ -1912,12 +1891,12 @@ html_process_part_full (rspamd_mempool_t *pool,
}
}
- g_byte_array_append (dest, c, (p - c));
+ hc->parsed.append(c, p - c);
len = decode_html_entitles_inplace(
- reinterpret_cast<gchar *>(dest->data + old_offset),
+ hc->parsed.data() + old_offset,
(std::size_t)(p - c));
- dest->len = dest->len + len - (p - c);
+ hc->parsed.resize(hc->parsed.size() + len - (p - c));
if (content_tag) {
content_tag->content_length += len;
@@ -1928,13 +1907,13 @@ html_process_part_full (rspamd_mempool_t *pool,
if (content_tag) {
if (content_tag->content_length == 0) {
- content_tag->content_offset = dest->len;
+ content_tag->content_offset = hc->parsed.size();
}
content_tag->content_length += len;
}
- g_byte_array_append (dest, c, len);
+ hc->parsed.append(c, len);
}
}
@@ -1944,10 +1923,10 @@ html_process_part_full (rspamd_mempool_t *pool,
else {
if (save_space) {
/* Append one space if needed */
- if (dest->len > 0 &&
- !g_ascii_isspace (dest->data[dest->len - 1])) {
- g_byte_array_append (dest,
- reinterpret_cast<const guint8 *>(" "), 1);
+ if (!hc->parsed.empty() &&
+ !g_ascii_isspace (hc->parsed.back())) {
+ hc->parsed += " ";
+
if (content_tag) {
if (content_tag->content_length == 0) {
/*
@@ -1956,7 +1935,7 @@ html_process_part_full (rspamd_mempool_t *pool,
* we have no set content_offset
* so we need to do it here
*/
- content_tag->content_offset = dest->len;
+ content_tag->content_offset = hc->parsed.size();
}
else {
content_tag->content_length++;
@@ -1971,19 +1950,19 @@ html_process_part_full (rspamd_mempool_t *pool,
if (c != p) {
if (need_decode) {
- goffset old_offset = dest->len;
+ goffset old_offset = hc->parsed.size();
if (content_tag) {
if (content_tag->content_length == 0) {
- content_tag->content_offset = dest->len;
+ content_tag->content_offset = hc->parsed.size();
}
}
- g_byte_array_append (dest, c, (p - c));
- len = decode_html_entitles_inplace (
- reinterpret_cast<gchar *>(dest->data + old_offset),
- p - c);
- dest->len = dest->len + len - (p - c);
+ hc->parsed.append(c, p - c);
+ len = decode_html_entitles_inplace(
+ hc->parsed.data() + old_offset,
+ (std::size_t)(p - c));
+ hc->parsed.resize(hc->parsed.size() + len - (p - c));
if (content_tag) {
content_tag->content_length += len;
@@ -1994,13 +1973,13 @@ html_process_part_full (rspamd_mempool_t *pool,
if (content_tag) {
if (content_tag->content_length == 0) {
- content_tag->content_offset = dest->len;
+ content_tag->content_offset = hc->parsed.size();
}
content_tag->content_length += len;
}
- g_byte_array_append (dest, c, len);
+ hc->parsed.append(c, len);
}
}
@@ -2019,7 +1998,7 @@ html_process_part_full (rspamd_mempool_t *pool,
* We just search for the first </s substring and then pass
* the content to the parser (if needed)
*/
- goffset end_style = rspamd_substring_search (reinterpret_cast<const gchar *>(p), end - p,
+ goffset end_style = rspamd_substring_search (p, end - p,
"</", 2);
if (end_style == -1 || g_ascii_tolower (p[end_style + 2]) != 's') {
/* Invalid style */
@@ -2066,8 +2045,7 @@ html_process_part_full (rspamd_mempool_t *pool,
break;
case tag_content:
- parse_tag_content(pool, hc, cur_tag,
- reinterpret_cast<const char *>(p), content_parser_env);
+ parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
if (t == '>') {
if (closing) {
cur_tag->flags |= FL_CLOSING;
@@ -2108,12 +2086,12 @@ html_process_part_full (rspamd_mempool_t *pool,
if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
if (cur_tag->flags & CM_UNIQUE) {
- if (isset (hc->tags_seen, cur_tag->id)) {
+ if (hc->tags_seen[cur_tag->id]) {
/* Duplicate tag has been found */
hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
}
}
- setbit (hc->tags_seen, cur_tag->id);
+ hc->tags_seen[cur_tag->id] = true;
}
if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
@@ -2122,9 +2100,10 @@ html_process_part_full (rspamd_mempool_t *pool,
/* Handle newlines */
if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
- if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
- g_byte_array_append (dest,
- reinterpret_cast<const guint8 *>("\r\n"), 2);
+ if (!hc->parsed.empty() &&
+ hc->parsed.back() != '\n') {
+
+ hc->parsed += "\r\n";
if (content_tag) {
if (content_tag->content_length == 0) {
@@ -2134,7 +2113,7 @@ html_process_part_full (rspamd_mempool_t *pool,
* we have no set content_offset
* so we need to do it here
*/
- content_tag->content_offset = dest->len;
+ content_tag->content_offset = hc->parsed.size();
}
else {
content_tag->content_length += 2;
@@ -2147,8 +2126,10 @@ html_process_part_full (rspamd_mempool_t *pool,
if ((cur_tag->id == Tag_P ||
cur_tag->id == Tag_TR ||
cur_tag->id == Tag_DIV)) {
- if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
- g_byte_array_append (dest, reinterpret_cast<const guint8 *>("\r\n"), 2);
+ if (!hc->parsed.empty() &&
+ hc->parsed.back() != '\n') {
+
+ hc->parsed += "\r\n";
if (content_tag) {
if (content_tag->content_length == 0) {
@@ -2158,7 +2139,7 @@ html_process_part_full (rspamd_mempool_t *pool,
* we have no set content_offset
* so we need to get it here
*/
- content_tag->content_offset = dest->len;
+ content_tag->content_offset = hc->parsed.size();
}
else {
content_tag->content_length += 2;
@@ -2190,7 +2171,7 @@ html_process_part_full (rspamd_mempool_t *pool,
}
}
- href_offset = dest->len;
+ href_offset = hc->parsed.size();
}
}
@@ -2207,8 +2188,8 @@ html_process_part_full (rspamd_mempool_t *pool,
prev_url = std::get<rspamd_url *>(prev_tag->extra);
std::string_view disp_part{
- reinterpret_cast<const gchar *>(dest->data + href_offset),
- dest->len - href_offset};
+ hc->parsed.data() + href_offset,
+ hc->parsed.size() - href_offset};
html_check_displayed_url (pool,
exceptions, url_set,
disp_part,
@@ -2220,10 +2201,10 @@ html_process_part_full (rspamd_mempool_t *pool,
if (cur_tag->flags & (FL_CLOSING)) {
/* Insert exception */
- if (url != NULL && (gint) dest->len > href_offset) {
+ if (url != NULL && href_offset >= 0 && hc->parsed.size() > (std::size_t) href_offset) {
std::string_view disp_part{
- reinterpret_cast<const gchar *>(dest->data + href_offset),
- dest->len - href_offset};
+ hc->parsed.data() + href_offset,
+ hc->parsed.size() - href_offset};
html_check_displayed_url (pool,
exceptions, url_set,
disp_part,
@@ -2258,7 +2239,7 @@ html_process_part_full (rspamd_mempool_t *pool,
if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
html_process_img_tag(pool, cur_tag, hc, url_set,
- part_urls, dest);
+ part_urls);
}
else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) {
html_process_link_tag(pool, cur_tag, hc, url_set,
@@ -2269,8 +2250,8 @@ html_process_part_full (rspamd_mempool_t *pool,
if (cur_tag->flags & FL_CLOSING) {
/* Just remove block element from the queue if any */
- if (!styles_blocks.empty()) {
- styles_blocks.pop_back();
+ if (!blocks_stack.empty()) {
+ blocks_stack.pop_back();
}
}
else {
@@ -2279,7 +2260,7 @@ html_process_part_full (rspamd_mempool_t *pool,
if (bl) {
html_propagate_style(hc, cur_tag,
- bl, styles_blocks);
+ bl, blocks_stack);
/* Check visibility */
if (bl->font_size < 3 ||
@@ -2316,32 +2297,27 @@ html_process_part_full (rspamd_mempool_t *pool,
html_propagate_lengths, NULL);
}
- hc->parsed = dest;
-
- return dest;
+ return hc;
}
}
-GByteArray*
-rspamd_html_process_part_full (rspamd_mempool_t *pool,
- struct html_content *hc,
- GByteArray *in,
- GList **exceptions,
- khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls,
- bool allow_css)
+void *
+rspamd_html_process_part_full(rspamd_mempool_t *pool,
+ GByteArray *in, GList **exceptions,
+ khash_t (rspamd_url_hash) *url_set,
+ GPtrArray *part_urls,
+ bool allow_css)
{
- return rspamd::html::html_process_part_full(pool, hc, in, exceptions, url_set,
+ return rspamd::html::html_process_part_full(pool, in, exceptions, url_set,
part_urls, allow_css);
}
-GByteArray*
-rspamd_html_process_part (rspamd_mempool_t *pool,
- struct html_content *hc,
- GByteArray *in)
+void *
+rspamd_html_process_part(rspamd_mempool_t *pool,
+ GByteArray *in)
{
- return rspamd_html_process_part_full (pool, hc, in, NULL,
+ return rspamd_html_process_part_full (pool, in, NULL,
NULL, NULL, FALSE);
}
@@ -2369,7 +2345,6 @@ rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname)
gint id;
g_assert (hc != NULL);
- g_assert (hc->tags_seen != NULL);
id = rspamd_html_tag_by_name(tagname);
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 6106688f3..23faa47d3 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -102,35 +102,21 @@ struct html_block {
/* Forwarded declaration */
struct rspamd_task;
-
-struct html_content {
- struct rspamd_url *base_url;
- GNode *html_tags;
- gint flags;
- guint total_tags;
- struct html_color bgcolor;
- guchar *tags_seen;
- GPtrArray *images;
- GPtrArray *blocks;
- GByteArray *parsed;
- void *css_style;
-};
+struct html_content;
/*
* Decode HTML entitles in text. Text is modified in place.
*/
guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
-GByteArray *rspamd_html_process_part(rspamd_mempool_t *pool,
- struct html_content *hc,
- GByteArray *in);
+void* rspamd_html_process_part(rspamd_mempool_t *pool,
+ GByteArray *in);
-GByteArray *rspamd_html_process_part_full(rspamd_mempool_t *pool,
- struct html_content *hc,
- GByteArray *in, GList **exceptions,
- khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls,
- bool allow_css);
+void *rspamd_html_process_part_full(rspamd_mempool_t *pool,
+ GByteArray *in, GList **exceptions,
+ khash_t (rspamd_url_hash) *url_set,
+ GPtrArray *part_urls,
+ bool allow_css);
/*
* Returns true if a specified tag has been seen in a part
More information about the Commits
mailing list