commit 71d6269: [Rework] Html: Final rework part for the html processing code

Vsevolod Stakhov vsevolod at highsecure.ru
Thu May 27 14:07:11 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-26 17:23:17 +0100
URL: https://github.com/rspamd/rspamd/commit/71d6269deafc1b9286c821db650503720e5c9f9c

[Rework] Html: Final rework part for the html processing code

---
 src/libserver/html/html.cxx | 158 ++++++++++++++++++++++++++------------------
 1 file changed, 95 insertions(+), 63 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 94f02111d..b68f40360 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -103,7 +103,7 @@ rspamd_html_check_balance(GNode *node, GNode **cur_level)
 }
 
 static gboolean
-rspamd_html_process_tag(rspamd_mempool_t *pool,
+html_process_tag(rspamd_mempool_t *pool,
 						struct html_content *hc,
 						struct html_tag *tag,
 						GNode **cur_level,
@@ -269,7 +269,7 @@ parse_tag_content(rspamd_mempool_t *pool,
 				  struct html_content *hc,
 				  struct html_tag *tag,
 				  const char *in,
-				  struct tag_content_parser_state parser_env)
+				  struct tag_content_parser_state &parser_env)
 {
 	enum tag_parser_state {
 		parse_start = 0,
@@ -1554,14 +1554,9 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 	tag->extra = bl;
 }
 
-
-}
-
-/* Unconverted C part */
-
-
-static gboolean
-rspamd_html_propagate_lengths(GNode *node, gpointer _unused) {
+static auto
+html_propagate_lengths(GNode *node, gpointer _unused) -> gboolean
+{
 	GNode *child;
 	struct html_tag *tag = static_cast<html_tag *>(node->data), *cld_tag;
 
@@ -1579,15 +1574,15 @@ rspamd_html_propagate_lengths(GNode *node, gpointer _unused) {
 	return FALSE;
 }
 
-static void
-rspamd_html_propagate_style(struct html_content *hc,
+static auto
+html_propagate_style(struct html_content *hc,
 							struct html_tag *tag,
 							struct html_block *bl,
-							GQueue *blocks) {
+							GQueue *blocks) -> void
+{
 	struct html_block *bl_parent;
 	gboolean push_block = FALSE;
 
-
 	/* Propagate from the parent if needed */
 	bl_parent = static_cast<html_block *>(g_queue_peek_tail(blocks));
 
@@ -1656,27 +1651,38 @@ rspamd_html_propagate_style(struct html_content *hc,
 	}
 }
 
+using tags_vector = std::vector<std::unique_ptr<struct html_tag>>;
 
-GByteArray*
-rspamd_html_process_part_full (rspamd_mempool_t *pool,
-							   struct html_content *hc,
-							   GByteArray *in,
-							   GList **exceptions,
-							   khash_t (rspamd_url_hash) *url_set,
-							   GPtrArray *part_urls,
-							   bool allow_css)
+static auto
+tags_vector_ptr_dtor(void *ptr)
 {
-	const guchar *p, *c, *end, *savep = NULL;
+	auto *ptags = (tags_vector *)ptr;
+
+	delete ptags;
+}
+
+static auto
+html_process_part_full (rspamd_mempool_t *pool,
+						struct html_content *hc,
+						GByteArray *in,
+						GList **exceptions,
+						khash_t (rspamd_url_hash) *url_set,
+						GPtrArray *part_urls,
+						bool allow_css) -> GByteArray*
+{
+	const guchar *p, *c, *end;
 	guchar t;
 	gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
 			balanced;
 	GByteArray *dest;
 	guint obrace = 0, ebrace = 0;
 	GNode *cur_level = NULL;
-	gint substate = 0, len, href_offset = -1;
-	struct html_tag *cur_tag = NULL, *content_tag = NULL;
 	struct rspamd_url *url = NULL;
+	gint len, href_offset = -1;
+	struct html_tag *cur_tag = NULL, *content_tag = NULL;
 	GQueue *styles_blocks;
+	struct tag_content_parser_state content_parser_env;
+	tags_vector *all_tags;
 
 	enum {
 		parse_start = 0,
@@ -1700,6 +1706,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 	g_assert (hc != NULL);
 	g_assert (pool != NULL);
 
+	all_tags = new tags_vector(128);
+	rspamd_mempool_add_destructor(pool, tags_vector_ptr_dtor, all_tags);
+
 	hc->tags_seen = (guchar *)rspamd_mempool_alloc0 (pool, NBYTES (N_TAGS));
 
 	/* Set white background color by default */
@@ -1757,12 +1766,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 				continue;
 			default:
 				state = tag_content;
-				substate = 0;
-				savep = NULL;
-				cur_tag = rspamd_mempool_alloc0_type (pool, struct html_tag);
-				cur_tag->params = g_queue_new ();
-				rspamd_mempool_add_destructor (pool,
-						(rspamd_mempool_destruct_t)g_queue_free, cur_tag->params);
+				content_parser_env.reset();
+
+				all_tags->emplace_back(std::make_unique<html_tag>());
+				cur_tag = all_tags->back().get();
 				break;
 			}
 
@@ -1903,9 +1910,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 
 							g_byte_array_append (dest, c, (p - c));
 
-							len = rspamd_html_decode_entitles_inplace (
+							len = decode_html_entitles_inplace(
 									reinterpret_cast<gchar *>(dest->data + old_offset),
-									p - c);
+									(std::size_t)(p - c));
 							dest->len = dest->len + len - (p - c);
 
 							if (content_tag) {
@@ -1934,8 +1941,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 					if (save_space) {
 						/* Append one space if needed */
 						if (dest->len > 0 &&
-								!g_ascii_isspace (dest->data[dest->len - 1])) {
-							g_byte_array_append (dest, reinterpret_cast<const guint8 *>(" "), 1);
+							!g_ascii_isspace (dest->data[dest->len - 1])) {
+							g_byte_array_append (dest,
+									reinterpret_cast<const guint8 *>(" "), 1);
 							if (content_tag) {
 								if (content_tag->content_length == 0) {
 									/*
@@ -1968,7 +1976,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 						}
 
 						g_byte_array_append (dest, c, (p - c));
-						len = rspamd_html_decode_entitles_inplace (
+						len = decode_html_entitles_inplace (
 								reinterpret_cast<gchar *>(dest->data + old_offset),
 								p - c);
 						dest->len = dest->len + len - (p - c);
@@ -2017,7 +2025,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 
 				if (allow_css) {
 					GError *err = NULL;
-					hc->css_style = rspamd_css_parse_style (pool, p, end_style, hc->css_style,
+					hc->css_style = rspamd_css_parse_style(pool, p, end_style, hc->css_style,
 							&err);
 
 					if (err) {
@@ -2054,8 +2062,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 			break;
 
 		case tag_content:
-			rspamd_html_parse_tag_content (pool, hc, cur_tag,
-					p, &substate, &savep);
+			parse_tag_content(pool, hc, cur_tag,
+					reinterpret_cast<const char *>(p), content_parser_env);
 			if (t == '>') {
 				if (closing) {
 					cur_tag->flags |= FL_CLOSING;
@@ -2075,13 +2083,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 			break;
 
 		case tag_end:
-			substate = 0;
-			savep = NULL;
+			content_parser_env.reset();
 
 			if (cur_tag != NULL) {
 				balanced = TRUE;
 
-				if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level,
+				if (html_process_tag (pool, hc, cur_tag, &cur_level,
 						&balanced)) {
 					state = content_write;
 					need_decode = FALSE;
@@ -2112,7 +2119,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 				/* Handle newlines */
 				if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
 					if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
-						g_byte_array_append (dest, reinterpret_cast<const guint8 *>("\r\n"), 2);
+						g_byte_array_append (dest,
+								reinterpret_cast<const guint8 *>("\r\n"), 2);
 
 						if (content_tag) {
 							if (content_tag->content_length == 0) {
@@ -2133,8 +2141,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 				}
 
 				if ((cur_tag->id == Tag_P ||
-						cur_tag->id == Tag_TR ||
-						cur_tag->id == Tag_DIV)) {
+					 cur_tag->id == Tag_TR ||
+					 cur_tag->id == Tag_DIV)) {
 					if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
 						g_byte_array_append (dest, reinterpret_cast<const guint8 *>("\r\n"), 2);
 
@@ -2159,15 +2167,16 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 				/* XXX: uncomment when styles parsing is not so broken */
 				if (cur_tag->flags & FL_HREF /* && !(cur_tag->flags & FL_IGNORE) */) {
 					if (!(cur_tag->flags & (FL_CLOSING))) {
-						url = rspamd_html_process_url_tag (pool, cur_tag, hc);
+						auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
 
-						if (url != NULL) {
+						if (maybe_url) {
+							url = maybe_url.value();
 
 							if (url_set != NULL) {
 								struct rspamd_url *maybe_existing =
-										rspamd_url_set_add_or_return (url_set, url);
-								if (maybe_existing == url) {
-									rspamd_process_html_url (pool, url, url_set,
+										rspamd_url_set_add_or_return (url_set, maybe_url.value());
+								if (maybe_existing == maybe_url.value()) {
+									process_html_query_url(pool, url, url_set,
 											part_urls);
 								}
 								else {
@@ -2189,13 +2198,17 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 							prev_tag = static_cast<html_tag *>(cur_level->prev->data);
 
 							if (prev_tag->id == Tag_A &&
-									!(prev_tag->flags & (FL_CLOSING)) &&
-									prev_tag->extra) {
+								!(prev_tag->flags & (FL_CLOSING)) &&
+								prev_tag->extra) {
 								prev_url = static_cast<rspamd_url *>(prev_tag->extra);
 
-								rspamd_html_check_displayed_url (pool,
+								std::string_view disp_part{
+										reinterpret_cast<const gchar *>(dest->data + href_offset),
+										dest->len - href_offset};
+								html_check_displayed_url (pool,
 										exceptions, url_set,
-										dest, href_offset,
+										disp_part,
+										href_offset,
 										prev_url);
 							}
 						}
@@ -2204,9 +2217,13 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 
 							/* Insert exception */
 							if (url != NULL && (gint) dest->len > href_offset) {
-								rspamd_html_check_displayed_url (pool,
+								std::string_view disp_part{
+									reinterpret_cast<const gchar *>(dest->data + href_offset),
+									dest->len - href_offset};
+								html_check_displayed_url (pool,
 										exceptions, url_set,
-										dest, href_offset,
+										disp_part,
+										href_offset,
 										url);
 
 							}
@@ -2221,9 +2238,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 					 * Base is allowed only within head tag but HTML is retarded
 					 */
 					if (hc->base_url == NULL) {
-						url = rspamd_html_process_url_tag (pool, cur_tag, hc);
+						auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
 
-						if (url != NULL) {
+						if (maybe_url) {
 							msg_debug_html ("got valid base tag");
 							hc->base_url = url;
 							cur_tag->extra = url;
@@ -2236,11 +2253,11 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 				}
 
 				if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
-					rspamd_html_process_img_tag (pool, cur_tag, hc, url_set,
+					html_process_img_tag(pool, cur_tag, hc, url_set,
 							part_urls, dest);
 				}
 				else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) {
-					rspamd_html_process_link_tag (pool, cur_tag, hc, url_set,
+					html_process_link_tag(pool, cur_tag, hc, url_set,
 							part_urls);
 				}
 				else if (cur_tag->flags & FL_BLOCK) {
@@ -2253,11 +2270,11 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 						}
 					}
 					else {
-						rspamd_html_process_block_tag (pool, cur_tag, hc);
+						html_process_block_tag(pool, cur_tag, hc);
 						bl = static_cast<html_block *>(cur_tag->extra);
 
 						if (bl) {
-							rspamd_html_propagate_style (hc, cur_tag,
+							html_propagate_style(hc, cur_tag,
 									bl, styles_blocks);
 
 							/* Check visibility */
@@ -2292,7 +2309,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 
 	if (hc->html_tags) {
 		g_node_traverse (hc->html_tags, G_POST_ORDER, G_TRAVERSE_ALL, -1,
-				rspamd_html_propagate_lengths, NULL);
+				html_propagate_lengths, NULL);
 	}
 
 	g_queue_free (styles_blocks);
@@ -2301,6 +2318,21 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
 	return dest;
 }
 
+}
+
+GByteArray*
+rspamd_html_process_part_full (rspamd_mempool_t *pool,
+							   struct html_content *hc,
+							   GByteArray *in,
+							   GList **exceptions,
+							   khash_t (rspamd_url_hash) *url_set,
+							   GPtrArray *part_urls,
+							   bool allow_css)
+{
+	return rspamd::html::html_process_part_full(pool, hc, in, exceptions, url_set,
+			part_urls, allow_css);
+}
+
 GByteArray*
 rspamd_html_process_part (rspamd_mempool_t *pool,
 		struct html_content *hc,


More information about the Commits mailing list