commit 9aa268c: [Project] More fixes to calculations

Vsevolod Stakhov vsevolod at highsecure.ru
Thu Jul 1 12:35:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-01 12:53:24 +0100
URL: https://github.com/rspamd/rspamd/commit/9aa268c1c1b900b325f4b4a6060b6e378db8ea8c

[Project] More fixes to calculations

---
 src/libserver/html/html.cxx | 96 ++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 37 deletions(-)

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 202eebb87..bdd489299 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -91,14 +91,22 @@ html_check_balance(struct html_content *hc,
 	auto calculate_content_length = [tag_start_offset,tag_end_offset](html_tag *t) {
 		auto opening_content_offset = t->content_offset;
 
-		if (opening_content_offset <= tag_start_offset) {
-			t->closing.start = tag_start_offset;
-			t->closing.end = tag_end_offset;
+		if (t->flags & (CM_EMPTY)) {
+			/* Attach closing tag just at the opening tag */
+			t->closing.start = t->tag_start;
+			t->closing.end = t->content_offset - 1;
 		}
 		else {
 
-			t->closing.start = t->content_offset;
-			t->closing.end = tag_end_offset;
+			if (opening_content_offset <= tag_start_offset) {
+				t->closing.start = tag_start_offset;
+				t->closing.end = tag_end_offset;
+			}
+			else {
+
+				t->closing.start = t->content_offset;
+				t->closing.end = tag_end_offset;
+			}
 		}
 	};
 
@@ -1012,6 +1020,9 @@ html_append_tag_content(rspamd_mempool_t *pool,
 
 		return tag->content_offset;
 	}
+	else if (tag->id == Tag_HEAD || tag->id >= N_TAGS) {
+		return tag->closing.end + 1;
+	}
 
 	if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE|CM_HEAD))) {
 		is_visible = false;
@@ -1046,9 +1057,13 @@ html_append_tag_content(rspamd_mempool_t *pool,
 									 std::size_t(initial_part_len)});
 		}
 
-		cur_offset = html_append_tag_content(pool, start, len,
+		auto next_offset = html_append_tag_content(pool, start, len,
 				hc, cld, exceptions, url_set);
 
+		/* Do not allow shifting back */
+		if (next_offset > cur_offset) {
+			cur_offset = next_offset;
+		}
 	}
 
 	if (cur_offset < tag->closing.start) {
@@ -1151,7 +1166,7 @@ html_process_input(rspamd_mempool_t *pool,
 		ntag->tag_start = c - start;
 		ntag->flags = flags;
 
-		if (cur_tag) {
+		if (cur_tag && !(cur_tag->flags & (CM_EMPTY|FL_CLOSED))) {
 			parent_tag = cur_tag;
 		}
 
@@ -1303,12 +1318,12 @@ html_process_input(rspamd_mempool_t *pool,
 		case xml_tag_end:
 			if (t == '>') {
 				state = tag_end_opening;
-				continue;
+				cur_tag->content_offset = p - start + 1;
 			}
 			else {
 				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-				p ++;
 			}
+			p++;
 			break;
 
 		case compound_tag:
@@ -1320,7 +1335,7 @@ html_process_input(rspamd_mempool_t *pool,
 			}
 			else if (t == '>' && obrace == ebrace) {
 				state = tag_end_opening;
-				continue;
+				cur_tag->content_offset = p - start + 1;
 			}
 			p ++;
 			break;
@@ -1362,6 +1377,7 @@ html_process_input(rspamd_mempool_t *pool,
 				ebrace ++;
 			}
 			else if (t == '>' && ebrace >= 2) {
+				cur_tag->content_offset = p - start + 1;
 				state = tag_end_opening;
 				continue;
 			}
@@ -1421,6 +1437,7 @@ html_process_input(rspamd_mempool_t *pool,
 		case sgml_content:
 			/* TODO: parse DOCTYPE here */
 			if (t == '>') {
+				cur_tag->content_offset = p - start + 1;
 				state = html_text_content;
 				/* We don't know a lot about sgml tags, ignore them */
 			}
@@ -1472,39 +1489,39 @@ html_process_input(rspamd_mempool_t *pool,
 						}
 					}
 					hc->tags_seen[cur_tag->id] = true;
+				}
 
-					/* Shift to the first unclosed tag */
-					while (parent_tag && (parent_tag->flags & FL_CLOSED)) {
-						parent_tag = parent_tag->parent;
-					}
+				/* Shift to the first unclosed tag */
+				while (parent_tag && (parent_tag->flags & FL_CLOSED)) {
+					parent_tag = parent_tag->parent;
+				}
 
-					if (parent_tag) {
-						cur_tag->parent = parent_tag;
-						parent_tag->children.push_back(cur_tag);
+				if (parent_tag) {
+					cur_tag->parent = parent_tag;
+					parent_tag->children.push_back(cur_tag);
+				}
+				else {
+					if (hc->root_tag) {
+						cur_tag->parent = hc->root_tag;
+						hc->root_tag->children.push_back(cur_tag);
+						parent_tag = hc->root_tag;
 					}
 					else {
-						if (hc->root_tag) {
-							cur_tag->parent = hc->root_tag;
-							hc->root_tag->children.push_back(cur_tag);
-							parent_tag = hc->root_tag;
+						if (cur_tag->id == Tag_HTML) {
+							hc->root_tag = cur_tag;
 						}
 						else {
-							if (cur_tag->id == Tag_HTML) {
-								hc->root_tag = cur_tag;
-							}
-							else {
-								/* Insert a fake html tag */
-								hc->all_tags.emplace_back(std::make_unique<html_tag>());
-								auto *top_tag = hc->all_tags.back().get();
-								top_tag->tag_start = 0;
-								top_tag->flags = CM_HEAD|FL_VIRTUAL;
-								top_tag->id = Tag_HTML;
-								top_tag->content_offset = 0;
-								top_tag->children.push_back(cur_tag);
-								cur_tag->parent = top_tag;
-								hc->root_tag = top_tag;
-								parent_tag = top_tag;
-							}
+							/* Insert a fake html tag */
+							hc->all_tags.emplace_back(std::make_unique<html_tag>());
+							auto *top_tag = hc->all_tags.back().get();
+							top_tag->tag_start = 0;
+							top_tag->flags = CM_HEAD|FL_VIRTUAL;
+							top_tag->id = Tag_HTML;
+							top_tag->content_offset = 0;
+							top_tag->children.push_back(cur_tag);
+							cur_tag->parent = top_tag;
+							hc->root_tag = top_tag;
+							parent_tag = top_tag;
 						}
 					}
 				}
@@ -1563,6 +1580,11 @@ html_process_input(rspamd_mempool_t *pool,
 				if (!(cur_tag->flags & CM_EMPTY)) {
 					html_process_block_tag(pool, cur_tag, hc);
 				}
+
+				if (cur_tag->flags & FL_CLOSED) {
+					cur_tag->closing.end = cur_tag->content_offset;
+					cur_tag->closing.start = cur_tag->tag_start;
+				}
 			}
 
 			if (cur_tag && (cur_tag->id == Tag_STYLE)) {


More information about the Commits mailing list