commit 49206dc: [Fix] Finally rework parsing entities logic

Fri Jun 18 23:49:04 UTC 2021

Author: Vsevolod Stakhov
Date: 2021-06-19 00:43:39 +0100
URL: https://github.com/rspamd/rspamd/commit/49206dce81192fa923f721207630c3fe926c37e2 (HEAD -> master)

[Fix] Finally rework parsing entities logic

---
 src/libserver/html/html_entities.cxx | 352 +++++++++++++++++++++++------------
 1 file changed, 229 insertions(+), 123 deletions(-)

diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 554730fc8..056dba60e 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -22,6 +22,7 @@
 #include <vector>
 #include <contrib/robin-hood/robin_hood.h>
 #include <unicode/utf8.h>
+#include <unicode/uchar.h>
 #include "libutil/cxx/util.hxx"
 
 #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
@@ -1732,7 +1733,7 @@ static const auto html_entities_array = rspamd::array_of<html_entity_def>(
 		ENTITY_DEF("die", 168, "\xc2\xa8"),
 		ENTITY_DEF("ngt", 8815, "\xe2\x89\xaf"),
 		ENTITY_DEF("vcy", 1074, "\xd0\xb2"),
-		ENTITY_DEF("fjlig", 0, "\x66\x6a"),
+		ENTITY_DEF("fjlig", (unsigned)-1, "\x66\x6a"),
 		ENTITY_DEF("submult", 10945, "\xe2\xab\x81"),
 		ENTITY_DEF("ubrcy", 1118, "\xd1\x9e"),
 		ENTITY_DEF("ovbar", 9021, "\xe2\x8c\xbd"),
@@ -2203,15 +2204,13 @@ static const html_entities_storage html_entities_defs;
 std::size_t
 decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 {
-	long l, rep_len;
 	/*
 	 * t - tortoise (destination ptr)
 	 * h - hare (source ptr)
 	 * e - begin of entity
 	 */
-	char *t = s, *h = s, *e = s, *end_ptr, old_c;
+	char *t = s, *h = s, *e = s;
 	const gchar *end;
-	const gchar *entity;
 	bool seen_hash = false, seen_hex = false;
 	enum {
 		do_undefined,
@@ -2223,19 +2222,214 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 		ampersand,
 		skip_multi_spaces,
 	} state = parser_state::normal_content;
-	int base;
-	UChar32 uc;
 
-	if (len == 0) {
-		return 0;
-	}
-	else {
-		l = len;
-	}
+	end = s + len;
+
+	auto replace_named_entity = [&](const char *entity, std::size_t len) -> bool { /* Decode the name in [entity, h) into t; true if a definition was used. NOTE(review): `len` is never read and shadows the outer `len` */
+		const auto *entity_def = html_entities_defs.by_name({entity,
+															 (std::size_t) (h - entity)});
+
+		auto replace_entity = [&]() -> void {
+			auto l = entity_def->replacement.size();
+			memcpy(t, entity_def->replacement.data(), l); /* NOTE(review): unlike the numeric path, no check against `end` before writing */
+			t += l;
+		};
+
+		if (entity_def) {
+			replace_entity();
+			return true;
+		}
+		else {
+			/* No exact match: fall back to a known name formed by the first 4, then 3, then 2 characters */
+			/* Try the first 4 characters of the name */
+			if (h - e > 4) {
+				entity_def = html_entities_defs.by_name({entity, 4});
+
+				if (entity_def) {
+					replace_entity();
+					/* Move h back to e + 4, i.e. the last character of the matched 4-character name (entity starts at e + 1) */
+					h = e + 4;
+				}
+			}
+			/* Otherwise try the first 3 characters of the name */
+			if (!entity_def && h - e > 3) {
+				entity_def = html_entities_defs.by_name({entity, 3});
+
+				if (entity_def) {
+					replace_entity();
+					h = e + 3;
+				}
+			}
+			/* Otherwise try the first 2 characters of the name */
+			if (!entity_def && h - e > 2) {
+				entity_def = html_entities_defs.by_name({entity, 2});
+
+				if (entity_def) {
+					replace_entity();
+					h = e + 2;
+				}
+			}
+			/* Nothing matched: copy the raw text from e up to and including *h to the output, if it fits */
+			if (!entity_def && (end - t > h - e + 1)) {
+				memmove(t, e, h - e + 1);
+				t += h - e + 1;
+			}
+			else if (entity_def) {
+				return true;
+			}
+		}
+
+		return false;
+	};
+
+	/* Parse exactly len decimal digits by hand, as strtoul needs a NUL-terminated string; nullopt if a non-digit is met */
+	auto dec_to_int = [](const char *str, std::size_t len) -> std::optional<int> {
+		int n = 0;
+
+		/* Accumulate as a negative number (original note: avoids INT_MIN overflow). NOTE(review): *str is read before len is tested, and long digit runs are not range-checked */
+		while (g_ascii_isdigit(*str) && len > 0) {
+			n = 10 * n - (*str++ - '0');
+			len --;
+		}
+
+		if (len == 0) {
+			return -(n);
+		}
+		else {
+			return std::nullopt;
+		}
+	};
+	auto hex_to_int = [](const char *str, std::size_t len) -> std::optional<int> { /* Parse exactly len hexadecimal digits; nullopt if any other character is met */
+		int n = 0;
+
+		/* Accumulate as a negative number (original note: avoids INT_MIN overflow). NOTE(review): *str is read before len is tested, and long digit runs are not range-checked */
+		while (g_ascii_isxdigit(*str) && len > 0) {
+			if (*str <= 0x39) { /* 0x39 is '9', so this is the decimal-digit case */
+				n = 16 * n - (*str++ - '0');
+			}
+			else {
+				n = 16 * n - (((*str++) | ' ') - 'a' + 10); /* OR with ' ' (0x20) folds 'A'..'F' to 'a'..'f' */
+			}
+			len --;
+		}
+
+		if (len == 0) {
+			return -(n);
+		}
+		else {
+			return std::nullopt;
+		}
+	};
+	auto oct_to_int = [](const char *str, std::size_t len) -> std::optional<int> { /* Parse exactly len octal digits; nullopt if any other character is met */
+		int n = 0;
+
+		/* Accumulate as a negative number (original note: avoids INT_MIN overflow). NOTE(review): *str is read before len is tested, and long digit runs are not range-checked */
+		while (g_ascii_isdigit(*str) && len > 0) {
+			if (*str > '7') { /* '8' and '9' are not octal: stop with len still non-zero so nullopt is returned */
+				break;
+			}
+			else {
+				n = 8 * n - (*str++ - '0');
+			}
+			len --;
+		}
+
+		if (len == 0) {
+			return -(n);
+		}
+		else {
+			return std::nullopt;
+		}
+	};
+
+	auto replace_numeric_entity = [&](const char *entity) -> bool { /* Decode the number in [entity, h) into t; false if it cannot be parsed or U8_APPEND fails, true otherwise */
+		UChar32 uc;
+		std::optional<int> maybe_num;
+
+		if (*entity == 'x' || *entity == 'X') { /* 'x'/'X' prefix: hexadecimal digits follow */
+			maybe_num = hex_to_int(entity + 1, h - (entity + 1));
+		}
+		else if (*entity == 'o' || *entity == 'O') { /* 'o'/'O' prefix: octal digits follow */
+			maybe_num = oct_to_int(entity + 1, h - (entity + 1));
+		}
+		else {
+			maybe_num = dec_to_int(entity, h - entity);
+		}
+
+		if (!maybe_num) {
+			/* Not a valid number: copy the raw text [e, h) to the output unchanged, if it fits */
+			if (end - t >= h - e) {
+				memmove(t, e, h - e);
+				t += h - e;
+			}
+
+			return false;
+		}
+		else {
+			uc = maybe_num.value();
+			/* First look the number up by id in html_entities_defs and use its stored replacement */
+			const auto *entity_def = html_entities_defs.by_id(uc);
+
+			if (entity_def) {
+				auto rep_len = entity_def->replacement.size();
+
+				if (end - t >= rep_len) {
+					memcpy(t, entity_def->replacement.data(),
+							rep_len);
+					t += rep_len;
+				}
+
+				return true;
+			}
+			else {
+				/* No table entry: append the code point itself as UTF-8 at offset t - s */
+				goffset off = t - s;
+				UBool is_error = 0;
+
+				if (uc > 0 && u_isprint(uc)) {
+					U8_APPEND (s, off, len, uc, is_error);
+
+					if (!is_error) {
+						t = s + off;
+					}
+					else {
+						/* U8_APPEND reported an error: copy the raw text from e through *h instead, if it fits */
+						if (end - t > h - e + 1) {
+							memmove(t, e, h - e + 1);
+							t += h - e + 1;
+						}
+
+						return false;
+					}
+				}
+				else if (end - t > 3) {
+					/* Non-positive or non-printable code point: emit U+FFFD (EF BF BD). NOTE(review): the check demands 4 free bytes but only 3 are written */
+					*t++ = '\357';
+					*t++ = '\277';
+					*t++ = '\275';
+				}
+			}
+
+			return true;
+		}
+
+		return false; /* not reached: both branches above return */
+	};
+
+	auto replace_entity = [&]() -> bool {
+		const auto *entity_start = e + 1;
 
-	end = s + l;
+		if (*entity_start != '#') {
+			return replace_named_entity(entity_start, (h - entity_start));
+		}
+		else if (entity_start + 1 < h) {
+			return replace_numeric_entity(entity_start + 1);
+		}
 
-	while (h - s < l && t <= h) {
+		return false;
+	};
+
+	while (h - s < len && t <= h) {
 		switch (state) {
 		case parser_state::normal_content:
 			if (*h == '&') {
@@ -2259,106 +2453,8 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 			}
 			break;
 		case parser_state::ampersand:
-			if (*h == ';' && h > e) {
-decode_entity:
-				old_c = *h;
-				*h = '\0';
-				entity = e + 1;
-				uc = 0;
-
-				if (*entity != '#') {
-					const auto *entity_def = html_entities_defs.by_name({entity,
-																		 (std::size_t) (h - entity)});
-					*h = old_c;
-
-					if (entity_def) {
-						rep_len = entity_def->replacement.size();
-
-						if (end - t >= rep_len) {
-							memcpy(t, entity_def->replacement.data(),
-									rep_len);
-							t += rep_len;
-						}
-					}
-					else {
-						if (end - t > h - e + 1) {
-							memmove(t, e, h - e + 1);
-							t += h - e + 1;
-						}
-					}
-				}
-				else if (e + 2 < h) {
-					if (*(e + 2) == 'x' || *(e + 2) == 'X') {
-						base = 16;
-					}
-					else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
-						base = 8;
-					}
-					else {
-						base = 10;
-					}
-
-					if (base == 10) {
-						uc = strtoul((e + 2), &end_ptr, base);
-					}
-					else {
-						uc = strtoul((e + 3), &end_ptr, base);
-					}
-
-					if (end_ptr != nullptr && *end_ptr != '\0') {
-						/* Skip undecoded */
-						*h = old_c;
-
-						if (end - t > h - e + 1) {
-							memmove(t, e, h - e + 1);
-							t += h - e + 1;
-						}
-					}
-					else {
-						/* Search for a replacement */
-						*h = old_c;
-						const auto *entity_def = html_entities_defs.by_id(uc);
-
-						if (entity_def) {
-							rep_len = entity_def->replacement.size();
-
-							if (end - t >= rep_len) {
-								memcpy(t, entity_def->replacement.data(),
-										rep_len);
-								t += rep_len;
-							}
-						}
-						else {
-							/* Unicode point */
-							goffset off = t - s;
-							UBool is_error = 0;
-
-							if (uc > 0) {
-								U8_APPEND (s, off, len, uc, is_error);
-								if (!is_error) {
-									t = s + off;
-								}
-								else {
-									/* Leave invalid entities as is */
-									if (end - t > h - e + 1) {
-										memmove(t, e, h - e + 1);
-										t += h - e + 1;
-									}
-								}
-							}
-							else if (end - t > h - e + 1) {
-								memmove(t, e, h - e + 1);
-								t += h - e + 1;
-							}
-						}
-
-						if (end - t > 0 && old_c != ';') {
-							/* Fuck email clients, fuck them */
-							*t++ = old_c;
-						}
-					}
-				}
-
+			if ((*h == ';' || g_ascii_isspace(*h)) && h > e) {
+				replace_entity();
 				state = parser_state::normal_content;
 			}
 			else if (*h == '&') {
@@ -2389,7 +2485,9 @@ decode_entity:
 				if (seen_digit_only == do_digits_only && seen_hash && h > e) {
 					/* We have seen some digits, so we can try to decode, eh */
 					/* Fuck retarded email clients... */
-					goto decode_entity;
+					replace_entity();
+					state = parser_state::normal_content;
+					continue;
 				}
 
 				seen_digit_only = do_mixed;
@@ -2412,9 +2510,15 @@ decode_entity:
 	/* Leftover */
 	if (state == parser_state::ampersand && h > e) {
 		/* Unfinished entity, copy as is */
-		if (end - t >= h - e) {
-			memmove(t, e, h - e);
-			t += h - e;
+		if (replace_entity()) {
+			/* To follow FSM semantics */
+			h ++;
+		}
+
+		/* Leftover after replacement */
+		if (h < end && t + (end - h) <= end) {
+			memmove(t, h, end - h);
+			t += end - h;
 		}
 	}
 
@@ -2451,11 +2555,13 @@ TEST_SUITE("html") {
 		};
 
 		for (const auto &c : cases) {
-			auto *cpy = new char[c.first.size()];
-			memcpy(cpy, c.first.data(), c.first.size());
-			auto nlen = decode_html_entitles_inplace(cpy, c.first.size(), true);
-			CHECK(std::string{cpy,nlen} == c.second);
-			delete[] cpy;
+			SUBCASE(c.first.c_str()) {
+				auto *cpy = new char[c.first.size()];
+				memcpy(cpy, c.first.data(), c.first.size());
+				auto nlen = decode_html_entitles_inplace(cpy, c.first.size(), true);
+				CHECK(std::string{cpy, nlen} == c.second);
+				delete[] cpy;
+			}
 		}
 	}
 }