commit 4395db5: [Minor] Use separate htb for heuristical elements in entities

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Jun 19 16:42:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-06-19 17:34:47 +0100
URL: https://github.com/rspamd/rspamd/commit/4395db507d29539c1b828813c0a7e3e2d09cdaa3 (HEAD -> master)

[Minor] Use separate htb for heuristical elements in entities

---
 src/libserver/html/html_entities.cxx | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 50bf34f18..1d72574b3 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2169,6 +2169,7 @@ static const auto html_entities_array = rspamd::array_of<html_entity_def>(
 
 class html_entities_storage {
 	robin_hood::unordered_flat_map<std::string_view, html_entity_def> entity_by_name;
+	robin_hood::unordered_flat_map<std::string_view, html_entity_def> entity_by_name_heur;
 	robin_hood::unordered_flat_map<unsigned, html_entity_def> entity_by_id;
 public:
 	html_entities_storage() {
@@ -2178,13 +2179,25 @@ public:
 		for (const auto &e : html_entities_array) {
 			entity_by_name[e.name] = e;
 			entity_by_id[e.code] = e;
+
+			if (e.allow_heuristic) {
+				entity_by_name_heur[e.name] = e;
+			}
 		}
 	}
 
-	auto by_name(std::string_view name) const -> const html_entity_def * {
-		auto it = entity_by_name.find(name);
+	auto by_name(std::string_view name, bool use_heuristic = false) const -> const html_entity_def * {
+		const decltype(entity_by_name)* htb;
+
+		if (use_heuristic) {
+			htb = &entity_by_name_heur;
+		}
+		else {
+			htb = &entity_by_name;
+		}
+		auto it = htb->find(name);
 
-		if (it != entity_by_name.end()) {
+		if (it != htb->end()) {
 			return &(it->second);
 		}
 
@@ -2229,7 +2242,8 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 
 	auto replace_named_entity = [&](const char *entity, std::size_t len) -> bool {
 		const auto *entity_def = html_entities_defs.by_name({entity,
-															 (std::size_t) (h - entity)});
+															 (std::size_t) (h - entity)},
+															 		false);
 
 		auto replace_entity = [&]() -> void {
 			auto l = entity_def->replacement.size();
@@ -2245,9 +2259,9 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
 			/* Try heuristic */
 			auto heuristic_lookup_func = [&](std::size_t lookup_len) -> bool {
 				if (!entity_def && h - e > lookup_len) {
-					entity_def = html_entities_defs.by_name({entity, lookup_len});
+					entity_def = html_entities_defs.by_name({entity, lookup_len}, true);
 
-					if (entity_def && entity_def->allow_heuristic) {
+					if (entity_def) {
 						replace_entity();
 						/* Adjust h back */
 						h = e + lookup_len;


More information about the Commits mailing list