commit 59e642a: [Fix] Html: Add entities collisions prevention logic (e.g. for mathml entities)

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Oct 13 10:49:14 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-10-13 11:44:02 +0100
URL: https://github.com/rspamd/rspamd/commit/59e642abaab9ffe318e35ad6a702531e576dfabf (HEAD -> master)

[Fix] Html: Add entities collisions prevention logic (e.g. for mathml entities)

---
 src/libserver/html.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 518de98fa..a33e4e7fc 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -208,9 +208,25 @@ rspamd_html_library_init (void)
 
 		for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
 			k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
+
+			if (rc == 0) {
+				/* Collision by id */
+				msg_err ("collision in html tag id: %d (%s) vs %d (%s)",
+						(int)tag_defs[i].id, tag_defs[i].name,
+						(int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
+			}
+
 			kh_val (html_tag_by_id, k) = tag_defs[i];
 
 			k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
+
+			if (rc == 0) {
+				/* Collision by name */
+				msg_err ("collision in html tag name: %d (%s) vs %d (%s)",
+						(int)tag_defs[i].id, tag_defs[i].name,
+						(int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
+			}
+
 			kh_val (html_tag_by_name, k) = tag_defs[i];
 		}
 
@@ -229,11 +245,52 @@ rspamd_html_library_init (void)
 			if (entities_defs[i].code != 0) {
 				k = kh_put (entity_by_number, html_entity_by_number,
 						entities_defs[i].code, &rc);
-				kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
+
+				if (rc == 0) {
+					/* Collision by id */
+					gint cmp_res = strcmp (entities_defs[i].replacement,
+							kh_val (html_entity_by_number, k));
+					if (cmp_res != 0) {
+						if (strlen (entities_defs[i].replacement) <
+							strlen (kh_val (html_entity_by_number, k))) {
+							/* Shorter replacement is more likely to be valid */
+							msg_debug ("1 collision in html entity id: %d (%s); replace %s by %s",
+									(int) entities_defs[i].code, entities_defs[i].name,
+									kh_val (html_entity_by_number, k),
+									entities_defs[i].replacement);
+							kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
+						}
+						else if (strlen (entities_defs[i].replacement) ==
+								 strlen (kh_val (html_entity_by_number, k)) &&
+										 cmp_res < 0) {
+							/* Identical length but lexicographically smaller */
+							msg_debug ("collision in html entity id: %d (%s); replace %s by %s",
+									(int) entities_defs[i].code, entities_defs[i].name,
+									kh_val (html_entity_by_number, k),
+									entities_defs[i].replacement);
+							kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
+						}
+						/* Do not replace otherwise */
+					}
+					/* Identical replacement */
+				}
+				else {
+					kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
+				}
 			}
 
 			k = kh_put (entity_by_name, html_entity_by_name,
 					entities_defs[i].name, &rc);
+
+			if (rc == 0) {
+				/* Collision by name */
+				if (strcmp (kh_val (html_entity_by_number, k),
+						entities_defs[i].replacement) != 0) {
+					msg_err ("collision in html entity name: %d (%s)",
+							(int) entities_defs[i].code, entities_defs[i].name);
+				}
+			}
+
 			kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
 		}
 


More information about the Commits mailing list