commit 4c87703: [Rework] Move entities/tags handling

Vsevolod Stakhov vsevolod at highsecure.ru
Fri May 21 15:35:05 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-21 09:18:07 +0100
URL: https://github.com/rspamd/rspamd/commit/4c87703334b12bcb0981547591463be0bd58b1ae

[Rework] Move entities/tags handling

---
 src/libserver/CMakeLists.txt           |    2 +-
 src/libserver/html/{html.c => html.cc} | 1036 ++++++---------
 src/libserver/html/html_entities.h     | 2164 -------------------------------
 src/libserver/html/html_entities.hxx   | 2196 ++++++++++++++++++++++++++++++++
 src/libserver/logger.h                 |    1 +
 src/libutil/cxx/util.hxx               |   11 +
 6 files changed, 2584 insertions(+), 2826 deletions(-)

diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt
index b17d55e4f..e8267292c 100644
--- a/src/libserver/CMakeLists.txt
+++ b/src/libserver/CMakeLists.txt
@@ -34,7 +34,7 @@ SET(LIBRSPAMDSERVERSRC
 				${CMAKE_CURRENT_SOURCE_DIR}/http/http_context.c
 				${CMAKE_CURRENT_SOURCE_DIR}/maps/map.c
 				${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
-				${CMAKE_CURRENT_SOURCE_DIR}/html/html.c
+				${CMAKE_CURRENT_SOURCE_DIR}/html/html.cc
 				${LIBCSSSRC})
 
 # Librspamd-server
diff --git a/src/libserver/html/html.c b/src/libserver/html/html.cc
similarity index 67%
rename from src/libserver/html/html.c
rename to src/libserver/html/html.cc
index cfdd0acef..e650cc3e4 100644
--- a/src/libserver/html/html.c
+++ b/src/libserver/html/html.cc
@@ -20,29 +20,27 @@
 #include "html.h"
 #include "html_tags.h"
 #include "html_colors.h"
-#include "html_entities.h"
+
 #include "url.h"
 #include "contrib/libucl/khash.h"
 #include "libmime/images.h"
 #include "css/css.h"
 #include "libutil/cxx/utf8_util.h"
 
+#include "html_tag_defs.hxx"
+#include "html_entities.hxx"
+
+#include <vector>
+
 #include <unicode/uversion.h>
 #include <unicode/ucnv.h>
 #if U_ICU_VERSION_MAJOR_NUM >= 46
 #include <unicode/uidna.h>
 #endif
 
-static sig_atomic_t tags_sorted = 0;
-static sig_atomic_t entities_sorted = 0;
-static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
+namespace rspamd::html {
 
-struct html_tag_def {
-	const gchar *name;
-	gint16 id;
-	guint16 len;
-	guint flags;
-};
+static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
 
 #define msg_debug_html(...)  rspamd_conditional_debug_fast (NULL, NULL, \
         rspamd_html_log_id, "html", pool->tag.uid, \
@@ -51,282 +49,16 @@ struct html_tag_def {
 
 INIT_LOG_MODULE(html)
 
-#define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
-
-static struct html_tag_def tag_defs[] = {
-	/* W3C defined elements */
-	TAG_DEF(Tag_A, "a", FL_HREF),
-	TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
-	TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
-	TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
-	TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
-	TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
-	TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
-	TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
-	TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
-	TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
-	TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
-	TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
-	TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
-	TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
-	TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
-	TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
-	TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
-	TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
-	TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
-	TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
-	TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
-	TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
-	TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
-	TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
-	TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
-	TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
-	TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
-	TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
-	TAG_DEF(Tag_EM, "em", (CM_INLINE)),
-	TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
-	TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
-	TAG_DEF(Tag_FORM, "form", (CM_BLOCK|FL_HREF)),
-	TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
-	TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
-	TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
-	TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
-	TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
-	TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
-	TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
-	TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
-	TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
-	TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
-	TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
-	TAG_DEF(Tag_I, "i", (CM_INLINE)),
-	TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
-	TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
-	TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
-	TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
-	TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
-	TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
-	TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
-	TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
-	TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
-	TAG_DEF(Tag_LINK, "link", (CM_EMPTY|FL_HREF)),
-	TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
-	TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
-	TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
-	TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
-	TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
-	TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
-	TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
-	TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
-	TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
-	TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
-	TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
-	TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
-	TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
-	TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
-	TAG_DEF(Tag_Q, "q", (CM_INLINE)),
-	TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
-	TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
-	TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
-	TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
-	TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
-	TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
-	TAG_DEF(Tag_S, "s", (CM_INLINE)),
-	TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
-	TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
-	TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
-	TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
-	TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
-	TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
-	TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
-	TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
-	TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
-	TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
-	TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
-	TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
-	TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
-	TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
-	TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
-	TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
-	TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
-	TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
-	TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
-	TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
-	TAG_DEF(Tag_U, "u", (CM_INLINE)),
-	TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
-	TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
-	TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
-	TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
-
-	/* proprietary elements */
-	TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
-	TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
-	TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
-	TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
-	TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
-	TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
-	TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
-	TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
-	TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
-	TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
-	TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
-	TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
-	TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
-	TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
-	TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
-	TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
-	TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
-	TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
-};
-
-KHASH_MAP_INIT_INT (entity_by_number, const char *);
-KHASH_MAP_INIT_STR (entity_by_name, const char *);
-KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
-KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
-KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
-		rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
-
-khash_t(entity_by_number) *html_entity_by_number;
-khash_t(entity_by_name) *html_entity_by_name;
-khash_t(tag_by_name) *html_tag_by_name;
-khash_t(tag_by_id) *html_tag_by_id;
-khash_t(color_by_name) *html_color_by_name;
-
-static struct rspamd_url *rspamd_html_process_url (rspamd_mempool_t *pool,
-												   const gchar *start, guint len,
-												   struct html_tag_component *comp);
-
-static void
-rspamd_html_library_init (void)
-{
-	guint i;
-	khiter_t k;
-	gint rc;
-
-	if (!tags_sorted) {
-		html_tag_by_id = kh_init (tag_by_id);
-		html_tag_by_name = kh_init (tag_by_name);
-		kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
-		kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
-
-		for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
-			k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
-
-			if (rc == 0) {
-				/* Collision by id */
-				msg_err ("collision in html tag id: %d (%s) vs %d (%s)",
-						(int)tag_defs[i].id, tag_defs[i].name,
-						(int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
-			}
-
-			kh_val (html_tag_by_id, k) = tag_defs[i];
-
-			k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
-
-			if (rc == 0) {
-				/* Collision by name */
-				msg_err ("collision in html tag name: %d (%s) vs %d (%s)",
-						(int)tag_defs[i].id, tag_defs[i].name,
-						(int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
-			}
-
-			kh_val (html_tag_by_name, k) = tag_defs[i];
-		}
-
-		tags_sorted = 1;
-	}
 
-	if (!entities_sorted) {
-		html_entity_by_number = kh_init (entity_by_number);
-		html_entity_by_name = kh_init (entity_by_name);
-		kh_resize (entity_by_number, html_entity_by_number,
-				G_N_ELEMENTS (entities_defs));
-		kh_resize (entity_by_name, html_entity_by_name,
-				G_N_ELEMENTS (entities_defs));
+[[maybe_unused]] static const html_tags_storage html_tags_defs;
+[[maybe_unused]] static const html_entities_storage html_entities_defs;
 
-		for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
-			if (entities_defs[i].code != 0) {
-				k = kh_put (entity_by_number, html_entity_by_number,
-						entities_defs[i].code, &rc);
-
-				if (rc == 0) {
-					/* Collision by id */
-					gint cmp_res = strcmp (entities_defs[i].replacement,
-							kh_val (html_entity_by_number, k));
-					if (cmp_res != 0) {
-						if (strlen (entities_defs[i].replacement) <
-							strlen (kh_val (html_entity_by_number, k))) {
-							/* Shorter replacement is more likely to be valid */
-							msg_debug ("1 collision in html entity id: %d (%s); replace %s by %s",
-									(int) entities_defs[i].code, entities_defs[i].name,
-									kh_val (html_entity_by_number, k),
-									entities_defs[i].replacement);
-							kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
-						}
-						else if (strlen (entities_defs[i].replacement) ==
-								 strlen (kh_val (html_entity_by_number, k)) &&
-										 cmp_res < 0) {
-							/* Identical len but lexicographically shorter */
-							msg_debug ("collision in html entity id: %d (%s); replace %s by %s",
-									(int) entities_defs[i].code, entities_defs[i].name,
-									kh_val (html_entity_by_number, k),
-									entities_defs[i].replacement);
-							kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
-						}
-						/* Do not replace otherwise */
-					}
-					/* Identic replacement */
-				}
-				else {
-					kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
-				}
-			}
-
-			k = kh_put (entity_by_name, html_entity_by_name,
-					entities_defs[i].name, &rc);
-
-			if (rc == 0) {
-				/* Collision by name */
-				if (strcmp (kh_val (html_entity_by_number, k),
-						entities_defs[i].replacement) != 0) {
-					msg_err ("collision in html entity name: %d (%s)",
-							(int) entities_defs[i].code, entities_defs[i].name);
-				}
-			}
-
-			kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
-		}
-
-		html_color_by_name = kh_init (color_by_name);
-		kh_resize (color_by_name, html_color_by_name,
-				G_N_ELEMENTS (html_colornames));
-
-		rspamd_ftok_t *keys;
-
-		keys = g_malloc0 (sizeof (rspamd_ftok_t) *
-						  G_N_ELEMENTS (html_colornames));
-
-		for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
-			struct html_color c;
-
-			keys[i].begin = html_colornames[i].name;
-			keys[i].len = strlen (html_colornames[i].name);
-			k = kh_put (color_by_name, html_color_by_name,
-					&keys[i], &rc);
-			c.valid = true;
-			c.d.comp.r = html_colornames[i].rgb.r;
-			c.d.comp.g = html_colornames[i].rgb.g;
-			c.d.comp.b = html_colornames[i].rgb.b;
-			c.d.comp.alpha = 255;
-			kh_val (html_color_by_name, k) = c;
-
-		}
-
-		entities_sorted = 1;
-	}
-}
+static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
+												  const gchar *start, guint len,
+												  struct html_tag_component *comp);
 
 static gboolean
-rspamd_html_check_balance (GNode * node, GNode ** cur_level)
+rspamd_html_check_balance(GNode *node, GNode **cur_level)
 {
 	struct html_tag *arg = node->data, *tmp;
 	GNode *cur;
@@ -340,7 +72,7 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
 				(tmp->flags & FL_CLOSED) == 0) {
 				tmp->flags |= FL_CLOSED;
 				/* Destroy current node as we find corresponding parent node */
-				g_node_destroy (node);
+				g_node_destroy(node);
 				/* Change level */
 				*cur_level = cur->parent;
 				return TRUE;
@@ -356,8 +88,7 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
 }
 
 gint
-rspamd_html_tag_by_name (const gchar *name)
-{
+rspamd_html_tag_by_name(const gchar *name) {
 	khiter_t k;
 
 	k = kh_get (tag_by_name, html_tag_by_name, name);
@@ -370,14 +101,13 @@ rspamd_html_tag_by_name (const gchar *name)
 }
 
 gboolean
-rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
-{
+rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname) {
 	gint id;
 
 	g_assert (hc != NULL);
 	g_assert (hc->tags_seen != NULL);
 
-	id = rspamd_html_tag_by_name (tagname);
+	id = rspamd_html_tag_by_name(tagname);
 
 	if (id != -1) {
 		return isset (hc->tags_seen, id);
@@ -387,8 +117,7 @@ rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
 }
 
 const gchar *
-rspamd_html_tag_by_id (gint id)
-{
+rspamd_html_tag_by_id(gint id) {
 	khiter_t k;
 
 	k = kh_get (tag_by_id, html_tag_by_id, id);
@@ -402,8 +131,7 @@ rspamd_html_tag_by_id (gint id)
 
 /* Decode HTML entitles in text */
 guint
-rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
-{
+rspamd_html_decode_entitles_inplace(gchar *s, gsize len) {
 	goffset l, rep_len;
 	gchar *t = s, *h = s, *e = s, *end_ptr, old_c;
 	const gchar *end;
@@ -429,7 +157,7 @@ rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
 
 	while (h - s < l && t <= h) {
 		switch (state) {
-		/* Out of entity */
+			/* Out of entity */
 		case 0:
 			if (*h == '&') {
 				state = 1;
@@ -462,23 +190,24 @@ decode_entity:
 
 					if (k != kh_end (html_entity_by_name)) {
 						if (kh_val (html_entity_by_name, k)) {
-							rep_len = strlen (kh_val (html_entity_by_name, k));
+							rep_len = strlen(kh_val (html_entity_by_name, k));
 
 							if (end - t >= rep_len) {
-								memcpy (t, kh_val (html_entity_by_name, k),
+								memcpy(t, kh_val (html_entity_by_name, k),
 										rep_len);
 								t += rep_len;
 							}
-						} else {
+						}
+						else {
 							if (end - t > h - e + 1) {
-								memmove (t, e, h - e + 1);
+								memmove(t, e, h - e + 1);
 								t += h - e + 1;
 							}
 						}
 					}
 					else {
 						if (end - t > h - e + 1) {
-							memmove (t, e, h - e + 1);
+							memmove(t, e, h - e + 1);
 							t += h - e + 1;
 						}
 					}
@@ -495,10 +224,10 @@ decode_entity:
 					}
 
 					if (base == 10) {
-						uc = strtoul ((e + 2), &end_ptr, base);
+						uc = strtoul((e + 2), &end_ptr, base);
 					}
 					else {
-						uc = strtoul ((e + 3), &end_ptr, base);
+						uc = strtoul((e + 3), &end_ptr, base);
 					}
 
 					if (end_ptr != NULL && *end_ptr != '\0') {
@@ -506,7 +235,7 @@ decode_entity:
 						*h = old_c;
 
 						if (end - t > h - e + 1) {
-							memmove (t, e, h - e + 1);
+							memmove(t, e, h - e + 1);
 							t += h - e + 1;
 						}
 					}
@@ -517,16 +246,17 @@ decode_entity:
 
 						if (k != kh_end (html_entity_by_number)) {
 							if (kh_val (html_entity_by_number, k)) {
-								rep_len = strlen (kh_val (html_entity_by_number, k));
+								rep_len = strlen(kh_val (html_entity_by_number, k));
 
 								if (end - t >= rep_len) {
-									memcpy (t, kh_val (html_entity_by_number, k),
+									memcpy(t, kh_val (html_entity_by_number, k),
 											rep_len);
 									t += rep_len;
 								}
-							} else {
+							}
+							else {
 								if (end - t > h - e + 1) {
-									memmove (t, e, h - e + 1);
+									memmove(t, e, h - e + 1);
 									t += h - e + 1;
 								}
 							}
@@ -544,13 +274,13 @@ decode_entity:
 								else {
 									/* Leave invalid entities as is */
 									if (end - t > h - e + 1) {
-										memmove (t, e, h - e + 1);
+										memmove(t, e, h - e + 1);
 										t += h - e + 1;
 									}
 								}
 							}
 							else if (end - t > h - e + 1) {
-								memmove (t, e, h - e + 1);
+								memmove(t, e, h - e + 1);
 								t += h - e + 1;
 							}
 						}
@@ -569,7 +299,7 @@ decode_entity:
 				state = 1;
 
 				if (end - t > h - e) {
-					memmove (t, e, h - e);
+					memmove(t, e, h - e);
 					t += h - e;
 				}
 
@@ -581,11 +311,11 @@ decode_entity:
 				if (h + 1 < end && h[1] == 'x') {
 					seen_hex = TRUE;
 					/* Skip one more character */
-					h ++;
+					h++;
 				}
 			}
 			else if (seen_digit_only != do_mixed &&
-				(g_ascii_isdigit (*h) || (seen_hex && g_ascii_isxdigit (*h)))) {
+					 (g_ascii_isdigit (*h) || (seen_hex && g_ascii_isxdigit (*h)))) {
 				seen_digit_only = do_digits_only;
 			}
 			else {
@@ -608,7 +338,7 @@ decode_entity:
 	if (state == 1 && h > e) {
 		/* Unfinished entity, copy as is */
 		if (end - t >= h - e) {
-			memmove (t, e, h - e);
+			memmove(t, e, h - e);
 			t += h - e;
 		}
 	}
@@ -617,8 +347,7 @@ decode_entity:
 }
 
 static gboolean
-rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
-{
+rspamd_url_is_subdomain(rspamd_ftok_t *t1, rspamd_ftok_t *t2) {
 	const gchar *p1, *p2;
 
 	p1 = t1->begin + t1->len - 1;
@@ -630,7 +359,7 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
 			break;
 		}
 
-		p1 --;
+		p1--;
 	}
 
 	while (p2 > t2->begin) {
@@ -638,7 +367,7 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
 			break;
 		}
 
-		p2 --;
+		p2--;
 	}
 
 	while (p1 > t1->begin && p2 > t2->begin) {
@@ -646,8 +375,8 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
 			break;
 		}
 
-		p1 --;
-		p2 --;
+		p1--;
+		p2--;
 	}
 
 	if (p2 == t2->begin) {
@@ -666,13 +395,12 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
 }
 
 static void
-rspamd_html_url_is_phished (rspamd_mempool_t *pool,
-	struct rspamd_url *href_url,
-	const guchar *url_text,
-	gsize len,
-	gboolean *url_found,
-	struct rspamd_url **ptext_url)
-{
+rspamd_html_url_is_phished(rspamd_mempool_t *pool,
+						   struct rspamd_url *href_url,
+						   const guchar *url_text,
+						   gsize len,
+						   gboolean *url_found,
+						   struct rspamd_url **ptext_url) {
 	struct rspamd_url *text_url;
 	rspamd_ftok_t disp_tok, href_tok;
 	gint rc;
@@ -688,23 +416,23 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 	*url_found = FALSE;
 #if U_ICU_VERSION_MAJOR_NUM >= 46
 	if (udn == NULL) {
-		udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
+		udn = uidna_openUTS46(UIDNA_DEFAULT, &uc_err);
 
 		if (uc_err != U_ZERO_ERROR) {
-			msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
+			msg_err_pool ("cannot init idna converter: %s", u_errorName(uc_err));
 		}
 	}
 #endif
 
 	while (url_text < end && g_ascii_isspace (*url_text)) {
-		url_text ++;
+		url_text++;
 	}
 
 	if (end > url_text + 4 &&
-			rspamd_url_find (pool, url_text, end - url_text, &url_str,
-					RSPAMD_URL_FIND_ALL,
-					&url_pos, NULL) &&
-			url_str != NULL) {
+		rspamd_url_find(pool, url_text, end - url_text, &url_str,
+				RSPAMD_URL_FIND_ALL,
+				&url_pos, NULL) &&
+		url_str != NULL) {
 		if (url_pos > 0) {
 			/*
 			 * We have some url at some offset, so we need to check what is
@@ -722,25 +450,25 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 			}
 		}
 
-		text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
-		rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
+		text_url = rspamd_mempool_alloc0 (pool, sizeof(struct rspamd_url));
+		rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
 				RSPAMD_URL_PARSE_TEXT);
 
 		if (rc == URI_ERRNO_OK) {
 			disp_tok.len = text_url->hostlen;
 			disp_tok.begin = rspamd_url_host_unsafe (text_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-			if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url),
+			if (rspamd_substring_search_caseless(rspamd_url_host_unsafe (text_url),
 					text_url->hostlen, "xn--", 4) != -1) {
 				idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
 				/* We need to convert it to the normal value first */
-				disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
+				disp_tok.len = uidna_nameToUnicodeUTF8(udn,
 						rspamd_url_host_unsafe (text_url), text_url->hostlen,
 						idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
 
 				if (uc_err != U_ZERO_ERROR) {
 					msg_err_pool ("cannot convert to IDN: %s",
-							u_errorName (uc_err));
+							u_errorName(uc_err));
 					disp_tok.len = text_url->hostlen;
 				}
 				else {
@@ -751,17 +479,17 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 			href_tok.len = href_url->hostlen;
 			href_tok.begin = rspamd_url_host_unsafe (href_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-			if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url),
+			if (rspamd_substring_search_caseless(rspamd_url_host_unsafe (href_url),
 					href_url->hostlen, "xn--", 4) != -1) {
 				idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
 				/* We need to convert it to the normal value first */
-				href_tok.len = uidna_nameToUnicodeUTF8 (udn,
+				href_tok.len = uidna_nameToUnicodeUTF8(udn,
 						rspamd_url_host_unsafe (href_url), href_url->hostlen,
 						idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
 
 				if (uc_err != U_ZERO_ERROR) {
 					msg_err_pool ("cannot convert to IDN: %s",
-							u_errorName (uc_err));
+							u_errorName(uc_err));
 					href_tok.len = href_url->hostlen;
 				}
 				else {
@@ -769,24 +497,24 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 				}
 			}
 #endif
-			if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 &&
-					text_url->tldlen > 0 && href_url->tldlen > 0) {
+			if (rspamd_ftok_casecmp(&disp_tok, &href_tok) != 0 &&
+				text_url->tldlen > 0 && href_url->tldlen > 0) {
 
*** OUTPUT TRUNCATED, 6108 LINES SKIPPED ***


More information about the Commits mailing list