commit 4c87703: [Rework] Move entities/tags handling
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri May 21 15:35:05 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-21 09:18:07 +0100
URL: https://github.com/rspamd/rspamd/commit/4c87703334b12bcb0981547591463be0bd58b1ae
[Rework] Move entities/tags handling
---
src/libserver/CMakeLists.txt | 2 +-
src/libserver/html/{html.c => html.cc} | 1036 ++++++---------
src/libserver/html/html_entities.h | 2164 -------------------------------
src/libserver/html/html_entities.hxx | 2196 ++++++++++++++++++++++++++++++++
src/libserver/logger.h | 1 +
src/libutil/cxx/util.hxx | 11 +
6 files changed, 2584 insertions(+), 2826 deletions(-)
diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt
index b17d55e4f..e8267292c 100644
--- a/src/libserver/CMakeLists.txt
+++ b/src/libserver/CMakeLists.txt
@@ -34,7 +34,7 @@ SET(LIBRSPAMDSERVERSRC
${CMAKE_CURRENT_SOURCE_DIR}/http/http_context.c
${CMAKE_CURRENT_SOURCE_DIR}/maps/map.c
${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
- ${CMAKE_CURRENT_SOURCE_DIR}/html/html.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cc
${LIBCSSSRC})
# Librspamd-server
diff --git a/src/libserver/html/html.c b/src/libserver/html/html.cc
similarity index 67%
rename from src/libserver/html/html.c
rename to src/libserver/html/html.cc
index cfdd0acef..e650cc3e4 100644
--- a/src/libserver/html/html.c
+++ b/src/libserver/html/html.cc
@@ -20,29 +20,27 @@
#include "html.h"
#include "html_tags.h"
#include "html_colors.h"
-#include "html_entities.h"
+
#include "url.h"
#include "contrib/libucl/khash.h"
#include "libmime/images.h"
#include "css/css.h"
#include "libutil/cxx/utf8_util.h"
+#include "html_tag_defs.hxx"
+#include "html_entities.hxx"
+
+#include <vector>
+
#include <unicode/uversion.h>
#include <unicode/ucnv.h>
#if U_ICU_VERSION_MAJOR_NUM >= 46
#include <unicode/uidna.h>
#endif
-static sig_atomic_t tags_sorted = 0;
-static sig_atomic_t entities_sorted = 0;
-static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
+namespace rspamd::html {
-struct html_tag_def {
- const gchar *name;
- gint16 id;
- guint16 len;
- guint flags;
-};
+static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
#define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
rspamd_html_log_id, "html", pool->tag.uid, \
@@ -51,282 +49,16 @@ struct html_tag_def {
INIT_LOG_MODULE(html)
-#define TAG_DEF(id, name, flags) {(name), (id), (sizeof(name) - 1), (flags)}
-
-static struct html_tag_def tag_defs[] = {
- /* W3C defined elements */
- TAG_DEF(Tag_A, "a", FL_HREF),
- TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
- TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
- TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
- TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
- TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
- TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
- TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
- TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
- TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
- TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
- TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_BUTTON, "button", (CM_INLINE|FL_BLOCK)),
- TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
- TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
- TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
- TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
- TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
- TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
- TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
- TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
- TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
- TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_DIV, "div", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_DL, "dl", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
- TAG_DEF(Tag_EM, "em", (CM_INLINE)),
- TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
- TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
- TAG_DEF(Tag_FORM, "form", (CM_BLOCK|FL_HREF)),
- TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
- TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
- TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
- TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
- TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
- TAG_DEF(Tag_I, "i", (CM_INLINE)),
- TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
- TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
- TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
- TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
- TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
- TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
- TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_LINK, "link", (CM_EMPTY|FL_HREF)),
- TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
- TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
- TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
- TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
- TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
- TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
- TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
- TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
- TAG_DEF(Tag_Q, "q", (CM_INLINE)),
- TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
- TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
- TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
- TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
- TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
- TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
- TAG_DEF(Tag_S, "s", (CM_INLINE)),
- TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
- TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
- TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
- TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
- TAG_DEF(Tag_SPAN, "span", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
- TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
- TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
- TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
- TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
- TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
- TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT| FL_BLOCK)),
- TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
- TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
- TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
- TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
- TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
- TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT| FL_BLOCK)),
- TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
- TAG_DEF(Tag_U, "u", (CM_INLINE)),
- TAG_DEF(Tag_UL, "ul", (CM_BLOCK|FL_BLOCK)),
- TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
- TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
- TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
-
- /* proprietary elements */
- TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
- TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
- TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
- TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
- TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
- TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
- TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
- TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
- TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
- TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
- TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
- TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
- TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
- TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
- TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
- TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
- TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)),
-};
-
-KHASH_MAP_INIT_INT (entity_by_number, const char *);
-KHASH_MAP_INIT_STR (entity_by_name, const char *);
-KHASH_MAP_INIT_STR (tag_by_name, struct html_tag_def);
-KHASH_MAP_INIT_INT (tag_by_id, struct html_tag_def);
-KHASH_INIT (color_by_name, const rspamd_ftok_t *, struct html_color, true,
- rspamd_ftok_icase_hash, rspamd_ftok_icase_equal);
-
-khash_t(entity_by_number) *html_entity_by_number;
-khash_t(entity_by_name) *html_entity_by_name;
-khash_t(tag_by_name) *html_tag_by_name;
-khash_t(tag_by_id) *html_tag_by_id;
-khash_t(color_by_name) *html_color_by_name;
-
-static struct rspamd_url *rspamd_html_process_url (rspamd_mempool_t *pool,
- const gchar *start, guint len,
- struct html_tag_component *comp);
-
-static void
-rspamd_html_library_init (void)
-{
- guint i;
- khiter_t k;
- gint rc;
-
- if (!tags_sorted) {
- html_tag_by_id = kh_init (tag_by_id);
- html_tag_by_name = kh_init (tag_by_name);
- kh_resize (tag_by_id, html_tag_by_id, G_N_ELEMENTS (tag_defs));
- kh_resize (tag_by_name, html_tag_by_name, G_N_ELEMENTS (tag_defs));
-
- for (i = 0; i < G_N_ELEMENTS (tag_defs); i++) {
- k = kh_put (tag_by_id, html_tag_by_id, tag_defs[i].id, &rc);
-
- if (rc == 0) {
- /* Collision by id */
- msg_err ("collision in html tag id: %d (%s) vs %d (%s)",
- (int)tag_defs[i].id, tag_defs[i].name,
- (int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
- }
-
- kh_val (html_tag_by_id, k) = tag_defs[i];
-
- k = kh_put (tag_by_name, html_tag_by_name, tag_defs[i].name, &rc);
-
- if (rc == 0) {
- /* Collision by name */
- msg_err ("collision in html tag name: %d (%s) vs %d (%s)",
- (int)tag_defs[i].id, tag_defs[i].name,
- (int)kh_val (html_tag_by_id, k).id, kh_val (html_tag_by_id, k).name);
- }
-
- kh_val (html_tag_by_name, k) = tag_defs[i];
- }
-
- tags_sorted = 1;
- }
- if (!entities_sorted) {
- html_entity_by_number = kh_init (entity_by_number);
- html_entity_by_name = kh_init (entity_by_name);
- kh_resize (entity_by_number, html_entity_by_number,
- G_N_ELEMENTS (entities_defs));
- kh_resize (entity_by_name, html_entity_by_name,
- G_N_ELEMENTS (entities_defs));
+[[maybe_unused]] static const html_tags_storage html_tags_defs;
+[[maybe_unused]] static const html_entities_storage html_entities_defs;
- for (i = 0; i < G_N_ELEMENTS (entities_defs); i++) {
- if (entities_defs[i].code != 0) {
- k = kh_put (entity_by_number, html_entity_by_number,
- entities_defs[i].code, &rc);
-
- if (rc == 0) {
- /* Collision by id */
- gint cmp_res = strcmp (entities_defs[i].replacement,
- kh_val (html_entity_by_number, k));
- if (cmp_res != 0) {
- if (strlen (entities_defs[i].replacement) <
- strlen (kh_val (html_entity_by_number, k))) {
- /* Shorter replacement is more likely to be valid */
- msg_debug ("1 collision in html entity id: %d (%s); replace %s by %s",
- (int) entities_defs[i].code, entities_defs[i].name,
- kh_val (html_entity_by_number, k),
- entities_defs[i].replacement);
- kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
- }
- else if (strlen (entities_defs[i].replacement) ==
- strlen (kh_val (html_entity_by_number, k)) &&
- cmp_res < 0) {
- /* Identical len but lexicographically shorter */
- msg_debug ("collision in html entity id: %d (%s); replace %s by %s",
- (int) entities_defs[i].code, entities_defs[i].name,
- kh_val (html_entity_by_number, k),
- entities_defs[i].replacement);
- kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
- }
- /* Do not replace otherwise */
- }
- /* Identic replacement */
- }
- else {
- kh_val (html_entity_by_number, k) = entities_defs[i].replacement;
- }
- }
-
- k = kh_put (entity_by_name, html_entity_by_name,
- entities_defs[i].name, &rc);
-
- if (rc == 0) {
- /* Collision by name */
- if (strcmp (kh_val (html_entity_by_number, k),
- entities_defs[i].replacement) != 0) {
- msg_err ("collision in html entity name: %d (%s)",
- (int) entities_defs[i].code, entities_defs[i].name);
- }
- }
-
- kh_val (html_entity_by_name, k) = entities_defs[i].replacement;
- }
-
- html_color_by_name = kh_init (color_by_name);
- kh_resize (color_by_name, html_color_by_name,
- G_N_ELEMENTS (html_colornames));
-
- rspamd_ftok_t *keys;
-
- keys = g_malloc0 (sizeof (rspamd_ftok_t) *
- G_N_ELEMENTS (html_colornames));
-
- for (i = 0; i < G_N_ELEMENTS (html_colornames); i ++) {
- struct html_color c;
-
- keys[i].begin = html_colornames[i].name;
- keys[i].len = strlen (html_colornames[i].name);
- k = kh_put (color_by_name, html_color_by_name,
- &keys[i], &rc);
- c.valid = true;
- c.d.comp.r = html_colornames[i].rgb.r;
- c.d.comp.g = html_colornames[i].rgb.g;
- c.d.comp.b = html_colornames[i].rgb.b;
- c.d.comp.alpha = 255;
- kh_val (html_color_by_name, k) = c;
-
- }
-
- entities_sorted = 1;
- }
-}
+static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
+ const gchar *start, guint len,
+ struct html_tag_component *comp);
static gboolean
-rspamd_html_check_balance (GNode * node, GNode ** cur_level)
+rspamd_html_check_balance(GNode *node, GNode **cur_level)
{
struct html_tag *arg = node->data, *tmp;
GNode *cur;
@@ -340,7 +72,7 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
(tmp->flags & FL_CLOSED) == 0) {
tmp->flags |= FL_CLOSED;
/* Destroy current node as we find corresponding parent node */
- g_node_destroy (node);
+ g_node_destroy(node);
/* Change level */
*cur_level = cur->parent;
return TRUE;
@@ -356,8 +88,7 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
}
gint
-rspamd_html_tag_by_name (const gchar *name)
-{
+rspamd_html_tag_by_name(const gchar *name) {
khiter_t k;
k = kh_get (tag_by_name, html_tag_by_name, name);
@@ -370,14 +101,13 @@ rspamd_html_tag_by_name (const gchar *name)
}
gboolean
-rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
-{
+rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname) {
gint id;
g_assert (hc != NULL);
g_assert (hc->tags_seen != NULL);
- id = rspamd_html_tag_by_name (tagname);
+ id = rspamd_html_tag_by_name(tagname);
if (id != -1) {
return isset (hc->tags_seen, id);
@@ -387,8 +117,7 @@ rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
}
const gchar *
-rspamd_html_tag_by_id (gint id)
-{
+rspamd_html_tag_by_id(gint id) {
khiter_t k;
k = kh_get (tag_by_id, html_tag_by_id, id);
@@ -402,8 +131,7 @@ rspamd_html_tag_by_id (gint id)
/* Decode HTML entitles in text */
guint
-rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
-{
+rspamd_html_decode_entitles_inplace(gchar *s, gsize len) {
goffset l, rep_len;
gchar *t = s, *h = s, *e = s, *end_ptr, old_c;
const gchar *end;
@@ -429,7 +157,7 @@ rspamd_html_decode_entitles_inplace (gchar *s, gsize len)
while (h - s < l && t <= h) {
switch (state) {
- /* Out of entity */
+ /* Out of entity */
case 0:
if (*h == '&') {
state = 1;
@@ -462,23 +190,24 @@ decode_entity:
if (k != kh_end (html_entity_by_name)) {
if (kh_val (html_entity_by_name, k)) {
- rep_len = strlen (kh_val (html_entity_by_name, k));
+ rep_len = strlen(kh_val (html_entity_by_name, k));
if (end - t >= rep_len) {
- memcpy (t, kh_val (html_entity_by_name, k),
+ memcpy(t, kh_val (html_entity_by_name, k),
rep_len);
t += rep_len;
}
- } else {
+ }
+ else {
if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
+ memmove(t, e, h - e + 1);
t += h - e + 1;
}
}
}
else {
if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
+ memmove(t, e, h - e + 1);
t += h - e + 1;
}
}
@@ -495,10 +224,10 @@ decode_entity:
}
if (base == 10) {
- uc = strtoul ((e + 2), &end_ptr, base);
+ uc = strtoul((e + 2), &end_ptr, base);
}
else {
- uc = strtoul ((e + 3), &end_ptr, base);
+ uc = strtoul((e + 3), &end_ptr, base);
}
if (end_ptr != NULL && *end_ptr != '\0') {
@@ -506,7 +235,7 @@ decode_entity:
*h = old_c;
if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
+ memmove(t, e, h - e + 1);
t += h - e + 1;
}
}
@@ -517,16 +246,17 @@ decode_entity:
if (k != kh_end (html_entity_by_number)) {
if (kh_val (html_entity_by_number, k)) {
- rep_len = strlen (kh_val (html_entity_by_number, k));
+ rep_len = strlen(kh_val (html_entity_by_number, k));
if (end - t >= rep_len) {
- memcpy (t, kh_val (html_entity_by_number, k),
+ memcpy(t, kh_val (html_entity_by_number, k),
rep_len);
t += rep_len;
}
- } else {
+ }
+ else {
if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
+ memmove(t, e, h - e + 1);
t += h - e + 1;
}
}
@@ -544,13 +274,13 @@ decode_entity:
else {
/* Leave invalid entities as is */
if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
+ memmove(t, e, h - e + 1);
t += h - e + 1;
}
}
}
else if (end - t > h - e + 1) {
- memmove (t, e, h - e + 1);
+ memmove(t, e, h - e + 1);
t += h - e + 1;
}
}
@@ -569,7 +299,7 @@ decode_entity:
state = 1;
if (end - t > h - e) {
- memmove (t, e, h - e);
+ memmove(t, e, h - e);
t += h - e;
}
@@ -581,11 +311,11 @@ decode_entity:
if (h + 1 < end && h[1] == 'x') {
seen_hex = TRUE;
/* Skip one more character */
- h ++;
+ h++;
}
}
else if (seen_digit_only != do_mixed &&
- (g_ascii_isdigit (*h) || (seen_hex && g_ascii_isxdigit (*h)))) {
+ (g_ascii_isdigit (*h) || (seen_hex && g_ascii_isxdigit (*h)))) {
seen_digit_only = do_digits_only;
}
else {
@@ -608,7 +338,7 @@ decode_entity:
if (state == 1 && h > e) {
/* Unfinished entity, copy as is */
if (end - t >= h - e) {
- memmove (t, e, h - e);
+ memmove(t, e, h - e);
t += h - e;
}
}
@@ -617,8 +347,7 @@ decode_entity:
}
static gboolean
-rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
-{
+rspamd_url_is_subdomain(rspamd_ftok_t *t1, rspamd_ftok_t *t2) {
const gchar *p1, *p2;
p1 = t1->begin + t1->len - 1;
@@ -630,7 +359,7 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
break;
}
- p1 --;
+ p1--;
}
while (p2 > t2->begin) {
@@ -638,7 +367,7 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
break;
}
- p2 --;
+ p2--;
}
while (p1 > t1->begin && p2 > t2->begin) {
@@ -646,8 +375,8 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
break;
}
- p1 --;
- p2 --;
+ p1--;
+ p2--;
}
if (p2 == t2->begin) {
@@ -666,13 +395,12 @@ rspamd_url_is_subdomain (rspamd_ftok_t *t1, rspamd_ftok_t *t2)
}
static void
-rspamd_html_url_is_phished (rspamd_mempool_t *pool,
- struct rspamd_url *href_url,
- const guchar *url_text,
- gsize len,
- gboolean *url_found,
- struct rspamd_url **ptext_url)
-{
+rspamd_html_url_is_phished(rspamd_mempool_t *pool,
+ struct rspamd_url *href_url,
+ const guchar *url_text,
+ gsize len,
+ gboolean *url_found,
+ struct rspamd_url **ptext_url) {
struct rspamd_url *text_url;
rspamd_ftok_t disp_tok, href_tok;
gint rc;
@@ -688,23 +416,23 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
*url_found = FALSE;
#if U_ICU_VERSION_MAJOR_NUM >= 46
if (udn == NULL) {
- udn = uidna_openUTS46 (UIDNA_DEFAULT, &uc_err);
+ udn = uidna_openUTS46(UIDNA_DEFAULT, &uc_err);
if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot init idna converter: %s", u_errorName (uc_err));
+ msg_err_pool ("cannot init idna converter: %s", u_errorName(uc_err));
}
}
#endif
while (url_text < end && g_ascii_isspace (*url_text)) {
- url_text ++;
+ url_text++;
}
if (end > url_text + 4 &&
- rspamd_url_find (pool, url_text, end - url_text, &url_str,
- RSPAMD_URL_FIND_ALL,
- &url_pos, NULL) &&
- url_str != NULL) {
+ rspamd_url_find(pool, url_text, end - url_text, &url_str,
+ RSPAMD_URL_FIND_ALL,
+ &url_pos, NULL) &&
+ url_str != NULL) {
if (url_pos > 0) {
/*
* We have some url at some offset, so we need to check what is
@@ -722,25 +450,25 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
}
}
- text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
+ text_url = rspamd_mempool_alloc0 (pool, sizeof(struct rspamd_url));
+ rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK) {
disp_tok.len = text_url->hostlen;
disp_tok.begin = rspamd_url_host_unsafe (text_url);
#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url),
+ if (rspamd_substring_search_caseless(rspamd_url_host_unsafe (text_url),
text_url->hostlen, "xn--", 4) != -1) {
idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
/* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
+ disp_tok.len = uidna_nameToUnicodeUTF8(udn,
rspamd_url_host_unsafe (text_url), text_url->hostlen,
idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
if (uc_err != U_ZERO_ERROR) {
msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
+ u_errorName(uc_err));
disp_tok.len = text_url->hostlen;
}
else {
@@ -751,17 +479,17 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
href_tok.len = href_url->hostlen;
href_tok.begin = rspamd_url_host_unsafe (href_url);
#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url),
+ if (rspamd_substring_search_caseless(rspamd_url_host_unsafe (href_url),
href_url->hostlen, "xn--", 4) != -1) {
idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
/* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8 (udn,
+ href_tok.len = uidna_nameToUnicodeUTF8(udn,
rspamd_url_host_unsafe (href_url), href_url->hostlen,
idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
if (uc_err != U_ZERO_ERROR) {
msg_err_pool ("cannot convert to IDN: %s",
- u_errorName (uc_err));
+ u_errorName(uc_err));
href_tok.len = href_url->hostlen;
}
else {
@@ -769,24 +497,24 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
}
}
#endif
- if (rspamd_ftok_casecmp (&disp_tok, &href_tok) != 0 &&
- text_url->tldlen > 0 && href_url->tldlen > 0) {
+ if (rspamd_ftok_casecmp(&disp_tok, &href_tok) != 0 &&
+ text_url->tldlen > 0 && href_url->tldlen > 0) {
*** OUTPUT TRUNCATED, 6108 LINES SKIPPED ***
More information about the Commits mailing list