commit 882ea33: [Rework] Html: Further html urls rework
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu May 27 14:07:07 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-25 12:15:30 +0100
URL: https://github.com/rspamd/rspamd/commit/882ea3395af8bfb9929ff5308ecc8cfd959fd761
[Rework] Html: Further html urls rework
---
src/libserver/html/html.cxx | 403 +++++++---------------------------------
src/libserver/html/html.h | 19 +-
src/libserver/html/html_url.cxx | 220 +++++++++++++++++++++-
src/libserver/html/html_url.hxx | 24 +++
4 files changed, 322 insertions(+), 344 deletions(-)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index c167b004f..c384a9023 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -30,6 +30,7 @@
#include "html_tag_defs.hxx"
#include "html_entities.hxx"
#include "html_tag.hxx"
+#include "html_url.hxx"
#include <vector>
#include <frozen/unordered_map.h>
@@ -633,273 +634,76 @@ parse_tag_content(rspamd_mempool_t *pool,
parser_env.cur_state = state;
}
-}
-
-/* Unconverted C part */
-
-static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
- const gchar *start, guint len,
- struct html_tag_component *comp);
-
-
-
-
-struct rspamd_url *
-rspamd_html_process_url(rspamd_mempool_t *pool, const gchar *start, guint len,
- struct html_tag_component *comp) {
- struct rspamd_url *url;
- guint saved_flags = 0;
- gchar *decoded;
- gint rc;
- gsize decoded_len;
- const gchar *p, *s, *prefix = "http://";
- gchar *d;
- guint i;
- gsize dlen;
- gboolean has_bad_chars = FALSE, no_prefix = FALSE;
- static const gchar hexdigests[] = "0123456789abcdef";
-
- p = start;
-
- /* Strip spaces from the url */
- /* Head spaces */
- while (p < start + len && g_ascii_isspace (*p)) {
- p++;
- start++;
- len--;
- }
-
- if (comp) {
- comp->start = (guchar *)p;
- comp->len = len;
- }
-
- /* Trailing spaces */
- p = start + len - 1;
-
- while (p >= start && g_ascii_isspace (*p)) {
- p--;
- len--;
-
- if (comp) {
- comp->len--;
- }
- }
+static auto
+html_process_url_tag(rspamd_mempool_t *pool,
+ struct html_tag *tag,
+ struct html_content *hc) -> std::optional<struct rspamd_url *>
+{
+ auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
- s = start;
- dlen = 0;
+ if (found_href_it != tag->parameters.end()) {
+ /* Check base url */
+ auto &href_value = found_href_it->second;
- for (i = 0; i < len; i++) {
- if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
- dlen += 3;
- }
- else {
- dlen++;
- }
- }
+ if (hc && hc->base_url && href_value.size() > 2) {
+ /*
+ * Relative url cannot start from the following:
+ * schema://
+ * data:
+ * slash
+ */
- if (rspamd_substring_search(start, len, "://", 3) == -1) {
- if (len >= sizeof("mailto:") &&
- (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
- memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
- memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
- /* Exclusion, has valid but 'strange' prefix */
- }
- else {
- for (i = 0; i < len; i++) {
- if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
- if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
- prefix = "http:";
- dlen += sizeof("http:") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == '@') {
- /* Likely email prefix */
- prefix = "mailto://";
- dlen += sizeof("mailto://") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == ':' && i != 0) {
- /* Special case */
- no_prefix = FALSE;
- }
- else {
- if (i == 0) {
- /* No valid data */
- return NULL;
- }
- else {
- no_prefix = TRUE;
- dlen += strlen(prefix);
- }
- }
+ if (rspamd_substring_search(href_value.data(), href_value.size(), "://", 3) == -1) {
- break;
+ if (href_value.size() >= sizeof("data:") &&
+ g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
+ /* Image data url, never insert as url */
+ return std::nullopt;
}
- }
- }
- }
-
- decoded = (char *)rspamd_mempool_alloc (pool, dlen + 1);
- d = decoded;
-
- if (no_prefix) {
- gsize plen = strlen(prefix);
- memcpy(d, prefix, plen);
- d += plen;
- }
-
- /*
- * We also need to remove all internal newlines, spaces
- * and encode unsafe characters
- */
- for (i = 0; i < len; i++) {
- if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
- continue;
- }
- else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
- /* URL encode */
- *d++ = '%';
- *d++ = hexdigests[(s[i] >> 4) & 0xf];
- *d++ = hexdigests[s[i] & 0xf];
- has_bad_chars = TRUE;
- }
- else {
- *d++ = s[i];
- }
- }
-
- *d = '\0';
- dlen = d - decoded;
- url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+ /* Assume relative url */
+ auto need_slash = false;
- rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
+ auto orig_len = href_value.size();
+ auto len = orig_len + hc->base_url->urllen;
- rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
- /* Filter some completely damaged urls */
- if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
- !((url->protocol & PROTOCOL_UNKNOWN))) {
- url->flags |= saved_flags;
-
- if (has_bad_chars) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (no_prefix) {
- url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+ if (hc->base_url->datalen == 0) {
+ need_slash = true;
+ len++;
+ }
- if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
- /* Ignore urls with both no schema and no tld */
- return NULL;
+				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+				/* Every %*s takes an integer length followed by a string pointer */
+				auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, "%*s%s%*s",
+						hc->base_url->urllen, hc->base_url->string,
+						need_slash ? "/" : "",
+						(gint) orig_len, href_value.data());
+				href_value = {buf, nlen};
+ }
+ else if (href_value[0] == '/' && href_value[1] != '/') {
+ /* Relative to the hostname */
+ auto orig_len = href_value.size();
+ auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
+ 3 /* for :// */;
+ auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+ auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
+ hc->base_url->protocollen, hc->base_url->string,
+ hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
+ (gint)orig_len, href_value.data());
+ href_value = {buf, nlen};
}
}
- decoded = url->string;
- decoded_len = url->urllen;
+ auto url = html_process_url(pool, href_value);
- if (comp) {
- comp->start = (guchar *)decoded;
- comp->len = decoded_len;
- }
- /* Spaces in href usually mean an attempt to obfuscate URL */
- /* See https://github.com/vstakhov/rspamd/issues/593 */
-#if 0
- if (has_spaces) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+ if (url && tag->extra == nullptr) {
+ tag->extra = url.value();
}
-#endif
return url;
}
- return NULL;
-}
-
-static struct rspamd_url *
-rspamd_html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc) {
- struct html_tag_component *comp;
- GList *cur;
- struct rspamd_url *url;
- const gchar *start;
- gsize len;
-
- cur = tag->params->head;
-
- while (cur) {
- comp = (struct html_tag_component *)cur->data;
-
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- start = (char *)comp->start;
- len = comp->len;
-
- /* Check base url */
- if (hc && hc->base_url && comp->len > 2) {
- /*
- * Relative url cannot start from the following:
- * schema://
- * data:
- * slash
- */
- gchar *buf;
- gsize orig_len;
-
- if (rspamd_substring_search(start, len, "://", 3) == -1) {
-
- if (len >= sizeof("data:") &&
- g_ascii_strncasecmp(start, "data:", sizeof("data:") - 1) == 0) {
- /* Image data url, never insert as url */
- return NULL;
- }
-
- /* Assume relative url */
-
- gboolean need_slash = FALSE;
-
- orig_len = len;
- len += hc->base_url->urllen;
-
- if (hc->base_url->datalen == 0) {
- need_slash = TRUE;
- len++;
- }
-
- buf = (char *)rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf(buf, len + 1, "%*s%s%*s",
- hc->base_url->urllen, hc->base_url->string,
- need_slash ? "/" : "",
- (gint) orig_len, start);
- start = buf;
- }
- else if (start[0] == '/' && start[1] != '/') {
- /* Relative to the hostname */
- orig_len = len;
- len += hc->base_url->hostlen + hc->base_url->protocollen +
- 3 /* for :// */;
- buf = (char *)rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
- hc->base_url->protocollen, hc->base_url->string,
- hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
- (gint) orig_len, start);
- start = buf;
- }
- }
-
- url = rspamd_html_process_url(pool, start, len, comp);
-
- if (url && tag->extra == NULL) {
- tag->extra = url;
- }
-
- return url;
- }
-
- cur = g_list_next (cur);
- }
-
- return NULL;
+ return std::nullopt;
}
struct rspamd_html_url_query_cbd {
@@ -910,8 +714,9 @@ struct rspamd_html_url_query_cbd {
};
static gboolean
-rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
- gsize end_offset, gpointer ud) {
+html_url_query_callback(struct rspamd_url *url, gsize start_offset,
+ gsize end_offset, gpointer ud)
+{
struct rspamd_html_url_query_cbd *cbd =
(struct rspamd_html_url_query_cbd *) ud;
rspamd_mempool_t *pool;
@@ -939,9 +744,10 @@ rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
}
static void
-rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
- khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls) {
+process_html_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
+ khash_t (rspamd_url_hash) *url_set,
+ GPtrArray *part_urls)
+{
if (url->querylen > 0) {
struct rspamd_html_url_query_cbd qcbd;
@@ -953,7 +759,7 @@ rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
rspamd_url_find_multiple(pool,
rspamd_url_query_unsafe (url), url->querylen,
RSPAMD_URL_FIND_ALL, NULL,
- rspamd_html_url_query_callback, &qcbd);
+ html_url_query_callback, &qcbd);
}
if (part_urls) {
@@ -1013,10 +819,12 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
}
static void
-rspamd_html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls,
- GByteArray *dest) {
+html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+ struct html_content *hc,
+ khash_t (rspamd_url_hash) *url_set,
+ GPtrArray *part_urls,
+ GByteArray *dest)
+{
struct html_tag_component *comp;
struct html_image *img;
rspamd_ftok_t fstr;
@@ -1205,6 +1013,10 @@ rspamd_html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
}
}
+}
+
+/* Unconverted C part */
+
static void
rspamd_html_process_color(const gchar *line, guint len, struct html_color *cl)
{
@@ -1764,80 +1576,7 @@ rspamd_html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
tag->extra = bl;
}
-static void
-rspamd_html_check_displayed_url(rspamd_mempool_t *pool,
- GList **exceptions,
- khash_t (rspamd_url_hash) *url_set,
- GByteArray *dest,
- gint href_offset,
- struct rspamd_url *url) {
- struct rspamd_url *displayed_url = NULL;
- struct rspamd_url *turl;
- gboolean url_found = FALSE;
- struct rspamd_process_exception *ex;
- guint saved_flags = 0;
- gsize dlen;
-
- if (href_offset < 0) {
- /* No dispalyed url, just some text within <a> tag */
- return;
- }
-
- url->visible_part = (gchar *)rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
- rspamd_strlcpy(url->visible_part,
- reinterpret_cast<const gchar *>(dest->data + href_offset),
- dest->len - href_offset + 1);
- dlen = dest->len - href_offset;
-
- /* Strip unicode spaces from the start and the end */
- url->visible_part = rspamd_string_unicode_trim_inplace(url->visible_part,
- &dlen);
- rspamd_html_url_is_phished(pool, url,
- reinterpret_cast<const guchar *>(url->visible_part),
- dlen,
- &url_found, &displayed_url);
-
- if (url_found) {
- url->flags |= saved_flags | RSPAMD_URL_FLAG_DISPLAY_URL;
- }
-
- if (exceptions && url_found) {
- ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
- ex->pos = href_offset;
- ex->len = dest->len - href_offset;
- ex->type = RSPAMD_EXCEPTION_URL;
- ex->ptr = url;
-
- *exceptions = g_list_prepend(*exceptions,
- ex);
- }
-
- if (displayed_url && url_set) {
- turl = rspamd_url_set_add_or_return(url_set,
- displayed_url);
- if (turl != NULL) {
- /* Here, we assume the following:
- * if we have a URL in the text part which
- * is the same as displayed URL in the
- * HTML part, we assume that it is also
- * hint only.
- */
- if (turl->flags &
- RSPAMD_URL_FLAG_FROM_TEXT) {
- turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
- }
-
- turl->count++;
- }
- else {
- /* Already inserted by `rspamd_url_set_add_or_return` */
- }
- }
-
- rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
-}
static gboolean
rspamd_html_propagate_lengths(GNode *node, gpointer _unused) {
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 14217b2c9..afa46eb06 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -46,7 +46,6 @@ extern "C" {
struct rspamd_image;
-struct html_tag;
struct html_image {
guint height;
@@ -55,7 +54,7 @@ struct html_image {
gchar *src;
struct rspamd_url *url;
struct rspamd_image *embedded_image;
- struct html_tag *tag;
+ void *tag;
};
struct html_color {
@@ -79,7 +78,7 @@ struct html_color {
};
struct html_block {
- struct html_tag *tag;
+ void *tag;
struct html_color font_color;
struct html_color background_color;
//struct html_tag_component style;
@@ -101,8 +100,6 @@ struct html_block {
#define FL_HREF (1 << 29)
#define FL_IMAGE (1 << 30)
-
-
/* Forwarded declaration */
struct rspamd_task;
@@ -122,13 +119,13 @@ struct html_content {
/*
* Decode HTML entitles in text. Text is modified in place.
*/
-guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
+guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
-GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
+GByteArray *rspamd_html_process_part(rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in);
-GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
+GByteArray *rspamd_html_process_part_full(rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
@@ -138,21 +135,21 @@ GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
/*
* Returns true if a specified tag has been seen in a part
*/
-gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
+gboolean rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname);
/**
* Returns name for the specified tag id
* @param id
* @return
*/
-const gchar *rspamd_html_tag_by_id (gint id);
+const gchar *rspamd_html_tag_by_id(gint id);
/**
* Returns HTML tag id by name
* @param name
* @return
*/
-gint rspamd_html_tag_by_name (const gchar *name);
+gint rspamd_html_tag_by_name(const gchar *name);
/**
* Extract URL from HTML tag component and sets component elements if needed
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx
index 93728119b..5c4fb8d56 100644
--- a/src/libserver/html/html_url.cxx
+++ b/src/libserver/html/html_url.cxx
@@ -18,6 +18,7 @@
#include "libutil/str_util.h"
#include "libserver/url.h"
#include "libserver/logger.h"
+#include "rspamd.h"
#include <unicode/idna.h>
@@ -137,7 +138,7 @@ html_url_is_phished(rspamd_mempool_t *pool,
if (text_data.size() > 4 &&
rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
RSPAMD_URL_FIND_ALL,
- &url_pos, NULL) && url_str != NULL) {
+ &url_pos, NULL) && url_str != nullptr) {
text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
@@ -197,4 +198,221 @@ html_url_is_phished(rspamd_mempool_t *pool,
return std::nullopt;
}
+/**
+ * Compares the text shown inside a link with the link target.
+ * Keeps a trimmed, normalised copy of the text in url->visible_part and, when
+ * html_url_is_phished() yields a URL for it, flags `url`, records a processing
+ * exception and adds that URL to `url_set` (khash_t(rspamd_url_hash) *).
+ */
+void
+html_check_displayed_url(rspamd_mempool_t *pool,
+		GList **exceptions,
+		void *url_set,
+		std::string_view visible_part,
+		goffset href_offset,
+		struct rspamd_url *url)
+{
+	struct rspamd_url *displayed_url = nullptr;
+	struct rspamd_url *turl;
+	struct rspamd_process_exception *ex;
+	gsize dlen;
+
+	if (visible_part.empty()) {
+		/* No displayed url, just some text within <a> tag */
+		return;
+	}
+
+	/* strlcpy size is the whole buffer incl. NUL; size() alone drops the last char */
+	url->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
+	rspamd_strlcpy(url->visible_part, visible_part.data(),
+			visible_part.size() + 1);
+	dlen = visible_part.size();
+
+	/* Strip unicode spaces from the start and the end */
+	url->visible_part = const_cast<char *>(
+			rspamd_string_unicode_trim_inplace(url->visible_part, &dlen));
+	auto maybe_url = html_url_is_phished(pool, url, {url->visible_part, dlen});
+
+	if (maybe_url) {
+		url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+		displayed_url = maybe_url.value();
+	}
+
+	if (exceptions && displayed_url != nullptr) {
+		ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
+		ex->pos = href_offset;
+		ex->len = dlen;
+		ex->type = RSPAMD_EXCEPTION_URL;
+		ex->ptr = url;
+
+		*exceptions = g_list_prepend(*exceptions, ex);
+	}
+
+	if (displayed_url && url_set) {
+		turl = rspamd_url_set_add_or_return((khash_t (rspamd_url_hash) *)url_set, displayed_url);
+
+		if (turl != nullptr) {
+			/* Here, we assume the following:
+			 * if we have a URL in the text part which
+			 * is the same as displayed URL in the
+			 * HTML part, we assume that it is also
+			 * hint only.
+			 */
+			if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+				turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+				turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+			}
+
+			turl->count++;
+		}
+		/* else: Already inserted by `rspamd_url_set_add_or_return` */
+	}
+
+	rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
+}
+
+auto
+html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+ -> std::optional<struct rspamd_url *>
+{
+ struct rspamd_url *url;
+ guint saved_flags = 0;
+ gint rc;
+ const gchar *s, *prefix = "http://";
+ gchar *d;
+ gsize dlen;
+ gboolean has_bad_chars = FALSE, no_prefix = FALSE;
+ static const gchar hexdigests[] = "0123456789abcdef";
+
+ auto sz = input.length();
+ const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
+ input = {trimmed, sz};
+
+ const auto *start = input.data();
+ s = start;
+ dlen = 0;
+
+ for (auto i = 0; i < sz; i++) {
+ if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+ dlen += 3;
+ }
+ else {
*** OUTPUT TRUNCATED, 164 LINES SKIPPED ***
More information about the Commits
mailing list