commit 40e894b: [Rework] Rework HTML content urls extraction
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Apr 2 10:14:03 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-04-02 11:07:53 +0100
URL: https://github.com/rspamd/rspamd/commit/40e894b9dfda24c8b454bf2365905d517e8e27a3 (HEAD -> master)
[Rework] Rework HTML content urls extraction
---
src/libmime/message.c | 10 +++++--
src/libserver/html.c | 6 ++--
src/libserver/url.c | 80 +++++++++++++++++++++++++++++----------------------
src/libserver/url.h | 59 ++++++++++++++++++++++---------------
src/lua/lua_url.c | 5 ++--
5 files changed, 96 insertions(+), 64 deletions(-)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index cca134f81..6825bc2f0 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -912,7 +912,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
rspamd_normalize_text_part (task, text_part);
if (!IS_PART_HTML (text_part)) {
- rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
+ rspamd_url_text_extract (task->task_pool, task, text_part,
+ RSPAMD_URL_FIND_ALL);
+ }
+ else {
+ rspamd_url_text_extract (task->task_pool, task, text_part,
+ RSPAMD_URL_FIND_STRICT);
}
if (text_part->exceptions) {
@@ -1231,7 +1236,8 @@ rspamd_message_parse (struct rspamd_task *task)
p = task->subject;
len = strlen (p);
rspamd_cryptobox_hash_update (&st, p, len);
- rspamd_url_find_multiple (task->task_pool, p, len, FALSE, NULL,
+ rspamd_url_find_multiple (task->task_pool, p, len,
+ RSPAMD_URL_FIND_STRICT, NULL,
rspamd_url_task_subject_callback, task);
}
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 6df545f00..41925609e 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -598,7 +598,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
}
if (end > url_text + 4 &&
- rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE,
+ rspamd_url_find (pool, url_text, end - url_text, &url_str,
+ RSPAMD_URL_FIND_ALL,
&url_pos, NULL) &&
url_str != NULL) {
if (url_pos > 0) {
@@ -1569,7 +1570,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
if (url->querylen > 0) {
- if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE,
+ if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
+ RSPAMD_URL_FIND_ALL,
NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (pool,
sizeof (struct rspamd_url));
diff --git a/src/libserver/url.c b/src/libserver/url.c
index f0f5bb21b..d774eb440 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -205,7 +205,7 @@ struct url_matcher static_matchers[] = {
{"sip:", "", url_web_start, url_web_end,
0, 0},
{"www.", "http://", url_web_start, url_web_end,
- 0, 0},
+ URL_FLAG_NOHTML, 0},
{"ftp.", "ftp://", url_web_start, url_web_end,
URL_FLAG_NOHTML, 0},
/* Likely emails */
@@ -218,7 +218,7 @@ struct url_callback_data {
gchar *url_str;
rspamd_mempool_t *pool;
gint len;
- gboolean is_html;
+ enum rspamd_url_find_type how;
gboolean prefix_added;
guint newline_idx;
GPtrArray *newlines;
@@ -2584,12 +2584,12 @@ rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos,
static gint
rspamd_url_trie_callback (struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context)
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
{
struct url_matcher *matcher;
url_match_t m;
@@ -2599,7 +2599,7 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
strnum);
- if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+ if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
/* Do not try to match non-html like urls in html texts */
return 0;
}
@@ -2669,9 +2669,12 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
}
gboolean
-rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
- gchar **url_str, gboolean is_html, goffset *url_pos,
- gboolean *prefix_added)
+rspamd_url_find (rspamd_mempool_t *pool,
+ const gchar *begin, gsize len,
+ gchar **url_str,
+ enum rspamd_url_find_type how,
+ goffset *url_pos,
+ gboolean *prefix_added)
{
struct url_callback_data cb;
gint ret;
@@ -2679,7 +2682,7 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
memset (&cb, 0, sizeof (cb));
cb.begin = begin;
cb.end = begin + len;
- cb.is_html = is_html;
+ cb.how = how;
cb.pool = pool;
ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
@@ -2706,13 +2709,13 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
static gint
rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context,
- gboolean multiple)
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context,
+ gboolean multiple)
{
struct rspamd_url *url;
struct url_matcher *matcher;
@@ -2726,7 +2729,7 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
strnum);
pool = cb->pool;
- if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+ if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
/* Do not try to match non-html like urls in html texts */
return 0;
}
@@ -2894,7 +2897,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, url->query, url->querylen,
- &url_str, IS_PART_HTML (cbd->part), NULL, &prefix_added)) {
+ &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
@@ -2938,9 +2941,9 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
void
rspamd_url_text_extract (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- gboolean is_html)
+ struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_url_find_type how)
{
struct rspamd_url_mimepart_cbdata mcbd;
@@ -2953,14 +2956,18 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
mcbd.part = part;
rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
- part->utf_stripped_content->len, is_html, part->newlines,
+ part->utf_stripped_content->len, how, part->newlines,
rspamd_url_text_part_callback, &mcbd);
}
void
-rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html, GPtrArray *nlines,
- url_insert_function func, gpointer ud)
+rspamd_url_find_multiple (rspamd_mempool_t *pool,
+ const gchar *in,
+ gsize inlen,
+ enum rspamd_url_find_type how,
+ GPtrArray *nlines,
+ url_insert_function func,
+ gpointer ud)
{
struct url_callback_data cb;
@@ -2973,7 +2980,7 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
memset (&cb, 0, sizeof (cb));
cb.begin = in;
cb.end = in + inlen;
- cb.is_html = is_html;
+ cb.how = how;
cb.pool = pool;
cb.funcd = ud;
@@ -2986,9 +2993,12 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
}
void
-rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html,
- url_insert_function func, gpointer ud)
+rspamd_url_find_single (rspamd_mempool_t *pool,
+ const gchar *in,
+ gsize inlen,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud)
{
struct url_callback_data cb;
@@ -3001,7 +3011,7 @@ rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
memset (&cb, 0, sizeof (cb));
cb.begin = in;
cb.end = in + inlen;
- cb.is_html = is_html;
+ cb.how = how;
cb.pool = pool;
cb.funcd = ud;
@@ -3049,7 +3059,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, url->query, url->querylen,
- &url_str, FALSE, NULL, &prefix_added)) {
+ &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 2cf80df4b..2243534dc 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -90,6 +90,17 @@ enum rspamd_url_protocol {
PROTOCOL_UNKNOWN = 1u << 31,
};
+enum rspamd_url_parse_flags {
+ RSPAMD_URL_PARSE_TEXT = 0,
+ RSPAMD_URL_PARSE_HREF = (1u << 0),
+ RSPAMD_URL_PARSE_CHECK = (1 << 1),
+};
+
+enum rspamd_url_find_type {
+ RSPAMD_URL_FIND_ALL = 0,
+ RSPAMD_URL_FIND_STRICT,
+};
+
/**
* Initialize url library
* @param cfg
@@ -104,15 +115,9 @@ void rspamd_url_deinit (void);
* @param is_html turn on html euristic
*/
void rspamd_url_text_extract (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- gboolean is_html);
-
-enum rspamd_url_parse_flags {
- RSPAMD_URL_PARSE_TEXT = 0,
- RSPAMD_URL_PARSE_HREF = (1u << 0),
- RSPAMD_URL_PARSE_CHECK = (1 << 1),
-};
+ struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_url_find_type how);
/*
* Parse a single url into an uri structure
@@ -136,9 +141,12 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
* @param url_str storage for url string(or NULL)
* @return TRUE if url is found in specified text
*/
-gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
- gchar **url_str, gboolean is_html, goffset *url_pos,
- gboolean *prefix_added);
+gboolean rspamd_url_find (rspamd_mempool_t *pool,
+ const gchar *begin, gsize len,
+ gchar **url_str,
+ enum rspamd_url_find_type how,
+ goffset *url_pos,
+ gboolean *prefix_added);
/*
* Return text representation of url parsing error
*/
@@ -166,9 +174,12 @@ typedef void (*url_insert_function) (struct rspamd_url *url,
* @param func
* @param ud
*/
-void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html, GPtrArray *nlines,
- url_insert_function func, gpointer ud);
+void rspamd_url_find_multiple (rspamd_mempool_t *pool,
+ const gchar *in, gsize inlen,
+ enum rspamd_url_find_type how,
+ GPtrArray *nlines,
+ url_insert_function func,
+ gpointer ud);
/**
* Search for a single url in text and call `func` for each url found
* @param pool
@@ -178,9 +189,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
* @param func
* @param ud
*/
-void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
- gsize inlen, gboolean is_html,
- url_insert_function func, gpointer ud);
+void rspamd_url_find_single (rspamd_mempool_t *pool,
+ const gchar *in, gsize inlen,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud);
/**
* Generic callback to insert URLs into rspamd_task
@@ -190,8 +203,8 @@ void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
* @param ud
*/
void rspamd_url_task_subject_callback (struct rspamd_url *url,
- gsize start_offset,
- gsize end_offset, gpointer ud);
+ gsize start_offset,
+ gsize end_offset, gpointer ud);
/**
* Adds a tag for url
@@ -200,8 +213,8 @@ void rspamd_url_task_subject_callback (struct rspamd_url *url,
* @param pool
*/
void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
- const gchar *value,
- rspamd_mempool_t *pool);
+ const gchar *value,
+ rspamd_mempool_t *pool);
guint rspamd_url_hash (gconstpointer u);
guint rspamd_email_hash (gconstpointer u);
@@ -232,7 +245,7 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
* @return
*/
const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
- rspamd_mempool_t *pool);
+ rspamd_mempool_t *pool);
/**
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
index 9bc984da3..a0f8c4648 100644
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -799,7 +799,7 @@ lua_url_create (lua_State *L)
return luaL_error (L, "invalid arguments");
}
else {
- rspamd_url_find_single (pool, text, length, FALSE,
+ rspamd_url_find_single (pool, text, length, RSPAMD_URL_FIND_ALL,
lua_url_single_inserter, L);
if (lua_type (L, -1) != LUA_TUSERDATA) {
@@ -867,7 +867,8 @@ lua_url_all (lua_State *L)
if (text != NULL) {
lua_newtable (L);
- rspamd_url_find_multiple (pool, text, length, FALSE, NULL,
+ rspamd_url_find_multiple (pool, text, length,
+ RSPAMD_URL_FIND_ALL, NULL,
lua_url_table_inserter, L);
}
More information about the Commits
mailing list