commit 0f0717e: [Fix] Core: Implement logic to find some bad characters in URLs
Vsevolod Stakhov
vsevolod at highsecure.ru
Wed Jan 16 15:07:07 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-01-16 15:04:50 +0000
URL: https://github.com/rspamd/rspamd/commit/0f0717ee7ad5ee93f2ecfb24e8f57fbb42e8feca (HEAD -> master)
[Fix] Core: Implement logic to find some bad characters in URLs
---
src/libserver/html.c | 65 ++++++++++++++++++++++++++++++++----
src/libserver/url.c | 93 ++++++++++++++++++++++++++++++++++++----------------
src/libserver/url.h | 13 ++++++--
src/plugins/surbl.c | 6 ++--
4 files changed, 136 insertions(+), 41 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index cbc0fe7da..e97a010fe 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -571,7 +571,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
}
}
text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);
+ rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK) {
disp_tok.len = text_url->hostlen;
@@ -991,23 +992,61 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
state = ignore_bad_tag;
}
else {
+ const guchar *attr_name_end = in;
+
if (*in == '=') {
state = parse_equal;
}
+ else if (*in == '"') {
+ /* No equal or something sane but we have quote character */
+ state = parse_start_dquote;
+ attr_name_end = in - 1;
+
+ while (attr_name_end > *savep) {
+ if (!g_ascii_isalnum (*attr_name_end)) {
+ attr_name_end --;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* One character forward to obtain length */
+ attr_name_end ++;
+ }
else if (g_ascii_isspace (*in)) {
state = spaces_before_eq;
}
else if (*in == '/') {
tag->flags |= FL_CLOSED;
}
+ else if (!g_ascii_isgraph (*in)) {
+ state = parse_value;
+ attr_name_end = in - 1;
+
+ while (attr_name_end > *savep) {
+ if (!g_ascii_isalnum (*attr_name_end)) {
+ attr_name_end --;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* One character forward to obtain length */
+ attr_name_end ++;
+ }
else {
return;
}
- if (!rspamd_html_parse_tag_component (pool, *savep, in, tag)) {
+ if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
/* Ignore unknown params */
*savep = NULL;
}
+ else if (state == parse_value) {
+ *savep = in + 1;
+ }
}
break;
@@ -1153,7 +1192,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
tag->flags |= FL_CLOSED;
store = TRUE;
}
- else if (g_ascii_isspace (*in) || *in == '>') {
+ else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
store = TRUE;
state = spaces_after_param;
}
@@ -1210,6 +1249,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
struct html_tag_component *comp)
{
struct rspamd_url *url;
+ guint saved_flags = 0;
gchar *decoded;
gint rc;
gsize decoded_len;
@@ -1301,13 +1341,23 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
url = rspamd_mempool_alloc0 (pool, sizeof (*url));
- if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
- url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ enum rspamd_normalise_result norm_res;
+
+ norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
+
+ if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
+ saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
}
- rc = rspamd_url_parse (url, decoded, dlen, pool);
+ if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
+ saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
+ }
+
+ rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
if (rc == URI_ERRNO_OK) {
+ url->flags |= saved_flags;
+
if (has_bad_chars) {
url->flags |= RSPAMD_URL_FLAG_OBSCURED;
}
@@ -1439,7 +1489,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
rc = rspamd_url_parse (query_url,
url_str,
strlen (url_str),
- pool);
+ pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK &&
query_url->hostlen > 0) {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index e27a2c39b..3a08ec748 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -564,8 +564,10 @@ is_url_end (gchar c)
}
static gint
-rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
- gchar const **end, gboolean strict, guint *flags)
+rspamd_mailto_parse (struct http_parser_url *u,
+ const gchar *str, gsize len,
+ gchar const **end,
+ enum rspamd_url_parse_flags parse_flags, guint *flags)
{
const gchar *p = str, *c = str, *last = str + len;
gchar t;
@@ -711,7 +713,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
*end = p;
}
- if (!strict) {
+ if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
return 0;
}
@@ -720,7 +722,9 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
static gint
rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
- gchar const **end, gboolean strict, guint *flags)
+ gchar const **end,
+ enum rspamd_url_parse_flags parse_flags,
+ guint *flags)
{
const gchar *p = str, *c = str, *last = str + len, *slash = NULL,
*password_start = NULL, *user_start = NULL;
@@ -763,7 +767,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
SET_U (u, UF_SCHEMA);
}
else if (!g_ascii_isalnum (t) && t != '+' && t != '-') {
- if (!strict && p > c) {
+ if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) {
/* We might have some domain, but no protocol */
st = parse_domain;
p = c;
@@ -985,7 +989,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
}
else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
if (*p & 0x80) {
- *flags |= RSPAMD_URL_FLAG_IDN;
+ (*flags) |= RSPAMD_URL_FLAG_IDN;
guint i = 0;
U8_NEXT (p, i, last - p, uc);
@@ -997,11 +1001,16 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
if (!u_isalnum (uc)) {
/* Bad symbol */
- if (strict) {
- goto out;
+ if (IS_ZERO_WIDTH_SPACE (uc)) {
+ (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
}
else {
- goto set;
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
}
}
@@ -1011,11 +1020,18 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
p ++;
}
else {
- if (strict) {
- goto out;
+ if (parse_flags & RSPAMD_URL_PARSE_HREF) {
+ /* We have to use all shit we are given here */
+ p ++;
+ (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
}
else {
- goto set;
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+ goto out;
+ }
+ else {
+ goto set;
+ }
}
}
}
@@ -1117,7 +1133,8 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
goto set;
}
else if (!g_ascii_isdigit (t)) {
- if (strict || !g_ascii_isspace (t)) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) ||
+ !g_ascii_isspace (t)) {
goto out;
}
else {
@@ -1148,7 +1165,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
goto set;
}
else if (is_lwsp (t)) {
- if (strict) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
if (g_ascii_isspace (t)) {
goto set;
}
@@ -1172,7 +1189,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
goto set;
}
else if (is_lwsp (t)) {
- if (strict) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
if (g_ascii_isspace (t)) {
goto set;
}
@@ -1189,7 +1206,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
goto set;
}
else if (is_lwsp (t)) {
- if (strict) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
if (g_ascii_isspace (t)) {
goto set;
}
@@ -1602,8 +1619,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
}
enum uri_errno
-rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
- rspamd_mempool_t *pool)
+rspamd_url_parse (struct rspamd_url *uri,
+ gchar *uristring, gsize len,
+ rspamd_mempool_t *pool,
+ enum rspamd_url_parse_flags parse_flags)
{
struct http_parser_url u;
gchar *p, *comp;
@@ -1624,14 +1643,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
if (len > sizeof ("mailto:") - 1) {
/* For mailto: urls we also need to add slashes to make it a valid URL */
if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
- ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags);
+ ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags,
+ &flags);
}
else {
- ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
+ ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags,
+ &flags);
}
}
else {
- ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
+ ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags, &flags);
}
if (ret != 0) {
@@ -1715,9 +1736,11 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
uri->protocollen);
rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+
if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
}
+
rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->datalen) {
@@ -1730,6 +1753,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
rspamd_url_shift (uri, unquoted_len, UF_PATH);
}
+
if (uri->querylen) {
unquoted_len = rspamd_url_decode (uri->query,
uri->query,
@@ -1739,6 +1763,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
}
rspamd_url_shift (uri, unquoted_len, UF_QUERY);
}
+
if (uri->fragmentlen) {
unquoted_len = rspamd_url_decode (uri->fragment,
uri->fragment,
@@ -1769,7 +1794,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
uri->host, uri->hostlen,
rspamd_tld_trie_callback, uri, NULL);
- if (uri->tldlen == 0) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && uri->tldlen == 0) {
/* Ignore URL's without TLD if it is not a numeric URL */
if (!rspamd_url_is_ip (uri, pool)) {
return URI_ERRNO_TLD_MISSING;
@@ -1777,7 +1802,13 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
}
if (uri->protocol == PROTOCOL_UNKNOWN) {
- return URI_ERRNO_INVALID_PROTOCOL;
+ if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+ return URI_ERRNO_INVALID_PROTOCOL;
+ }
+ else {
+ /* Hack, hack, hack */
+ uri->protocol = PROTOCOL_HTTP;
+ }
}
return URI_ERRNO_OK;
@@ -2089,7 +2120,8 @@ url_web_end (struct url_callback_data *cb,
len = MIN (len, match->newline_pos - pos);
}
- if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) {
+ if (rspamd_web_parse (NULL, pos, len, &last,
+ RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
return FALSE;
}
@@ -2157,7 +2189,8 @@ url_email_end (struct url_callback_data *cb,
if (!match->prefix || match->prefix[0] == '\0') {
/* We have mailto:// at the beginning */
- if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) {
+ if (rspamd_mailto_parse (&u, pos, len, &last,
+ RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
return FALSE;
}
@@ -2470,7 +2503,9 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
cb->fin = m.m_begin + m.m_len;
url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
g_strstrip (cb->url_str);
- rc = rspamd_url_parse (url, cb->url_str, strlen (cb->url_str), pool);
+ rc = rspamd_url_parse (url, cb->url_str,
+ strlen (cb->url_str), pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK && url->hostlen > 0) {
if (cb->prefix_added) {
@@ -2583,7 +2618,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
rc = rspamd_url_parse (query_url,
url_str,
strlen (url_str),
- task->task_pool);
+ task->task_pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK &&
query_url->hostlen > 0) {
@@ -2737,7 +2773,8 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
rc = rspamd_url_parse (query_url,
url_str,
strlen (url_str),
- task->task_pool);
+ task->task_pool,
+ RSPAMD_URL_PARSE_TEXT);
if (rc == URI_ERRNO_OK &&
url->hostlen > 0) {
diff --git a/src/libserver/url.h b/src/libserver/url.h
index b0cc10239..a9eda71de 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -104,6 +104,12 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool,
struct rspamd_mime_text_part *part,
gboolean is_html);
+enum rspamd_url_parse_flags {
+ RSPAMD_URL_PARSE_TEXT = 0,
+ RSPAMD_URL_PARSE_HREF = (1u << 0),
+ RSPAMD_URL_PARSE_CHECK = (1 << 1),
+};
+
/*
* Parse a single url into an uri structure
* @param pool memory pool
@@ -111,9 +117,10 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool,
* @param uri url object, must be pre allocated
*/
enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
- gchar *uristring,
- gsize len,
- rspamd_mempool_t *pool);
+ gchar *uristring,
+ gsize len,
+ rspamd_mempool_t *pool,
+ enum rspamd_url_parse_flags flags);
/*
* Try to extract url from a text
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index 4bc17db20..ab9e5bb47 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -1660,7 +1660,7 @@ surbl_redirector_finish (struct rspamd_http_connection *conn,
sizeof (*redirected_url));
rspamd_strlcpy (urlstr, hdr->begin, urllen + 1);
r = rspamd_url_parse (redirected_url, urlstr, urllen,
- task->task_pool);
+ task->task_pool, RSPAMD_URL_PARSE_TEXT);
if (r == URI_ERRNO_OK) {
if ((existing = g_hash_table_lookup (task->urls, redirected_url)) == NULL) {
@@ -2120,7 +2120,7 @@ surbl_is_redirector_handler (lua_State *L)
url_cpy = rspamd_mempool_alloc (task->task_pool, len);
memcpy (url_cpy, url, len);
- if (rspamd_url_parse (&uri, url_cpy, len, task->task_pool)) {
+ if (rspamd_url_parse (&uri, url_cpy, len, task->task_pool, RSPAMD_URL_PARSE_TEXT)) {
msg_debug_surbl ("check url redirection %*s", uri.urllen,
uri.string);
@@ -2198,7 +2198,7 @@ surbl_continue_process_handler (lua_State *L)
sizeof (*redirected_url));
rspamd_strlcpy (urlstr, nurl, urllen + 1);
r = rspamd_url_parse (redirected_url, urlstr, urllen,
- task->task_pool);
+ task->task_pool, RSPAMD_URL_PARSE_TEXT);
if (r == URI_ERRNO_OK) {
if (!g_hash_table_lookup (task->urls, redirected_url)) {
More information about the Commits mailing list