commit cd270c5: [Rework] Rework URL structure: host field
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Mar 9 10:49:09 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-03-06 12:01:37 +0000
URL: https://github.com/rspamd/rspamd/commit/cd270c51b2ccd814804e4f17eb31dc7d91a69980
[Rework] Rework URL structure: host field
---
src/libserver/html.c | 14 ++++-----
src/libserver/protocol.c | 11 ++++---
src/libserver/url.c | 78 +++++++++++++++++++++++++++---------------------
src/libserver/url.h | 8 +++--
src/lua/lua_url.c | 8 ++---
5 files changed, 68 insertions(+), 51 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index b7e78e57b..7dca72453 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -658,14 +658,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
if (rc == URI_ERRNO_OK) {
disp_tok.len = text_url->hostlen;
- disp_tok.begin = text_url->host;
+ disp_tok.begin = rspamd_url_host_unsafe (text_url);
#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (text_url->host,
+ if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url),
text_url->hostlen, "xn--", 4) != -1) {
idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
/* We need to convert it to the normal value first */
disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
- text_url->host, text_url->hostlen,
+ rspamd_url_host_unsafe (text_url), text_url->hostlen,
idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
if (uc_err != U_ZERO_ERROR) {
@@ -679,14 +679,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
}
#endif
href_tok.len = href_url->hostlen;
- href_tok.begin = href_url->host;
+ href_tok.begin = rspamd_url_host_unsafe (href_url);
#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless (href_url->host,
+ if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url),
href_url->hostlen, "xn--", 4) != -1) {
idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
/* We need to convert it to the normal value first */
href_tok.len = uidna_nameToUnicodeUTF8 (udn,
- href_url->host, href_url->hostlen,
+ rspamd_url_host_unsafe (href_url), href_url->hostlen,
idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
if (uc_err != U_ZERO_ERROR) {
@@ -1594,7 +1594,7 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
buf = rspamd_mempool_alloc (pool, len + 1);
rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
hc->base_url->protocollen, hc->base_url->string,
- hc->base_url->hostlen, hc->base_url->host,
+ hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
(gint)orig_len, start);
start = buf;
}
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 4c1a94d99..16dc05491 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -882,7 +882,7 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
ucl_object_insert_key (obj, elt, "tld", 0, false);
}
if (url->hostlen > 0) {
- elt = ucl_object_fromstring_common (url->host, url->hostlen, 0);
+ elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url), url->hostlen, 0);
ucl_object_insert_key (obj, elt, "host", 0, false);
}
@@ -925,11 +925,14 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
goffset err_offset;
- if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) {
- obj = ucl_object_fromstring_common (url->host, url->hostlen, 0);
+ if ((err_offset = rspamd_fast_utf8_validate (rspamd_url_host_unsafe (url),
+ url->hostlen)) == 0) {
+ obj = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+ url->hostlen, 0);
}
else {
- obj = ucl_object_fromstring_common (url->host, err_offset - 1, 0);
+ obj = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+ err_offset - 1, 0);
}
}
else {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index a2a9d852f..ac4c11916 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1526,12 +1526,12 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
pos = text + match_start;
p = pos - 1;
- start = url->host;
+ start = rspamd_url_host_unsafe (url);
if (*pos != '.' || match_pos != (gint) url->hostlen) {
/* Something weird has been found */
if (match_pos == (gint) url->hostlen - 1) {
- pos = url->host + match_pos;
+ pos = rspamd_url_host_unsafe (url) + match_pos;
if (*pos == '.') {
/* This is dot at the end of domain */
url->hostlen--;
@@ -1560,9 +1560,9 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
}
if ((ndots == 0 || p == start - 1) &&
- url->tldlen < url->host + url->hostlen - pos) {
+ url->tldlen < rspamd_url_host_unsafe (url) + url->hostlen - pos) {
url->tld = (gchar *) pos;
- url->tldlen = url->host + url->hostlen - pos;
+ url->tldlen = rspamd_url_host_unsafe (url) + url->hostlen - pos;
}
return 0;
@@ -1586,13 +1586,13 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
/* Allocate new string to build it from IP */
strbuf = rspamd_mempool_alloc (pool, slen + 1);
r += rspamd_snprintf (strbuf + r, slen - r, "%*s",
- (gint)(uri->host - uri->string),
+ (gint)(uri->hostshift),
uri->string);
- uri->host = strbuf + r;
+ uri->hostshift = r;
inet_ntop (af, addr, strbuf + r, slen - r + 1);
- uri->hostlen = strlen (uri->host);
+ uri->hostlen = strlen (rspamd_url_host_unsafe (uri));
r += uri->hostlen;
- uri->tld = uri->host;
+ uri->tld = rspamd_url_host_unsafe (uri);
uri->tldlen = uri->hostlen;
uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
@@ -1638,7 +1638,7 @@ rspamd_url_is_ip (struct rspamd_url *uri, rspamd_mempool_t *pool)
gboolean ret = FALSE, check_num = TRUE;
guint32 n, dots, t = 0, i = 0, shift, nshift;
- p = uri->host;
+ p = rspamd_url_host_unsafe (uri);
end = p + uri->hostlen;
if (*p == '[' && *(end - 1) == ']') {
@@ -1814,9 +1814,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
old_shift = uri->hostlen;
uri->hostlen -= shift;
- remain = (uri->urllen - (uri->host - uri->string)) - old_shift;
+ remain = (uri->urllen - (uri->hostshift)) - old_shift;
g_assert (remain >= 0);
- memmove (uri->host + uri->hostlen, uri->host + old_shift,
+ memmove (rspamd_url_host_unsafe (uri) + uri->hostlen,
+ rspamd_url_host_unsafe (uri) + old_shift,
remain);
uri->urllen -= shift;
uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED;
@@ -1877,7 +1878,7 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
uri->usershift -= shift;
}
if (uri->hostlen > 0) {
- uri->host -= shift;
+ uri->hostshift -= shift;
}
/* Go forward */
case UF_HOST:
@@ -1908,9 +1909,9 @@ rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
gint i = 0, w, orig_len;
UChar32 uc;
- t = uri->host;
+ t = rspamd_url_host_unsafe (uri);
h = t;
- end = uri->host + uri->hostlen;
+ end = t + uri->hostlen;
orig_len = uri->hostlen;
if (*h == '+') {
@@ -1931,7 +1932,7 @@ rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
h += i;
}
- uri->hostlen = t - uri->host;
+ uri->hostlen = t - rspamd_url_host_unsafe (uri);
uri->urllen -= (orig_len - uri->hostlen);
}
@@ -2022,7 +2023,7 @@ rspamd_url_parse (struct rspamd_url *uri,
uri->protocollen = u.field_data[i].len;
break;
case UF_HOST:
- uri->host = comp;
+ uri->hostshift = u.field_data[i].off;
uri->hostlen = complen;
break;
case UF_PATH:
@@ -2059,16 +2060,20 @@ rspamd_url_parse (struct rspamd_url *uri,
uri->string,
uri->protocollen);
rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
- unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+ unquoted_len = rspamd_url_decode (rspamd_url_host_unsafe (uri),
+ rspamd_url_host_unsafe (uri), uri->hostlen);
- if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+ if (rspamd_normalise_unicode_inplace (pool,
+ rspamd_url_host_unsafe (uri), &unquoted_len)) {
uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
}
if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
/* Ensure that hostname starts with something sane (exclude numeric urls) */
- if (!(is_domain_start (uri->host[0]) || uri->host[0] == ':')) {
+ const gchar* host = rspamd_url_host_unsafe (uri);
+
+ if (!(is_domain_start (host[0]) || host[0] == ':')) {
return URI_ERRNO_BAD_FORMAT;
}
}
@@ -2093,7 +2098,7 @@ rspamd_url_parse (struct rspamd_url *uri,
struct UConverter *utf8_conv = rspamd_get_utf8_converter ();
utf16_len = ucnv_toUChars (utf8_conv, utf16_hostname, uri->hostlen,
- uri->host, uri->hostlen, &uc_err);
+ rspamd_url_host_unsafe (uri), uri->hostlen, &uc_err);
if (!U_SUCCESS (uc_err)) {
@@ -2110,7 +2115,8 @@ rspamd_url_parse (struct rspamd_url *uri,
}
/* Convert back to utf8, sigh... */
- norm_utf8_len = ucnv_fromUChars (utf8_conv, uri->host, uri->hostlen,
+ norm_utf8_len = ucnv_fromUChars (utf8_conv,
+ rspamd_url_host_unsafe (uri), uri->hostlen,
norm_utf16, norm_utf16_len, &uc_err);
if (!U_SUCCESS (uc_err)) {
@@ -2154,7 +2160,7 @@ rspamd_url_parse (struct rspamd_url *uri,
}
rspamd_str_lc (uri->string, uri->protocollen);
- unquoted_len = rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+ unquoted_len = rspamd_str_lc_utf8 (rspamd_url_host_unsafe (uri), uri->hostlen);
rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->protocol == PROTOCOL_UNKNOWN) {
@@ -2172,7 +2178,7 @@ rspamd_url_parse (struct rspamd_url *uri,
if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
/* Find TLD part */
rspamd_multipattern_lookup (url_scanner->search_trie,
- uri->host, uri->hostlen,
+ rspamd_url_host_unsafe (uri), uri->hostlen,
rspamd_tld_trie_callback, uri, NULL);
if (uri->tldlen == 0) {
@@ -2184,7 +2190,7 @@ rspamd_url_parse (struct rspamd_url *uri,
} else {
if (!rspamd_url_is_ip (uri, pool)) {
/* Assume tld equal to host */
- uri->tld = uri->host;
+ uri->tld = rspamd_url_host_unsafe (uri);
uri->tldlen = uri->hostlen;
}
}
@@ -2194,7 +2200,8 @@ rspamd_url_parse (struct rspamd_url *uri,
if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_FTP) &&
uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) {
- gchar *pos = &uri->string[uri->protocollen], *host_start = uri->host;
+ gchar *pos = &uri->string[uri->protocollen],
+ *host_start = rspamd_url_host_unsafe (uri);
while (pos < host_start) {
if (*pos == '\\') {
@@ -2209,12 +2216,12 @@ rspamd_url_parse (struct rspamd_url *uri,
/* We need to normalise phone number: remove all spaces and braces */
rspamd_telephone_normalise_inplace (uri);
- if (uri->host[0] == '+') {
- uri->tld = uri->host + 1;
+ if (rspamd_url_host_unsafe (uri)[0] == '+') {
+ uri->tld = rspamd_url_host_unsafe (uri) + 1;
uri->tldlen = uri->hostlen - 1;
}
else {
- uri->tld = uri->host;
+ uri->tld = rspamd_url_host_unsafe (uri);
uri->tldlen = uri->hostlen;
}
}
@@ -3362,7 +3369,8 @@ rspamd_url_host_hash (gconstpointer u)
const struct rspamd_url *url = u;
if (url->hostlen > 0) {
- return (guint)rspamd_cryptobox_fast_hash (url->host, url->hostlen,
+ return (guint)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
+ url->hostlen,
rspamd_hash_seed ());
}
@@ -3378,7 +3386,7 @@ rspamd_email_hash (gconstpointer u)
rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
if (url->hostlen > 0) {
- rspamd_cryptobox_fast_hash_update (&st, url->host, url->hostlen);
+ rspamd_cryptobox_fast_hash_update (&st, rspamd_url_host_unsafe (url), url->hostlen);
}
if (url->userlen > 0) {
@@ -3399,7 +3407,8 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
return FALSE;
}
else {
- if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) {
+ if ((r = rspamd_lc_cmp (rspamd_url_host_unsafe (u1),
+ rspamd_url_host_unsafe (u2), u1->hostlen)) == 0) {
if (u1->userlen != u2->userlen || u1->userlen == 0) {
return FALSE;
}
@@ -3443,7 +3452,8 @@ rspamd_urls_host_cmp (gconstpointer a, gconstpointer b)
return FALSE;
}
else {
- r = memcmp (u1->host, u2->host, u1->hostlen);
+ r = memcmp (rspamd_url_host_unsafe (u1), rspamd_url_host_unsafe (u2),
+ u1->hostlen);
}
return r == 0;
@@ -3637,7 +3647,7 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
g_assert (pdlen != NULL && url != NULL && pool != NULL);
- CHECK_URL_COMPONENT ((guchar *)url->host, url->hostlen,
+ CHECK_URL_COMPONENT (rspamd_url_host_unsafe (url), url->hostlen,
RSPAMD_URL_FLAGS_HOSTSAFE);
CHECK_URL_COMPONENT (rspamd_url_user_unsafe(url), url->userlen,
RSPAMD_URL_FLAGS_USERSAFE);
@@ -3683,7 +3693,7 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
*d++ = ':';
}
- ENCODE_URL_COMPONENT ((guchar *)url->host, url->hostlen,
+ ENCODE_URL_COMPONENT (rspamd_url_host_unsafe (url), url->hostlen,
RSPAMD_URL_FLAGS_HOSTSAFE);
if (url->datalen > 0) {
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 78330d814..080f005c3 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -50,7 +50,9 @@ struct rspamd_url {
guint usershift;
guint userlen;
- gchar *host;
+ guint hostshift;
+ guint hostlen;
+
gchar *data;
gchar *query;
gchar *fragment;
@@ -60,7 +62,6 @@ struct rspamd_url {
struct rspamd_url *phished_url;
guint protocollen;
- guint hostlen;
guint datalen;
guint querylen;
guint fragmentlen;
@@ -75,6 +76,9 @@ struct rspamd_url {
#define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
#define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift)
+#define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
+#define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
+
enum uri_errno {
URI_ERRNO_OK = 0, /* Parsing went well */
URI_ERRNO_EMPTY, /* The URI string was empty */
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
index 7b0dee89b..bd94120e2 100644
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -158,8 +158,8 @@ lua_url_get_host (lua_State *L)
LUA_TRACE_POINT;
struct rspamd_lua_url *url = lua_check_url (L, 1);
- if (url != NULL) {
- lua_pushlstring (L, url->url->host, url->url->hostlen);
+ if (url != NULL && url->url && url->url->hostlen > 0) {
+ lua_pushlstring (L, rspamd_url_host (url->url), url->url->hostlen);
}
else {
lua_pushnil (L);
@@ -312,7 +312,7 @@ lua_url_tostring (lua_State *L)
}
tmp[url->url->userlen] = '@';
- memcpy (tmp + url->url->userlen + 1, url->url->host,
+ memcpy (tmp + url->url->userlen + 1, rspamd_url_host_unsafe (url->url),
url->url->hostlen);
lua_pushlstring (L, tmp, url->url->userlen + 1 + url->url->hostlen);
@@ -660,7 +660,7 @@ lua_url_to_table (lua_State *L)
if (u->hostlen > 0) {
lua_pushstring (L, "host");
- lua_pushlstring (L, u->host, u->hostlen);
+ lua_pushlstring (L, rspamd_url_host_unsafe (u), u->hostlen);
lua_settable (L, -3);
}
More information about the Commits mailing list