commit cd270c5: [Rework] Rework URL structure: host field

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Mar 9 10:49:09 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-03-06 12:01:37 +0000
URL: https://github.com/rspamd/rspamd/commit/cd270c51b2ccd814804e4f17eb31dc7d91a69980

[Rework] Rework URL structure: host field

---
 src/libserver/html.c     | 14 ++++-----
 src/libserver/protocol.c | 11 ++++---
 src/libserver/url.c      | 78 +++++++++++++++++++++++++++---------------------
 src/libserver/url.h      |  8 +++--
 src/lua/lua_url.c        |  8 ++---
 5 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index b7e78e57b..7dca72453 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -658,14 +658,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 
 		if (rc == URI_ERRNO_OK) {
 			disp_tok.len = text_url->hostlen;
-			disp_tok.begin = text_url->host;
+			disp_tok.begin = rspamd_url_host_unsafe (text_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-			if (rspamd_substring_search_caseless (text_url->host,
+			if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (text_url),
 					text_url->hostlen, "xn--", 4) != -1) {
 				idn_hbuf = rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
 				/* We need to convert it to the normal value first */
 				disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
-						text_url->host, text_url->hostlen,
+						rspamd_url_host_unsafe (text_url), text_url->hostlen,
 						idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
 
 				if (uc_err != U_ZERO_ERROR) {
@@ -679,14 +679,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 			}
 #endif
 			href_tok.len = href_url->hostlen;
-			href_tok.begin = href_url->host;
+			href_tok.begin = rspamd_url_host_unsafe (href_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-			if (rspamd_substring_search_caseless (href_url->host,
+			if (rspamd_substring_search_caseless (rspamd_url_host_unsafe (href_url),
 					href_url->hostlen, "xn--", 4) != -1) {
 				idn_hbuf = rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
 				/* We need to convert it to the normal value first */
 				href_tok.len = uidna_nameToUnicodeUTF8 (udn,
-						href_url->host, href_url->hostlen,
+						rspamd_url_host_unsafe (href_url), href_url->hostlen,
 						idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
 
 				if (uc_err != U_ZERO_ERROR) {
@@ -1594,7 +1594,7 @@ rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag,
 					buf = rspamd_mempool_alloc (pool, len + 1);
 					rspamd_snprintf (buf, len + 1, "%*s://%*s/%*s",
 							hc->base_url->protocollen, hc->base_url->string,
-							hc->base_url->hostlen, hc->base_url->host,
+							hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
 							(gint)orig_len, start);
 					start = buf;
 				}
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 4c1a94d99..16dc05491 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -882,7 +882,7 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
 		ucl_object_insert_key (obj, elt, "tld", 0, false);
 	}
 	if (url->hostlen > 0) {
-		elt = ucl_object_fromstring_common (url->host, url->hostlen, 0);
+		elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url), url->hostlen, 0);
 		ucl_object_insert_key (obj, elt, "host", 0, false);
 	}
 
@@ -925,11 +925,14 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 
 			goffset err_offset;
 
-			if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) {
-				obj = ucl_object_fromstring_common (url->host, url->hostlen, 0);
+			if ((err_offset = rspamd_fast_utf8_validate (rspamd_url_host_unsafe (url),
+					url->hostlen)) == 0) {
+				obj = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+						url->hostlen, 0);
 			}
 			else {
-				obj = ucl_object_fromstring_common (url->host, err_offset - 1, 0);
+				obj = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+						err_offset - 1, 0);
 			}
 		}
 		else {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index a2a9d852f..ac4c11916 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1526,12 +1526,12 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
 
 	pos = text + match_start;
 	p = pos - 1;
-	start = url->host;
+	start = rspamd_url_host_unsafe (url);
 
 	if (*pos != '.' || match_pos != (gint) url->hostlen) {
 		/* Something weird has been found */
 		if (match_pos == (gint) url->hostlen - 1) {
-			pos = url->host + match_pos;
+			pos = rspamd_url_host_unsafe (url) + match_pos;
 			if (*pos == '.') {
 				/* This is dot at the end of domain */
 				url->hostlen--;
@@ -1560,9 +1560,9 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
 	}
 
 	if ((ndots == 0 || p == start - 1) &&
-			url->tldlen < url->host + url->hostlen - pos) {
+			url->tldlen < rspamd_url_host_unsafe (url) + url->hostlen - pos) {
 		url->tld = (gchar *) pos;
-		url->tldlen = url->host + url->hostlen - pos;
+		url->tldlen = rspamd_url_host_unsafe (url) + url->hostlen - pos;
 	}
 
 	return 0;
@@ -1586,13 +1586,13 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
 	/* Allocate new string to build it from IP */
 	strbuf = rspamd_mempool_alloc (pool, slen + 1);
 	r += rspamd_snprintf (strbuf + r, slen - r, "%*s",
-			(gint)(uri->host - uri->string),
+			(gint)(uri->hostshift),
 			uri->string);
-	uri->host = strbuf + r;
+	uri->hostshift = r;
 	inet_ntop (af, addr, strbuf + r, slen - r + 1);
-	uri->hostlen = strlen (uri->host);
+	uri->hostlen = strlen (rspamd_url_host_unsafe (uri));
 	r += uri->hostlen;
-	uri->tld = uri->host;
+	uri->tld = rspamd_url_host_unsafe (uri);
 	uri->tldlen = uri->hostlen;
 	uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
 
@@ -1638,7 +1638,7 @@ rspamd_url_is_ip (struct rspamd_url *uri, rspamd_mempool_t *pool)
 	gboolean ret = FALSE, check_num = TRUE;
 	guint32 n, dots, t = 0, i = 0, shift, nshift;
 
-	p = uri->host;
+	p = rspamd_url_host_unsafe (uri);
 	end = p + uri->hostlen;
 
 	if (*p == '[' && *(end - 1) == ']') {
@@ -1814,9 +1814,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 
 		old_shift = uri->hostlen;
 		uri->hostlen -= shift;
-		remain = (uri->urllen - (uri->host - uri->string)) - old_shift;
+		remain = (uri->urllen - (uri->hostshift)) - old_shift;
 		g_assert (remain >= 0);
-		memmove (uri->host + uri->hostlen, uri->host + old_shift,
+		memmove (rspamd_url_host_unsafe (uri) + uri->hostlen,
+				rspamd_url_host_unsafe (uri) + old_shift,
 				remain);
 		uri->urllen -= shift;
 		uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED;
@@ -1877,7 +1878,7 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 			uri->usershift -= shift;
 		}
 		if (uri->hostlen > 0) {
-			uri->host -= shift;
+			uri->hostshift -= shift;
 		}
 		/* Go forward */
 	case UF_HOST:
@@ -1908,9 +1909,9 @@ rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
 	gint i = 0, w, orig_len;
 	UChar32 uc;
 
-	t = uri->host;
+	t = rspamd_url_host_unsafe (uri);
 	h = t;
-	end = uri->host + uri->hostlen;
+	end = t + uri->hostlen;
 	orig_len = uri->hostlen;
 
 	if (*h == '+') {
@@ -1931,7 +1932,7 @@ rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
 		h += i;
 	}
 
-	uri->hostlen = t - uri->host;
+	uri->hostlen = t - rspamd_url_host_unsafe (uri);
 	uri->urllen -= (orig_len - uri->hostlen);
 }
 
@@ -2022,7 +2023,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 				uri->protocollen = u.field_data[i].len;
 				break;
 			case UF_HOST:
-				uri->host = comp;
+				uri->hostshift = u.field_data[i].off;
 				uri->hostlen = complen;
 				break;
 			case UF_PATH:
@@ -2059,16 +2060,20 @@ rspamd_url_parse (struct rspamd_url *uri,
 			uri->string,
 			uri->protocollen);
 	rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
-	unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+	unquoted_len = rspamd_url_decode (rspamd_url_host_unsafe (uri),
+			rspamd_url_host_unsafe (uri), uri->hostlen);
 
-	if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+	if (rspamd_normalise_unicode_inplace (pool,
+			rspamd_url_host_unsafe (uri), &unquoted_len)) {
 		uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
 	}
 
 
 	if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
 		/* Ensure that hostname starts with something sane (exclude numeric urls) */
-		if (!(is_domain_start (uri->host[0]) || uri->host[0] == ':')) {
+		const gchar* host = rspamd_url_host_unsafe (uri);
+
+		if (!(is_domain_start (host[0]) || host[0] == ':')) {
 			return URI_ERRNO_BAD_FORMAT;
 		}
 	}
@@ -2093,7 +2098,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 	struct UConverter *utf8_conv = rspamd_get_utf8_converter ();
 
 	utf16_len = ucnv_toUChars (utf8_conv, utf16_hostname, uri->hostlen,
-			uri->host, uri->hostlen, &uc_err);
+			rspamd_url_host_unsafe (uri), uri->hostlen, &uc_err);
 
 	if (!U_SUCCESS (uc_err)) {
 
@@ -2110,7 +2115,8 @@ rspamd_url_parse (struct rspamd_url *uri,
 	}
 
 	/* Convert back to utf8, sigh... */
-	norm_utf8_len = ucnv_fromUChars (utf8_conv, uri->host, uri->hostlen,
+	norm_utf8_len = ucnv_fromUChars (utf8_conv,
+			rspamd_url_host_unsafe (uri), uri->hostlen,
 			norm_utf16, norm_utf16_len, &uc_err);
 
 	if (!U_SUCCESS (uc_err)) {
@@ -2154,7 +2160,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 	}
 
 	rspamd_str_lc (uri->string, uri->protocollen);
-	unquoted_len = rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+	unquoted_len = rspamd_str_lc_utf8 (rspamd_url_host_unsafe (uri), uri->hostlen);
 	rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
 	if (uri->protocol == PROTOCOL_UNKNOWN) {
@@ -2172,7 +2178,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 	if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP|PROTOCOL_FILE)) {
 		/* Find TLD part */
 		rspamd_multipattern_lookup (url_scanner->search_trie,
-				uri->host, uri->hostlen,
+				rspamd_url_host_unsafe (uri), uri->hostlen,
 				rspamd_tld_trie_callback, uri, NULL);
 
 		if (uri->tldlen == 0) {
@@ -2184,7 +2190,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 			} else {
 				if (!rspamd_url_is_ip (uri, pool)) {
 					/* Assume tld equal to host */
-					uri->tld = uri->host;
+					uri->tld = rspamd_url_host_unsafe (uri);
 					uri->tldlen = uri->hostlen;
 				}
 			}
@@ -2194,7 +2200,8 @@ rspamd_url_parse (struct rspamd_url *uri,
 		if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_FTP) &&
 			uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) {
 
-			gchar *pos = &uri->string[uri->protocollen], *host_start = uri->host;
+			gchar *pos = &uri->string[uri->protocollen],
+					*host_start = rspamd_url_host_unsafe (uri);
 
 			while (pos < host_start) {
 				if (*pos == '\\') {
@@ -2209,12 +2216,12 @@ rspamd_url_parse (struct rspamd_url *uri,
 		/* We need to normalise phone number: remove all spaces and braces */
 		rspamd_telephone_normalise_inplace (uri);
 
-		if (uri->host[0] == '+') {
-			uri->tld = uri->host + 1;
+		if (rspamd_url_host_unsafe (uri)[0] == '+') {
+			uri->tld = rspamd_url_host_unsafe (uri) + 1;
 			uri->tldlen = uri->hostlen - 1;
 		}
 		else {
-			uri->tld = uri->host;
+			uri->tld = rspamd_url_host_unsafe (uri);
 			uri->tldlen = uri->hostlen;
 		}
 	}
@@ -3362,7 +3369,8 @@ rspamd_url_host_hash (gconstpointer u)
 	const struct rspamd_url *url = u;
 
 	if (url->hostlen > 0) {
-		return (guint)rspamd_cryptobox_fast_hash (url->host, url->hostlen,
+		return (guint)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
+				url->hostlen,
 				rspamd_hash_seed ());
 	}
 
@@ -3378,7 +3386,7 @@ rspamd_email_hash (gconstpointer u)
 	rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());
 
 	if (url->hostlen > 0) {
-		rspamd_cryptobox_fast_hash_update (&st, url->host, url->hostlen);
+		rspamd_cryptobox_fast_hash_update (&st, rspamd_url_host_unsafe (url), url->hostlen);
 	}
 
 	if (url->userlen > 0) {
@@ -3399,7 +3407,8 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
 		return FALSE;
 	}
 	else {
-		if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) {
+		if ((r = rspamd_lc_cmp (rspamd_url_host_unsafe (u1),
+				rspamd_url_host_unsafe (u2), u1->hostlen)) == 0) {
 			if (u1->userlen != u2->userlen || u1->userlen == 0) {
 				return FALSE;
 			}
@@ -3443,7 +3452,8 @@ rspamd_urls_host_cmp (gconstpointer a, gconstpointer b)
 		return FALSE;
 	}
 	else {
-		r = memcmp (u1->host, u2->host, u1->hostlen);
+		r = memcmp (rspamd_url_host_unsafe (u1), rspamd_url_host_unsafe (u2),
+				u1->hostlen);
 	}
 
 	return r == 0;
@@ -3637,7 +3647,7 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
 
 	g_assert (pdlen != NULL && url != NULL && pool != NULL);
 
-	CHECK_URL_COMPONENT ((guchar *)url->host, url->hostlen,
+	CHECK_URL_COMPONENT (rspamd_url_host_unsafe (url), url->hostlen,
 			RSPAMD_URL_FLAGS_HOSTSAFE);
 	CHECK_URL_COMPONENT (rspamd_url_user_unsafe(url), url->userlen,
 			RSPAMD_URL_FLAGS_USERSAFE);
@@ -3683,7 +3693,7 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
 		*d++ = ':';
 	}
 
-	ENCODE_URL_COMPONENT ((guchar *)url->host, url->hostlen,
+	ENCODE_URL_COMPONENT (rspamd_url_host_unsafe (url), url->hostlen,
 			RSPAMD_URL_FLAGS_HOSTSAFE);
 
 	if (url->datalen > 0) {
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 78330d814..080f005c3 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -50,7 +50,9 @@ struct rspamd_url {
 	guint usershift;
 	guint userlen;
 
-	gchar *host;
+	guint hostshift;
+	guint hostlen;
+
 	gchar *data;
 	gchar *query;
 	gchar *fragment;
@@ -60,7 +62,6 @@ struct rspamd_url {
 	struct rspamd_url *phished_url;
 
 	guint protocollen;
-	guint hostlen;
 	guint datalen;
 	guint querylen;
 	guint fragmentlen;
@@ -75,6 +76,9 @@ struct rspamd_url {
 #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
 #define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift)
 
+#define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
+#define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
+
 enum uri_errno {
 	URI_ERRNO_OK = 0,           /* Parsing went well */
 	URI_ERRNO_EMPTY,        /* The URI string was empty */
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
index 7b0dee89b..bd94120e2 100644
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -158,8 +158,8 @@ lua_url_get_host (lua_State *L)
 	LUA_TRACE_POINT;
 	struct rspamd_lua_url *url = lua_check_url (L, 1);
 
-	if (url != NULL) {
-		lua_pushlstring (L, url->url->host, url->url->hostlen);
+	if (url != NULL && url->url && url->url->hostlen > 0) {
+		lua_pushlstring (L, rspamd_url_host (url->url), url->url->hostlen);
 	}
 	else {
 		lua_pushnil (L);
@@ -312,7 +312,7 @@ lua_url_tostring (lua_State *L)
 			}
 
 			tmp[url->url->userlen] = '@';
-			memcpy (tmp + url->url->userlen + 1, url->url->host,
+			memcpy (tmp + url->url->userlen + 1, rspamd_url_host_unsafe (url->url),
 					url->url->hostlen);
 
 			lua_pushlstring (L, tmp, url->url->userlen + 1 + url->url->hostlen);
@@ -660,7 +660,7 @@ lua_url_to_table (lua_State *L)
 
 		if (u->hostlen > 0) {
 			lua_pushstring (L, "host");
-			lua_pushlstring (L, u->host, u->hostlen);
+			lua_pushlstring (L, rspamd_url_host_unsafe (u), u->hostlen);
 			lua_settable (L, -3);
 		}
 


More information about the Commits mailing list