commit a8f11fa: [Rework] Rework URL structure: adjust tld part

Vsevolod Stakhov <vsevolod at highsecure.ru>
Mon Mar 9 10:49:11 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-03-06 14:03:20 +0000
URL: https://github.com/rspamd/rspamd/commit/a8f11faf7f584916078d6fadb36e0c2f1984e2b0

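[Rework] Rework URL structure: adjust tld part

Replace the `tld` pointer in `struct rspamd_url` with a `tldshift` offset
into `string`, read through the new `rspamd_url_tld_unsafe()` macro, and
regroup the remaining struct fields; all users of `url->tld` are updated.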

---
 src/libserver/html.c                | 12 ++++++------
 src/libserver/protocol.c            |  6 ++++--
 src/libserver/url.c                 | 10 +++++-----
 src/libserver/url.h                 | 19 +++++++++----------
 src/libstat/tokenizers/tokenizers.c |  2 +-
 src/lua/lua_url.c                   |  4 ++--
 6 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index e1a211d2c..981141ad8 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -704,14 +704,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 
 				/* Apply the same logic for TLD */
 				disp_tok.len = text_url->tldlen;
-				disp_tok.begin = text_url->tld;
+				disp_tok.begin = rspamd_url_tld_unsafe (text_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-				if (rspamd_substring_search_caseless (text_url->tld,
+				if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (text_url),
 						text_url->tldlen, "xn--", 4) != -1) {
 					idn_hbuf = rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
 					/* We need to convert it to the normal value first */
 					disp_tok.len = uidna_nameToUnicodeUTF8 (udn,
-							text_url->tld, text_url->tldlen,
+							rspamd_url_tld_unsafe (text_url), text_url->tldlen,
 							idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
 
 					if (uc_err != U_ZERO_ERROR) {
@@ -725,14 +725,14 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 				}
 #endif
 				href_tok.len = href_url->tldlen;
-				href_tok.begin = href_url->tld;
+				href_tok.begin = rspamd_url_tld_unsafe (href_url);
 #if U_ICU_VERSION_MAJOR_NUM >= 46
-				if (rspamd_substring_search_caseless (href_url->tld,
+				if (rspamd_substring_search_caseless (rspamd_url_tld_unsafe (href_url),
 						href_url->tldlen, "xn--", 4) != -1) {
 					idn_hbuf = rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
 					/* We need to convert it to the normal value first */
 					href_tok.len = uidna_nameToUnicodeUTF8 (udn,
-							href_url->tld, href_url->tldlen,
+							rspamd_url_tld_unsafe (href_url), href_url->tldlen,
 							idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
 
 					if (uc_err != U_ZERO_ERROR) {
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 16dc05491..739d3b950 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -878,11 +878,13 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
 	ucl_object_insert_key (obj, elt, "url", 0, false);
 
 	if (url->tldlen > 0) {
-		elt = ucl_object_fromstring_common (url->tld, url->tldlen, 0);
+		elt = ucl_object_fromstring_common (rspamd_url_tld_unsafe (url),
+				url->tldlen, 0);
 		ucl_object_insert_key (obj, elt, "tld", 0, false);
 	}
 	if (url->hostlen > 0) {
-		elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url), url->hostlen, 0);
+		elt = ucl_object_fromstring_common (rspamd_url_host_unsafe (url),
+				url->hostlen, 0);
 		ucl_object_insert_key (obj, elt, "host", 0, false);
 	}
 
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 7e85a460e..043f523f0 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1561,7 +1561,7 @@ rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
 
 	if ((ndots == 0 || p == start - 1) &&
 			url->tldlen < rspamd_url_host_unsafe (url) + url->hostlen - pos) {
-		url->tld = (gchar *) pos;
+		url->tldshift = (pos - url->string);
 		url->tldlen = rspamd_url_host_unsafe (url) + url->hostlen - pos;
 	}
 
@@ -1590,11 +1590,11 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a
 			(gint)(uri->hostshift),
 			uri->string);
 	uri->hostshift = r;
+	uri->tldshift = r;
 	start_offset = strbuf + r;
 	inet_ntop (af, addr, strbuf + r, slen - r + 1);
 	uri->hostlen = strlen (start_offset);
 	r += uri->hostlen;
-	uri->tld = (const gchar *)start_offset;
 	uri->tldlen = uri->hostlen;
 	uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
 
@@ -2214,7 +2214,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 			} else {
 				if (!rspamd_url_is_ip (uri, pool)) {
 					/* Assume tld equal to host */
-					uri->tld = rspamd_url_host_unsafe (uri);
+					uri->tldshift = uri->hostshift;
 					uri->tldlen = uri->hostlen;
 				}
 			}
@@ -2241,11 +2241,11 @@ rspamd_url_parse (struct rspamd_url *uri,
 		rspamd_telephone_normalise_inplace (uri);
 
 		if (rspamd_url_host_unsafe (uri)[0] == '+') {
-			uri->tld = rspamd_url_host_unsafe (uri) + 1;
+			uri->tldshift = uri->hostshift + 1;
 			uri->tldlen = uri->hostlen - 1;
 		}
 		else {
-			uri->tld = rspamd_url_host_unsafe (uri);
+			uri->tldshift = uri->hostshift;
 			uri->tldlen = uri->hostlen;
 		}
 	}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 87766c4e6..00f09ac30 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -53,15 +53,7 @@ struct rspamd_url {
 	guint datashift;
 	guint queryshift;
 	guint fragmentshift;
-
-	gchar *tld;
-	gchar *visible_part;
-
-	struct rspamd_url *phished_url;
-
-	guint urllen;
-	guint rawlen;
-	guint32 flags;
+	guint tldshift;
 
 	guint16 protocollen;
 	guint16 userlen;
@@ -70,8 +62,14 @@ struct rspamd_url {
 	guint16 querylen;
 	guint16 fragmentlen;
 	guint16 tldlen;
-
 	guint16 count;
+
+	guint urllen;
+	guint rawlen;
+	guint32 flags;
+
+	gchar *visible_part;
+	struct rspamd_url *phished_url;
 };
 
 #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
@@ -79,6 +77,7 @@ struct rspamd_url {
 
 #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL)
 #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift)
+#define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift)
 
 #define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift)
 #define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift)
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 77a924f41..9f1b14daf 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -262,7 +262,7 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
 		uri = ex->ptr;
 
 		if (uri && uri->tldlen > 0) {
-			token.original.begin = uri->tld;
+			token.original.begin = rspamd_url_tld_unsafe (uri);
 			token.original.len = uri->tldlen;
 
 		}
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
index cb54a694c..efd34dc6c 100644
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -560,7 +560,7 @@ lua_url_get_tld (lua_State *L)
 	struct rspamd_lua_url *url = lua_check_url (L, 1);
 
 	if (url != NULL && url->url->tldlen > 0) {
-		lua_pushlstring (L, url->url->tld, url->url->tldlen);
+		lua_pushlstring (L, rspamd_url_tld_unsafe (url->url), url->url->tldlen);
 	}
 	else {
 		lua_pushnil (L);
@@ -672,7 +672,7 @@ lua_url_to_table (lua_State *L)
 
 		if (u->tldlen > 0) {
 			lua_pushstring (L, "tld");
-			lua_pushlstring (L, u->tld, u->tldlen);
+			lua_pushlstring (L, rspamd_url_tld_unsafe (u), u->tldlen);
 			lua_settable (L, -3);
 		}
 


More information about the Commits mailing list