commit 938ac78: [Rework] Rework telephone urls parsing logic

Vsevolod Stakhov vsevolod at highsecure.ru
Fri Mar 1 18:35:03 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-03-01 18:30:33 +0000
URL: https://github.com/rspamd/rspamd/commit/938ac78ad1099c5c216c2cec7ce6c55310744f1a (HEAD -> master)

[Rework] Rework telephone urls parsing logic

---
 src/libserver/html.c |   3 +-
 src/libserver/url.c  | 335 +++++++++++++++++++++++++++++++++++----------------
 src/libserver/url.h  |   2 +-
 src/lua/lua_config.c |   1 -
 4 files changed, 234 insertions(+), 107 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 31438ddad..47d8ec836 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1349,7 +1349,8 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 	if (rspamd_substring_search (start, len, "://", 3) == -1) {
 		if (len >= sizeof ("mailto:") &&
 				(memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
-						memcmp (start, "tel:", sizeof ("tel:") - 1) == 0)) {
+				 memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
+				 memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
 			/* Exclusion, has valid but 'strange' prefix */
 		}
 		else {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index b14d1cfa7..6b4a0d2d0 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -103,6 +103,11 @@ static const struct {
 				.name = "tel",
 				.len = 3
 		},
+		{
+				.proto = PROTOCOL_TELEPHONE,
+				.name = "callto",
+				.len = 3
+		},
 		{
 				.proto = PROTOCOL_UNKNOWN,
 				.name = NULL,
@@ -193,7 +198,7 @@ struct url_matcher static_matchers[] = {
 				0, 0},
 		{"mailto:",   "",          url_email_start, url_email_end,
 				0, 0},
-		{"callto://", "",          url_web_start,   url_web_end,
+		{"callto:", "",            url_tel_start,   url_tel_end,
 				0, 0},
 		{"h323:",     "",          url_web_start,   url_web_end,
 				0, 0},
@@ -738,6 +743,144 @@ rspamd_mailto_parse (struct http_parser_url *u,
 	return ret;
 }
 
+static gint
+rspamd_telephone_parse (struct http_parser_url *u,
+						const gchar *str, gsize len,
+						gchar const **end,
+						enum rspamd_url_parse_flags parse_flags,
+						guint *flags)
+{
+	enum {
+		parse_protocol,
+		parse_semicolon,
+		parse_slash,
+		parse_slash_slash,
+		parse_spaces,
+		parse_plus,
+		parse_phone_start,
+		parse_phone,
+	} st = parse_protocol;
+
+	const gchar *p = str, *c = str, *last = str + len;
+	gchar t;
+	gint ret = 1, i;
+	UChar32 uc;
+
+	if (u != NULL) {
+		memset (u, 0, sizeof (*u));
+	}
+
+	while (p < last) {
+		t = *p;
+
+		switch (st) {
+		case parse_protocol:
+			if (t == ':') {
+				st = parse_semicolon;
+				SET_U (u, UF_SCHEMA);
+			}
+			p++;
+			break;
+		case parse_semicolon:
+			if (t == '/' || t == '\\') {
+				st = parse_slash;
+				p++;
+			}
+			else {
+				st = parse_slash_slash;
+			}
+			break;
+		case parse_slash:
+			if (t == '/' || t == '\\') {
+				st = parse_slash_slash;
+			}
+			else {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_slash_slash:
+			if (g_ascii_isspace (t)) {
+				st = parse_spaces;
+				p++;
+			}
+			else if (t == '+') {
+				c = p;
+				st = parse_plus;
+			}
+			else if (t == '/') {
+				/* Skip multiple slashes */
+				p++;
+			}
+			else {
+				st = parse_phone_start;
+				c = p;
+			}
+			break;
+		case parse_spaces:
+			if (t == '+') {
+				c = p;
+				st = parse_plus;
+			}
+			else if (!g_ascii_isspace (t)) {
+				st = parse_phone_start;
+				c = p;
+			}
+			else {
+				p ++;
+			}
+			break;
+		case parse_plus:
+			c = p;
+			p ++;
+			st = parse_phone_start;
+			break;
+		case parse_phone_start:
+			if (*p == '%' || *p == '(' || g_ascii_isdigit (*p)) {
+				st = parse_phone;
+				p ++;
+			}
+			else {
+				goto out;
+			}
+			break;
+		case parse_phone:
+			i = p - str;
+			U8_NEXT (str, i, len, uc);
+			p = str + i;
+
+			if (u_isdigit (uc) || uc == '(' || uc == ')' || uc == '[' || uc == ']'
+				|| u_isspace (uc) || uc == '%') {
+				p ++;
+			}
+			else if (uc <= 0 || is_url_end (uc)) {
+				ret = 0;
+				goto set;
+			}
+			break;
+		}
+	}
+
+	set:
+	if (st == parse_phone) {
+		if (p - c != 0) {
+			SET_U (u, UF_HOST);
+			ret = 0;
+		}
+	}
+
+	out:
+	if (end != NULL) {
+		*end = p;
+	}
+
+	if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+		return 0;
+	}
+
+	return ret;
+}
+
 static gint
 rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 				  gchar const **end,
@@ -1638,6 +1781,40 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 	}
 }
 
+static void
+rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
+{
+	gchar *t, *h, *end;
+	gint i = 0, w, orig_len;
+	UChar32 uc;
+
+	t = uri->host;
+	h = t;
+	end = uri->host + uri->hostlen;
+	orig_len = uri->hostlen;
+
+	if (*h == '+') {
+		h ++;
+		t ++;
+	}
+
+	while (h < end) {
+		i = 0;
+		U8_NEXT (h, i, end - h, uc);
+
+		if (u_isdigit (uc)) {
+			w = 0;
+			U8_APPEND_UNSAFE (t, w, uc);
+			t += w;
+		}
+
+		h += i;
+	}
+
+	uri->hostlen = t - uri->host;
+	uri->urllen -= (orig_len - uri->hostlen);
+}
+
 enum uri_errno
 rspamd_url_parse (struct rspamd_url *uri,
 				  gchar *uristring, gsize len,
@@ -1659,6 +1836,7 @@ rspamd_url_parse (struct rspamd_url *uri,
 	}
 
 	p = uristring;
+	uri->protocol = PROTOCOL_UNKNOWN;
 
 	if (len > sizeof ("mailto:") - 1) {
 		/* For mailto: urls we also need to add slashes to make it a valid URL */
@@ -1666,75 +1844,11 @@ rspamd_url_parse (struct rspamd_url *uri,
 			ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags,
 					&flags);
 		}
-		else if (g_ascii_strncasecmp (p, "tel:", sizeof ("tel:") - 1) == 0) {
-			/* Telephone url */
-			gint nlen = 0;
-			gboolean has_plus = FALSE;
-			end = p + len;
-			gchar *t, *tend;
-			UChar32 uc;
-
-			uri->raw = p;
-			uri->rawlen = len;
-			uri->string = rspamd_mempool_alloc (pool, len + 1);
-			t = uri->string;
-			tend = t + len;
-			i = 4;
-
-			memcpy (t, "tel:", 4);
-			t += 4;
-			p += 4;
-			nlen = 4;
-
-			if (*p == '+') {
-				has_plus = TRUE;
-				*t++ = *p++;
-				nlen ++;
-				i ++;
-			}
-
-			while (t < tend && i < len) {
-				U8_NEXT (uristring, i, len, uc);
-
-				if (u_isdigit (uc)) {
-					if (g_ascii_isdigit (uc)) {
-						*t++ = uc;
-						nlen ++;
-					}
-					else {
-						/* Obfuscated number */
-						uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
-					}
-				}
-				else if (IS_OBSCURED_CHAR (uc)) {
-					uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
-				}
-			}
-
-			*t = '\0';
-
-			if (rspamd_normalise_unicode_inplace (pool, uri->string, &nlen)) {
-				uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
-			}
-
-			uri->urllen = nlen;
-
+		else if (g_ascii_strncasecmp (p, "tel:", sizeof ("tel:") - 1) == 0 ||
+				 g_ascii_strncasecmp (p, "callto:", sizeof ("callto:") - 1) == 0) {
+			ret = rspamd_telephone_parse (&u, uristring, len, &end, parse_flags,
+					&flags);
 			uri->protocol = PROTOCOL_TELEPHONE;
-			uri->protocollen = 4;
-
-			uri->host = uri->string + 4;
-			uri->hostlen = nlen - 4;
-
-			if (has_plus) {
-				uri->tld = uri->string + 5;
-				uri->tldlen = nlen - 5;
-			}
-			else {
-				uri->tld = uri->string + 4;
-				uri->tldlen = nlen - 4;
-			}
-
-			return URI_ERRNO_OK;
 		}
 		else {
 			ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags,
@@ -1867,32 +1981,46 @@ rspamd_url_parse (struct rspamd_url *uri,
 	rspamd_str_lc (uri->string, uri->protocollen);
 	rspamd_str_lc_utf8 (uri->host, uri->hostlen);
 
-	uri->protocol = PROTOCOL_UNKNOWN;
-
-	for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) {
-		if (uri->protocollen == rspamd_url_protocols[i].len) {
-			if (memcmp (uri->string,
-					rspamd_url_protocols[i].name, uri->protocollen) == 0) {
-				uri->protocol = rspamd_url_protocols[i].proto;
-				break;
+	if (uri->protocol == PROTOCOL_UNKNOWN) {
+		for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) {
+			if (uri->protocollen == rspamd_url_protocols[i].len) {
+				if (memcmp (uri->string,
+						rspamd_url_protocols[i].name, uri->protocollen) == 0) {
+					uri->protocol = rspamd_url_protocols[i].proto;
+					break;
+				}
 			}
 		}
 	}
 
-	/* Find TLD part */
-	rspamd_multipattern_lookup (url_scanner->search_trie,
+	if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP)) {
+		/* Find TLD part */
+		rspamd_multipattern_lookup (url_scanner->search_trie,
 				uri->host, uri->hostlen,
 				rspamd_tld_trie_callback, uri, NULL);
 
-	if (uri->tldlen == 0) {
-		if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
-			/* Ignore URL's without TLD if it is not a numeric URL */
-			if (!rspamd_url_is_ip (uri, pool)) {
-				return URI_ERRNO_TLD_MISSING;
+		if (uri->tldlen == 0) {
+			if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+				/* Ignore URL's without TLD if it is not a numeric URL */
+				if (!rspamd_url_is_ip (uri, pool)) {
+					return URI_ERRNO_TLD_MISSING;
+				}
+			} else {
+				/* Assume tld equal to host */
+				uri->tld = uri->host;
+				uri->tldlen = uri->hostlen;
 			}
 		}
+	}
+	else if (uri->protocol & PROTOCOL_TELEPHONE) {
+		/* We need to normalise phone number: remove all spaces and braces */
+		rspamd_telephone_normalise_inplace (uri);
+
+		if (uri->host[0] == '+') {
+			uri->tld = uri->host + 1;
+			uri->tldlen = uri->hostlen - 1;
+		}
 		else {
-			/* Assume tld equal to host */
 			uri->tld = uri->host;
 			uri->tldlen = uri->hostlen;
 		}
@@ -2371,13 +2499,15 @@ url_tel_start (struct url_callback_data *cb,
 			   const gchar *pos,
 			   url_match_t *match)
 {
-	if (!(*pos == '+' || g_ascii_isdigit (*pos))) {
-		/* Urls cannot start with . */
-		return FALSE;
-	}
-
 	match->m_begin = pos;
 
+	if (pos >= cb->begin + 1) {
+		match->st = *(pos - 1);
+	}
+	else {
+		match->st = '\0';
+	}
+
 	return TRUE;
 }
 
@@ -2386,29 +2516,26 @@ url_tel_end (struct url_callback_data *cb,
 			 const gchar *pos,
 			 url_match_t *match)
 {
-	UChar32 uc;
-	gint len = cb->end - pos, i = 0;
+	const gchar *last = NULL;
+	struct http_parser_url u;
+	gint len = cb->end - pos;
+	guint flags = 0;
 
 	if (match->newline_pos && match->st != '<') {
 		/* We should also limit our match end to the newline */
 		len = MIN (len, match->newline_pos - pos);
 	}
 
-	while (i < len) {
-		U8_NEXT (pos, i, len, uc);
-
-		if (uc < 0) {
-			break;
-		}
+	if (rspamd_telephone_parse (&u, pos, len, &last,
+			RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
+		return FALSE;
+	}
 
-		if (!(u_isdigit (uc) || u_isspace (uc) ||
-			  IS_OBSCURED_CHAR (uc) || uc == '+' ||
-			  uc == '-' || uc == '.')) {
-			break;
-		}
+	if (!(u.field_set & (1 << UF_HOST))) {
+		return FALSE;
 	}
 
-	match->m_len = i;
+	match->m_len = (last - pos);
 
 	return TRUE;
 }
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 4d1948921..3deeb8cf5 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -85,7 +85,7 @@ enum rspamd_url_protocol {
 	PROTOCOL_HTTPS = 1u << 3,
 	PROTOCOL_MAILTO = 1u << 4,
 	PROTOCOL_TELEPHONE = 1u << 5,
-	PROTOCOL_UNKNOWN = -1,
+	PROTOCOL_UNKNOWN = 1u << 31,
 };
 
 /**
diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c
index 884467e10..2ea72f075 100644
--- a/src/lua/lua_config.c
+++ b/src/lua/lua_config.c
@@ -2619,7 +2619,6 @@ lua_config_newindex (lua_State *L)
 			gint type = SYMBOL_TYPE_NORMAL, priority = 0, idx;
 			gdouble weight = 1.0, score = NAN;
 			const char *type_str, *group = NULL, *description = NULL;
-			guint flags = 0;
 
 			no_squeeze = cfg->disable_lua_squeeze;
 			/*


More information about the Commits mailing list