commit 938ac78: [Rework] Rework telephone urls parsing logic
Vsevolod Stakhov
vsevolod at highsecure.ru
Fri Mar 1 18:35:03 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-03-01 18:30:33 +0000
URL: https://github.com/rspamd/rspamd/commit/938ac78ad1099c5c216c2cec7ce6c55310744f1a (HEAD -> master)
[Rework] Rework telephone urls parsing logic
---
src/libserver/html.c | 3 +-
src/libserver/url.c | 335 +++++++++++++++++++++++++++++++++++----------------
src/libserver/url.h | 2 +-
src/lua/lua_config.c | 1 -
4 files changed, 234 insertions(+), 107 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 31438ddad..47d8ec836 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1349,7 +1349,8 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
if (rspamd_substring_search (start, len, "://", 3) == -1) {
if (len >= sizeof ("mailto:") &&
(memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
- memcmp (start, "tel:", sizeof ("tel:") - 1) == 0)) {
+ memcmp (start, "tel:", sizeof ("tel:") - 1) == 0 ||
+ memcmp (start, "callto:", sizeof ("callto:") - 1) == 0)) {
/* Exclusion, has valid but 'strange' prefix */
}
else {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index b14d1cfa7..6b4a0d2d0 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -103,6 +103,11 @@ static const struct {
.name = "tel",
.len = 3
},
+	{
+		/* .len must be the byte length of .name: protocol lookup compares lengths before memcmp */
+		.proto = PROTOCOL_TELEPHONE,
+		.name = "callto",
+		.len = 6
+	},
{
.proto = PROTOCOL_UNKNOWN,
.name = NULL,
@@ -193,7 +198,7 @@ struct url_matcher static_matchers[] = {
0, 0},
{"mailto:", "", url_email_start, url_email_end,
0, 0},
- {"callto://", "", url_web_start, url_web_end,
+ {"callto:", "", url_tel_start, url_tel_end,
0, 0},
{"h323:", "", url_web_start, url_web_end,
0, 0},
@@ -738,6 +743,144 @@ rspamd_mailto_parse (struct http_parser_url *u,
return ret;
}
+/**
+ * Parse a telephone-style URL: <scheme>:[//][spaces][+]<number>
+ *
+ * The text up to the first ':' is recorded as UF_SCHEMA and the number
+ * (starting at the leading '+', if present) as UF_HOST.
+ *
+ * @param u output structure, zeroed on entry when not NULL
+ *          (NOTE(review): SET_U is defined elsewhere and presumably tolerates
+ *          u == NULL and records the span [c, p) relative to str - confirm;
+ *          the locals `p`, `c` and `str` must keep their names for it)
+ * @param str input buffer (not required to be NUL terminated)
+ * @param len number of bytes available in str
+ * @param end if not NULL, receives the position where parsing stopped
+ * @param parse_flags if RSPAMD_URL_PARSE_CHECK is set the function always
+ *          returns 0 and the caller has to inspect u->field_set itself
+ * @param flags not used by this function
+ * @return 0 if a number has been found, 1 otherwise
+ */
+static gint
+rspamd_telephone_parse (struct http_parser_url *u,
+		const gchar *str, gsize len,
+		gchar const **end,
+		enum rspamd_url_parse_flags parse_flags,
+		guint *flags)
+{
+	enum {
+		parse_protocol,    /* scanning the scheme up to ':' */
+		parse_semicolon,   /* just after ':' */
+		parse_slash,       /* one slash seen, second one is mandatory */
+		parse_slash_slash, /* optional slashes are over (or were absent) */
+		parse_spaces,      /* skipping spaces before the number */
+		parse_plus,        /* leading '+' */
+		parse_phone_start, /* first symbol of the number */
+		parse_phone,       /* remaining symbols of the number */
+	} st = parse_protocol;
+
+	const gchar *p = str, *c = str, *last = str + len;
+	gchar t;
+	gint ret = 1, i;
+	UChar32 uc;
+
+	if (u != NULL) {
+		memset (u, 0, sizeof (*u));
+	}
+
+	while (p < last) {
+		t = *p;
+
+		switch (st) {
+		case parse_protocol:
+			if (t == ':') {
+				st = parse_semicolon;
+				/* Called before p is advanced, so ':' itself is not included */
+				SET_U (u, UF_SCHEMA);
+			}
+			p++;
+			break;
+		case parse_semicolon:
+			if (t == '/' || t == '\\') {
+				st = parse_slash;
+				p++;
+			}
+			else {
+				/* No slashes at all: re-examine this symbol in the next state */
+				st = parse_slash_slash;
+			}
+			break;
+		case parse_slash:
+			if (t == '/' || t == '\\') {
+				st = parse_slash_slash;
+			}
+			else {
+				/* A single slash is not allowed */
+				goto out;
+			}
+			p++;
+			break;
+		case parse_slash_slash:
+			if (g_ascii_isspace (t)) {
+				st = parse_spaces;
+				p++;
+			}
+			else if (t == '+') {
+				c = p;
+				st = parse_plus;
+			}
+			else if (t == '/') {
+				/* Skip multiple slashes */
+				p++;
+			}
+			else {
+				st = parse_phone_start;
+				c = p;
+			}
+			break;
+		case parse_spaces:
+			if (t == '+') {
+				c = p;
+				st = parse_plus;
+			}
+			else if (!g_ascii_isspace (t)) {
+				st = parse_phone_start;
+				c = p;
+			}
+			else {
+				p++;
+			}
+			break;
+		case parse_plus:
+			/* The number span starts at '+' */
+			c = p;
+			p++;
+			st = parse_phone_start;
+			break;
+		case parse_phone_start:
+			if (t == '%' || t == '(' || g_ascii_isdigit (t)) {
+				st = parse_phone;
+				p++;
+			}
+			else {
+				goto out;
+			}
+			break;
+		case parse_phone:
+			/* Decode one code point starting at p; i ends up just past it */
+			i = p - str;
+			U8_NEXT (str, i, len, uc);
+
+			if (!(u_isdigit (uc) || uc == '(' || uc == ')' ||
+					uc == '[' || uc == ']' ||
+					u_isspace (uc) || uc == '%')) {
+				if (uc <= 0 || is_url_end (uc)) {
+					/*
+					 * Invalid sequence or URL terminator: stop here and keep p
+					 * at the terminating symbol so that it is not a part of
+					 * the number
+					 */
+					ret = 0;
+					goto set;
+				}
+				/* Any other symbol is tolerated inside a number */
+			}
+
+			/*
+			 * U8_NEXT has already moved past the whole code point, so p must
+			 * not be incremented once more: that would skip an unexamined
+			 * byte and could move p beyond `last`
+			 */
+			p = str + i;
+			break;
+		}
+	}
+
+	set:
+	if (st == parse_phone) {
+		if (p - c != 0) {
+			SET_U (u, UF_HOST);
+			ret = 0;
+		}
+	}
+
+	out:
+	if (end != NULL) {
+		*end = p;
+	}
+
+	if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+		return 0;
+	}
+
+	return ret;
+}
+
static gint
rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
gchar const **end,
@@ -1638,6 +1781,40 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
}
}
+/**
+ * Normalise a telephone number stored in uri->host in place: an optional
+ * leading '+' is kept, every following code point that is not a digit
+ * (as reported by u_isdigit) is dropped. uri->hostlen is set to the new
+ * length and uri->urllen is reduced by the number of removed bytes.
+ * Bytes located after the host are not moved by this function.
+ *
+ * @param uri url whose host/hostlen/urllen are updated
+ */
+static void
+rspamd_telephone_normalise_inplace (struct rspamd_url *uri)
+{
+	gchar *t, *h, *end;
+	gint i = 0, w, orig_len;
+	UChar32 uc;
+
+	if (uri->host == NULL || uri->hostlen == 0) {
+		/*
+		 * Nothing to normalise. Without this check host[0] would be read
+		 * for an empty host and a '+' found there would grow hostlen
+		 */
+		return;
+	}
+
+	t = uri->host; /* write position */
+	h = t;         /* read position */
+	end = uri->host + uri->hostlen;
+	orig_len = uri->hostlen;
+
+	if (*h == '+') {
+		/* Keep the leading plus as is */
+		h ++;
+		t ++;
+	}
+
+	while (h < end) {
+		i = 0;
+		U8_NEXT (h, i, end - h, uc);
+
+		if (u_isdigit (uc)) {
+			/* Re-emit the digit at the write position, which never passes h */
+			w = 0;
+			U8_APPEND_UNSAFE (t, w, uc);
+			t += w;
+		}
+
+		h += i;
+	}
+
+	uri->hostlen = t - uri->host;
+	uri->urllen -= (orig_len - uri->hostlen);
+}
+
enum uri_errno
rspamd_url_parse (struct rspamd_url *uri,
gchar *uristring, gsize len,
@@ -1659,6 +1836,7 @@ rspamd_url_parse (struct rspamd_url *uri,
}
p = uristring;
+ uri->protocol = PROTOCOL_UNKNOWN;
if (len > sizeof ("mailto:") - 1) {
/* For mailto: urls we also need to add slashes to make it a valid URL */
@@ -1666,75 +1844,11 @@ rspamd_url_parse (struct rspamd_url *uri,
ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags,
&flags);
}
- else if (g_ascii_strncasecmp (p, "tel:", sizeof ("tel:") - 1) == 0) {
- /* Telephone url */
- gint nlen = 0;
- gboolean has_plus = FALSE;
- end = p + len;
- gchar *t, *tend;
- UChar32 uc;
-
- uri->raw = p;
- uri->rawlen = len;
- uri->string = rspamd_mempool_alloc (pool, len + 1);
- t = uri->string;
- tend = t + len;
- i = 4;
-
- memcpy (t, "tel:", 4);
- t += 4;
- p += 4;
- nlen = 4;
-
- if (*p == '+') {
- has_plus = TRUE;
- *t++ = *p++;
- nlen ++;
- i ++;
- }
-
- while (t < tend && i < len) {
- U8_NEXT (uristring, i, len, uc);
-
- if (u_isdigit (uc)) {
- if (g_ascii_isdigit (uc)) {
- *t++ = uc;
- nlen ++;
- }
- else {
- /* Obfuscated number */
- uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
- }
- else if (IS_OBSCURED_CHAR (uc)) {
- uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
- }
-
- *t = '\0';
-
- if (rspamd_normalise_unicode_inplace (pool, uri->string, &nlen)) {
- uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
-
- uri->urllen = nlen;
-
+ else if (g_ascii_strncasecmp (p, "tel:", sizeof ("tel:") - 1) == 0 ||
+ g_ascii_strncasecmp (p, "callto:", sizeof ("callto:") - 1) == 0) {
+ ret = rspamd_telephone_parse (&u, uristring, len, &end, parse_flags,
+ &flags);
uri->protocol = PROTOCOL_TELEPHONE;
- uri->protocollen = 4;
-
- uri->host = uri->string + 4;
- uri->hostlen = nlen - 4;
-
- if (has_plus) {
- uri->tld = uri->string + 5;
- uri->tldlen = nlen - 5;
- }
- else {
- uri->tld = uri->string + 4;
- uri->tldlen = nlen - 4;
- }
-
- return URI_ERRNO_OK;
}
else {
ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags,
@@ -1867,32 +1981,46 @@ rspamd_url_parse (struct rspamd_url *uri,
rspamd_str_lc (uri->string, uri->protocollen);
rspamd_str_lc_utf8 (uri->host, uri->hostlen);
- uri->protocol = PROTOCOL_UNKNOWN;
-
- for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) {
- if (uri->protocollen == rspamd_url_protocols[i].len) {
- if (memcmp (uri->string,
- rspamd_url_protocols[i].name, uri->protocollen) == 0) {
- uri->protocol = rspamd_url_protocols[i].proto;
- break;
+ if (uri->protocol == PROTOCOL_UNKNOWN) {
+ for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) {
+ if (uri->protocollen == rspamd_url_protocols[i].len) {
+ if (memcmp (uri->string,
+ rspamd_url_protocols[i].name, uri->protocollen) == 0) {
+ uri->protocol = rspamd_url_protocols[i].proto;
+ break;
+ }
}
}
}
- /* Find TLD part */
- rspamd_multipattern_lookup (url_scanner->search_trie,
+ if (uri->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_MAILTO|PROTOCOL_FTP)) {
+ /* Find TLD part */
+ rspamd_multipattern_lookup (url_scanner->search_trie,
uri->host, uri->hostlen,
rspamd_tld_trie_callback, uri, NULL);
- if (uri->tldlen == 0) {
- if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
- /* Ignore URL's without TLD if it is not a numeric URL */
- if (!rspamd_url_is_ip (uri, pool)) {
- return URI_ERRNO_TLD_MISSING;
+ if (uri->tldlen == 0) {
+ if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+ /* Ignore URL's without TLD if it is not a numeric URL */
+ if (!rspamd_url_is_ip (uri, pool)) {
+ return URI_ERRNO_TLD_MISSING;
+ }
+ } else {
+ /* Assume tld equal to host */
+ uri->tld = uri->host;
+ uri->tldlen = uri->hostlen;
}
}
+ }
+ else if (uri->protocol & PROTOCOL_TELEPHONE) {
+ /* We need to normalise phone number: remove all spaces and braces */
+ rspamd_telephone_normalise_inplace (uri);
+
+ if (uri->host[0] == '+') {
+ uri->tld = uri->host + 1;
+ uri->tldlen = uri->hostlen - 1;
+ }
else {
- /* Assume tld equal to host */
uri->tld = uri->host;
uri->tldlen = uri->hostlen;
}
@@ -2371,13 +2499,15 @@ url_tel_start (struct url_callback_data *cb,
const gchar *pos,
url_match_t *match)
{
- if (!(*pos == '+' || g_ascii_isdigit (*pos))) {
- /* Urls cannot start with . */
- return FALSE;
- }
-
match->m_begin = pos;
+ if (pos >= cb->begin + 1) {
+ match->st = *(pos - 1);
+ }
+ else {
+ match->st = '\0';
+ }
+
return TRUE;
}
@@ -2386,29 +2516,26 @@ url_tel_end (struct url_callback_data *cb,
const gchar *pos,
url_match_t *match)
{
- UChar32 uc;
- gint len = cb->end - pos, i = 0;
+ const gchar *last = NULL;
+ struct http_parser_url u;
+ gint len = cb->end - pos;
+ guint flags = 0;
if (match->newline_pos && match->st != '<') {
/* We should also limit our match end to the newline */
len = MIN (len, match->newline_pos - pos);
}
- while (i < len) {
- U8_NEXT (pos, i, len, uc);
-
- if (uc < 0) {
- break;
- }
+ if (rspamd_telephone_parse (&u, pos, len, &last,
+ RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
+ return FALSE;
+ }
- if (!(u_isdigit (uc) || u_isspace (uc) ||
- IS_OBSCURED_CHAR (uc) || uc == '+' ||
- uc == '-' || uc == '.')) {
- break;
- }
+ if (!(u.field_set & (1 << UF_HOST))) {
+ return FALSE;
}
- match->m_len = i;
+ match->m_len = (last - pos);
return TRUE;
}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 4d1948921..3deeb8cf5 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -85,7 +85,7 @@ enum rspamd_url_protocol {
PROTOCOL_HTTPS = 1u << 3,
PROTOCOL_MAILTO = 1u << 4,
PROTOCOL_TELEPHONE = 1u << 5,
- PROTOCOL_UNKNOWN = -1,
+ PROTOCOL_UNKNOWN = 1u << 31,
};
/**
diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c
index 884467e10..2ea72f075 100644
--- a/src/lua/lua_config.c
+++ b/src/lua/lua_config.c
@@ -2619,7 +2619,6 @@ lua_config_newindex (lua_State *L)
gint type = SYMBOL_TYPE_NORMAL, priority = 0, idx;
gdouble weight = 1.0, score = NAN;
const char *type_str, *group = NULL, *description = NULL;
- guint flags = 0;
no_squeeze = cfg->disable_lua_squeeze;
/*
More information about the Commits
mailing list