commit d029c2a: [Fix] Fix some corner cases of single-host urls parsing
Vsevolod Stakhov
vsevolod at rspamd.com
Thu Aug 10 12:14:04 UTC 2023
Author: Vsevolod Stakhov
Date: 2023-08-10 13:07:38 +0100
URL: https://github.com/rspamd/rspamd/commit/d029c2a4004b91c87482e7c5b2e96b93179ddb56 (HEAD -> master)
[Fix] Fix some corner cases of single-host urls parsing
---
src/libserver/url.c | 87 ++++++++++++++++++++++++++++-------------------------
1 file changed, 46 insertions(+), 41 deletions(-)
diff --git a/src/libserver/url.c b/src/libserver/url.c
index b9b19c355..ab32549c7 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
+/*
+ * Copyright 2023 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,31 +14,6 @@
* limitations under the License.
*/
-/*
- * Copyright (C) 2002-2015 Igor Sysoev
- * Copyright (C) 2011-2015 Nginx, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
#include "config.h"
#include "url.h"
#include "util.h"
@@ -193,8 +168,9 @@ struct url_matcher static_matchers[] = {
{"ftp.", "ftp://", url_web_start, url_web_end,
0},
/* Likely emails */
- {"@", "mailto://", url_email_start, url_email_end,
- 0}};
+ {
+ "@", "mailto://", url_email_start, url_email_end,
+ 0}};
struct rspamd_url_flag_name {
const gchar *name;
@@ -1817,7 +1793,7 @@ rspamd_url_regen_from_inet_addr(struct rspamd_url *uri, const void *addr, int af
}
static gboolean
-rspamd_url_is_ip(struct rspamd_url *uri, rspamd_mempool_t *pool)
+rspamd_url_maybe_regenerate_from_ip(struct rspamd_url *uri, rspamd_mempool_t *pool)
{
const gchar *p, *end, *c;
gchar *errstr;
@@ -2214,7 +2190,7 @@ rspamd_url_parse(struct rspamd_url *uri,
struct http_parser_url u;
gchar *p;
const gchar *end;
- guint i, complen, ret, flags = 0;
+ guint complen, ret, flags = 0;
gsize unquoted_len = 0;
memset(uri, 0, sizeof(*uri));
@@ -2277,7 +2253,7 @@ rspamd_url_parse(struct rspamd_url *uri,
p + u.field_data[UF_SCHEMA].len + 1,
len - 2 - u.field_data[UF_SCHEMA].len);
/* Compensate slashes added */
- for (i = UF_SCHEMA + 1; i < UF_MAX; i++) {
+ for (int i = UF_SCHEMA + 1; i < UF_MAX; i++) {
if (u.field_set & (1 << i)) {
u.field_data[i].off += 2;
}
@@ -2291,7 +2267,7 @@ rspamd_url_parse(struct rspamd_url *uri,
uri->urllen = len;
uri->flags = flags;
- for (i = 0; i < UF_MAX; i++) {
+ for (guint i = 0; i < UF_MAX; i++) {
if (u.field_set & (1 << i)) {
guint shift = u.field_data[i].off;
complen = u.field_data[i].len;
@@ -2458,7 +2434,7 @@ rspamd_url_parse(struct rspamd_url *uri,
rspamd_url_shift(uri, unquoted_len, UF_HOST);
if (uri->protocol == PROTOCOL_UNKNOWN) {
- for (i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) {
+ for (int i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) {
if (uri->protocollen == rspamd_url_protocols[i].len) {
if (memcmp(uri->string,
rspamd_url_protocols[i].name, uri->protocollen) == 0) {
@@ -2481,21 +2457,50 @@ rspamd_url_parse(struct rspamd_url *uri,
/*
* If we have not detected eSLD, but there are no dots in the hostname,
* then we should treat the whole hostname as eSLD - a rule of thumb
+ *
+ * We also check that the hostname ends with a permitted character and that all of its characters
+ * form a DNS label. A numeric IP also has to be checked for as part of this test.
*/
- if (uri->hostlen > 0 && memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen) == NULL) {
- uri->tldlen = uri->hostlen;
- uri->tldshift = uri->hostshift;
+ const char *dot_pos = memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen);
+ bool is_whole_hostname_tld = false;
+
+ if (uri->hostlen > 0 && (dot_pos == NULL || dot_pos == rspamd_url_host_unsafe(uri) + uri->hostlen - 1)) {
+ bool all_chars_domain = true;
+
+ for (int i = 0; i < uri->hostlen; i++) {
+ if (!is_domain(rspamd_url_host_unsafe(uri)[i])) {
+ all_chars_domain = false;
+ break;
+ }
+ }
+
+ if (all_chars_domain) {
+ /* Also check that the last character is either a dot or an alphanumeric character */
+ char last_c = rspamd_url_host_unsafe(uri)[uri->hostlen - 1];
+ if (last_c != '.' && !g_ascii_isalnum(last_c)) {
+ all_chars_domain = false;
+ }
+ }
+
+ if (all_chars_domain) {
+ /* Additionally check for a numeric IP as we can have some number here... */
+ rspamd_url_maybe_regenerate_from_ip(uri, pool);
+ uri->tldlen = uri->hostlen;
+ uri->tldshift = uri->hostshift;
+ is_whole_hostname_tld = true;
+ }
}
- else {
+
+ if (!is_whole_hostname_tld) {
if (uri->protocol != PROTOCOL_MAILTO) {
if (url_scanner->has_tld_file && !(parse_flags & RSPAMD_URL_PARSE_HREF)) {
/* Ignore URL's without TLD if it is not a numeric URL */
- if (!rspamd_url_is_ip(uri, pool)) {
+ if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) {
return URI_ERRNO_TLD_MISSING;
}
}
else {
- if (!rspamd_url_is_ip(uri, pool)) {
+ if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) {
/* Assume tld equal to host */
uri->tldshift = uri->hostshift;
uri->tldlen = uri->hostlen;
More information about the Commits mailing list