commit 0917da5: [Fix] Deny URLs where hostname is bogus

Vsevolod Stakhov vsevolod at highsecure.ru
Thu May 2 09:49:05 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-05-02 10:46:01 +0100
URL: https://github.com/rspamd/rspamd/commit/0917da51251df2566c1ad722b2727dfaa2969711 (HEAD -> master)

[Fix] Deny URLs where hostname is bogus

---
 src/libserver/url.c | 771 ++++++++++++++++++++++++++--------------------------
 1 file changed, 390 insertions(+), 381 deletions(-)

diff --git a/src/libserver/url.c b/src/libserver/url.c
index 0ec256902..ea46c0353 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -915,6 +915,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 		parse_multiple_at,
 		parse_password_start,
 		parse_password,
+		parse_domain_start,
 		parse_domain,
 		parse_ipv6,
 		parse_port_password,
@@ -933,465 +934,473 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 		t = *p;
 
 		switch (st) {
-			case parse_protocol:
-				if (t == ':') {
-					st = parse_semicolon;
-					SET_U (u, UF_SCHEMA);
+		case parse_protocol:
+			if (t == ':') {
+				st = parse_semicolon;
+				SET_U (u, UF_SCHEMA);
+			}
+			else if (!g_ascii_isalnum (t) && t != '+' && t != '-') {
+				if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) {
+					/* We might have some domain, but no protocol */
+					st = parse_domain_start;
+					p = c;
+					slash = c;
+					break;
 				}
-				else if (!g_ascii_isalnum (t) && t != '+' && t != '-') {
-					if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) {
-						/* We might have some domain, but no protocol */
-						st = parse_domain;
-						p = c;
-						slash = c;
-						break;
-					}
-					else {
-						goto out;
-					}
+				else {
+					goto out;
 				}
+			}
+			p++;
+			break;
+		case parse_semicolon:
+			if (t == '/' || t == '\\') {
+				st = parse_slash;
 				p++;
-				break;
-			case parse_semicolon:
-				if (t == '/' || t == '\\') {
-					st = parse_slash;
+			}
+			else {
+				st = parse_slash_slash;
+				*(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES;
+			}
+			break;
+		case parse_slash:
+			if (t == '/' || t == '\\') {
+				st = parse_slash_slash;
+			}
+			else {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_slash_slash:
+
+			if (t != '/' && t != '\\') {
+				c = p;
+				st = parse_domain_start;
+				slash = p;
+
+				if (*p == '[') {
+					st = parse_ipv6;
 					p++;
+					c = p;
 				}
-				else {
-					st = parse_slash_slash;
-					*(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES;
-				}
-				break;
-			case parse_slash:
-				if (t == '/' || t == '\\') {
-					st = parse_slash_slash;
-				}
-				else {
+			}
+			else {
+				/* Skip multiple slashes */
+				p++;
+			}
+			break;
+		case parse_ipv6:
+			if (t == ']') {
+				if (p - c == 0) {
 					goto out;
 				}
+				SET_U (u, UF_HOST);
 				p++;
-				break;
-			case parse_slash_slash:
-
-				if (t != '/' && t != '\\') {
-					c = p;
-					st = parse_domain;
-					slash = p;
 
-					if (*p == '[') {
-						st = parse_ipv6;
-						p++;
-						c = p;
-					}
+				if (*p == ':') {
+					st = parse_port;
+					c = p + 1;
 				}
-				else {
-					/* Skip multiple slashes */
-					p++;
+				else if (*p == '/') {
+					st = parse_path;
+					c = p + 1;
 				}
-				break;
-			case parse_ipv6:
-				if (t == ']') {
-					if (p - c == 0) {
-						goto out;
-					}
-					SET_U (u, UF_HOST);
-					p++;
-
-					if (*p == ':') {
-						st = parse_port;
-						c = p + 1;
-					}
-					else if (*p == '/') {
-						st = parse_path;
-						c = p + 1;
-					}
-					else if (p != last) {
-						goto out;
-					}
+				else if (p != last) {
+					goto out;
 				}
-				else if (!g_ascii_isxdigit (t) && t != ':' && t != '.') {
+			}
+			else if (!g_ascii_isxdigit (t) && t != ':' && t != '.') {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_user:
+			if (t == ':') {
+				if (p - c == 0) {
 					goto out;
 				}
-				p++;
-				break;
-			case parse_user:
-				if (t == ':') {
-					if (p - c == 0) {
-						goto out;
-					}
-					user_start = c;
-					st = parse_password_start;
+				user_start = c;
+				st = parse_password_start;
+			}
+			else if (t == '@') {
+				/* No password */
+				if (p - c == 0) {
+					/* We have multiple at in fact */
+					st = parse_multiple_at;
+					user_seen = TRUE;
+					*flags |= RSPAMD_URL_FLAG_OBSCURED;
+
+					continue;
+				}
+
+				SET_U (u, UF_USERINFO);
+				*flags |= RSPAMD_URL_FLAG_HAS_USER;
+				st = parse_at;
+			}
+			else if (!g_ascii_isgraph (t)) {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_multiple_at:
+			if (t != '@') {
+				if (p - c == 0) {
+					goto out;
 				}
-				else if (t == '@') {
-					/* No password */
-					if (p - c == 0) {
-						/* We have multiple at in fact */
-						st = parse_multiple_at;
-						user_seen = TRUE;
-						*flags |= RSPAMD_URL_FLAG_OBSCURED;
 
-						continue;
+				/* For now, we ignore all that stuff as it is bogus */
+				SET_U (u, UF_USERINFO);
+				st = parse_at;
+			}
+			else {
+				p ++;
+			}
+			break;
+		case parse_password_start:
+			if (t == '@') {
+				/* Empty password */
+				SET_U (u, UF_USERINFO);
+				if (u != NULL && u->field_data[UF_USERINFO].len > 0) {
+					/* Eat semicolon */
+					u->field_data[UF_USERINFO].len--;
+				}
+				*flags |= RSPAMD_URL_FLAG_HAS_USER;
+				st = parse_at;
+			}
+			else {
+				c = p;
+				password_start = p;
+				st = parse_password;
+			}
+			p++;
+			break;
+		case parse_password:
+			if (t == '@') {
+				/* XXX: password is not stored */
+				if (u != NULL) {
+					if (u->field_data[UF_USERINFO].len == 0
+						&& password_start
+						&& user_start && password_start > user_start + 1) {
+						*flags |= RSPAMD_URL_FLAG_HAS_USER;
+						u->field_set |= 1u << (UF_USERINFO);
+						u->field_data[UF_USERINFO].len =
+								password_start - user_start - 1;
+						u->field_data[UF_USERINFO].off =
+								user_start - str;
 					}
 
-					SET_U (u, UF_USERINFO);
-					*flags |= RSPAMD_URL_FLAG_HAS_USER;
-					st = parse_at;
 				}
-				else if (!g_ascii_isgraph (t)) {
+				st = parse_at;
+			}
+			else if (!g_ascii_isgraph (t)) {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_at:
+			c = p;
+
+			if (t == '@') {
+				*flags |= RSPAMD_URL_FLAG_OBSCURED;
+				p ++;
+			}
+			else if (t == '[') {
+				st = parse_ipv6;
+				p++;
+				c = p;
+			}
+			else {
+				st = parse_domain_start;
+			}
+			break;
+		case parse_domain_start:
+			if (g_ascii_isalnum (t) || t & 0x80) {
+				st = parse_domain;
+			}
+			else {
+				goto out;
+			}
+			break;
+		case parse_domain:
+			if (t == '/' || t == ':' || t == '?' || t == '#') {
+				if (p - c == 0) {
 					goto out;
 				}
-				p++;
-				break;
-			case parse_multiple_at:
-				if (t != '@') {
-					if (p - c == 0) {
-						goto out;
-					}
-
-					/* For now, we ignore all that stuff as it is bogus */
-					SET_U (u, UF_USERINFO);
-					st = parse_at;
+				if (t == '/') {
+					SET_U (u, UF_HOST);
+					st = parse_suffix_slash;
 				}
-				else {
-					p ++;
+				else if (t == '?') {
+					SET_U (u, UF_HOST);
+					st = parse_query;
+					c = p + 1;
 				}
-				break;
-			case parse_password_start:
-				if (t == '@') {
-					/* Empty password */
-					SET_U (u, UF_USERINFO);
-					if (u != NULL && u->field_data[UF_USERINFO].len > 0) {
-						/* Eat semicolon */
-						u->field_data[UF_USERINFO].len--;
-					}
-					*flags |= RSPAMD_URL_FLAG_HAS_USER;
-					st = parse_at;
+				else if (t == '#') {
+					SET_U (u, UF_HOST);
+					st = parse_part;
+					c = p + 1;
+				}
+				else if (!user_seen) {
+					/*
+					 * Here we can have both port and password, hence we need
+					 * to apply some heuristic here
+					 */
+					st = parse_port_password;
 				}
 				else {
-					c = p;
-					password_start = p;
-					st = parse_password;
+					/*
+					 * We can go only for parsing port here
+					 */
+					SET_U (u, UF_HOST);
+					st = parse_port;
+					c = p + 1;
 				}
 				p++;
-				break;
-			case parse_password:
-				if (t == '@') {
-					/* XXX: password is not stored */
-					if (u != NULL) {
-						if (u->field_data[UF_USERINFO].len == 0
-								&& password_start
-								&& user_start && password_start > user_start + 1) {
-							*flags |= RSPAMD_URL_FLAG_HAS_USER;
-							u->field_set |= 1u << (UF_USERINFO);
-							u->field_data[UF_USERINFO].len =
-									password_start - user_start - 1;
-							u->field_data[UF_USERINFO].off =
-									user_start - str;
-						}
-
-					}
-					st = parse_at;
+			}
+			else {
+				if (is_url_end (t)) {
+					goto set;
 				}
-				else if (!g_ascii_isgraph (t)) {
-					goto out;
+				else if (*p == '@' && !user_seen) {
+					/* We need to fallback and test user */
+					p = slash;
+					user_seen = TRUE;
+					st = parse_user;
 				}
-				p++;
-				break;
-			case parse_at:
-				c = p;
+				else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
+					if (*p & 0x80) {
+						(*flags) |= RSPAMD_URL_FLAG_IDN;
+						guint i = 0;
 
-				if (t == '@') {
-					*flags |= RSPAMD_URL_FLAG_OBSCURED;
-					p ++;
-				}
-				else if (t == '[') {
-					st = parse_ipv6;
-					p++;
-					c = p;
-				}
-				else {
-					st = parse_domain;
-				}
-				break;
-			case parse_domain:
-				if (t == '/' || t == ':' || t == '?' || t == '#') {
-					if (p - c == 0) {
-						goto out;
-					}
-					if (t == '/') {
-						SET_U (u, UF_HOST);
-						st = parse_suffix_slash;
-					}
-					else if (t == '?') {
-						SET_U (u, UF_HOST);
-						st = parse_query;
-						c = p + 1;
-					}
-					else if (t == '#') {
-						SET_U (u, UF_HOST);
-						st = parse_part;
-						c = p + 1;
-					}
-					else if (!user_seen) {
-						/*
-						 * Here we can have both port and password, hence we need
-						 * to apply some heuristic here
-						 */
-						st = parse_port_password;
-					}
-					else {
-						/*
-						 * We can go only for parsing port here
-						 */
-						SET_U (u, UF_HOST);
-						st = parse_port;
-						c = p + 1;
-					}
-					p++;
-				}
-				else {
-					if (is_url_end (t)) {
-						goto set;
-					}
-					else if (*p == '@' && !user_seen) {
-						/* We need to fallback and test user */
-						p = slash;
-						user_seen = TRUE;
-						st = parse_user;
-					}
-					else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
-						if (*p & 0x80) {
-							(*flags) |= RSPAMD_URL_FLAG_IDN;
-							guint i = 0;
+						U8_NEXT (p, i, last - p, uc);
 
-							U8_NEXT (p, i, last - p, uc);
+						if (uc < 0) {
+							/* Bad utf8 */
+							goto out;
+						}
 
-							if (uc < 0) {
-								/* Bad utf8 */
-								goto out;
+						if (!u_isalnum (uc)) {
+							/* Bad symbol */
+							if (IS_ZERO_WIDTH_SPACE (uc)) {
+								(*flags) |= RSPAMD_URL_FLAG_OBSCURED;
 							}
-
-							if (!u_isalnum (uc)) {
-								/* Bad symbol */
-								if (IS_ZERO_WIDTH_SPACE (uc)) {
-									(*flags) |= RSPAMD_URL_FLAG_OBSCURED;
-								}
-								else {
-									if (!u_isgraph (uc)) {
-										if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
-											goto out;
-										}
-										else {
-											goto set;
-										}
+							else {
+								if (!u_isgraph (uc)) {
+									if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+										goto out;
+									}
+									else {
+										goto set;
 									}
 								}
 							}
-
-							p = p + i;
 						}
-						else if (is_urlsafe (*p)) {
+
+						p = p + i;
+					}
+					else if (is_urlsafe (*p)) {
+						p ++;
+					}
+					else {
+						if (parse_flags & RSPAMD_URL_PARSE_HREF) {
+							/* We have to use all shit we are given here */
 							p ++;
+							(*flags) |= RSPAMD_URL_FLAG_OBSCURED;
 						}
 						else {
-							if (parse_flags & RSPAMD_URL_PARSE_HREF) {
-								/* We have to use all shit we are given here */
-								p ++;
-								(*flags) |= RSPAMD_URL_FLAG_OBSCURED;
+							if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+								goto out;
 							}
 							else {
-								if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
-									goto out;
-								}
-								else {
-									goto set;
-								}
+								goto set;
 							}
 						}
 					}
-					else {
-						p++;
-					}
 				}
-				break;
-			case parse_port_password:
-				if (g_ascii_isdigit (t)) {
-					const gchar *tmp = p;
-
-					while (tmp < last) {
-						if (!g_ascii_isdigit (*tmp)) {
-							if (*tmp == '/' || *tmp == '#' || *tmp == '?' ||
-									is_url_end (*tmp) || g_ascii_isspace (*tmp)) {
-								/* Port + something */
-								st = parse_port;
-								c = slash;
-								p--;
-								SET_U (u, UF_HOST);
-								p++;
-								c = p;
-								break;
-							}
-							else {
-								/* Not a port, bad character at the end */
-								break;
-							}
+				else {
+					p++;
+				}
+			}
+			break;
+		case parse_port_password:
+			if (g_ascii_isdigit (t)) {
+				const gchar *tmp = p;
+
+				while (tmp < last) {
+					if (!g_ascii_isdigit (*tmp)) {
+						if (*tmp == '/' || *tmp == '#' || *tmp == '?' ||
+							is_url_end (*tmp) || g_ascii_isspace (*tmp)) {
+							/* Port + something */
+							st = parse_port;
+							c = slash;
+							p--;
+							SET_U (u, UF_HOST);
+							p++;
+							c = p;
+							break;
+						}
+						else {
+							/* Not a port, bad character at the end */
+							break;
 						}
-						tmp ++;
-					}
-
-					if (tmp == last) {
-						/* Host + port only */
-						st = parse_port;
-						c = slash;
-						p--;
-						SET_U (u, UF_HOST);
-						p++;
-						c = p;
 					}
+					tmp ++;
+				}
 
-					if (st != parse_port) {
-						/* Fallback to user:password */
-						p = slash;
-						c = slash;
-						user_seen = TRUE;
-						st = parse_user;
-					}
+				if (tmp == last) {
+					/* Host + port only */
+					st = parse_port;
+					c = slash;
+					p--;
+					SET_U (u, UF_HOST);
+					p++;
+					c = p;
 				}
-				else {
-					/* Rewind back */
+
+				if (st != parse_port) {
+					/* Fallback to user:password */
 					p = slash;
 					c = slash;
 					user_seen = TRUE;
 					st = parse_user;
 				}
-				break;
-			case parse_port:
-				if (t == '/') {
-					pt = strtoul (c, NULL, 10);
-					if (pt == 0 || pt > 65535) {
-						goto out;
-					}
-					if (u != NULL) {
-						u->port = pt;
-						*flags |= RSPAMD_URL_FLAG_HAS_PORT;
-					}
-					st = parse_suffix_slash;
+			}
+			else {
+				/* Rewind back */
+				p = slash;
+				c = slash;
+				user_seen = TRUE;
+				st = parse_user;
+			}
+			break;
+		case parse_port:
+			if (t == '/') {
+				pt = strtoul (c, NULL, 10);
+				if (pt == 0 || pt > 65535) {
+					goto out;
+				}
+				if (u != NULL) {
+					u->port = pt;
+					*flags |= RSPAMD_URL_FLAG_HAS_PORT;
+				}
+				st = parse_suffix_slash;
+			}
+			else if (t == '?') {
+				pt = strtoul (c, NULL, 10);
+				if (pt == 0 || pt > 65535) {
+					goto out;
+				}
+				if (u != NULL) {
+					u->port = pt;
+					*flags |= RSPAMD_URL_FLAG_HAS_PORT;
 				}
-				else if (t == '?') {
-					pt = strtoul (c, NULL, 10);
-					if (pt == 0 || pt > 65535) {
-						goto out;
-					}
-					if (u != NULL) {
-						u->port = pt;
-						*flags |= RSPAMD_URL_FLAG_HAS_PORT;
-					}
 
-					c = p + 1;
-					st = parse_query;
+				c = p + 1;
+				st = parse_query;
+			}
+			else if (t == '#') {
+				pt = strtoul (c, NULL, 10);
+				if (pt == 0 || pt > 65535) {
+					goto out;
+				}
+				if (u != NULL) {
+					u->port = pt;
+					*flags |= RSPAMD_URL_FLAG_HAS_PORT;
 				}
-				else if (t == '#') {
*** OUTPUT TRUNCATED, 177 LINES SKIPPED ***


More information about the Commits mailing list