commit 96b9470: [Rework] Make http normalize path function a generic function

Vsevolod Stakhov vsevolod at rspamd.com
Sun Oct 23 21:21:05 UTC 2022


Author: Vsevolod Stakhov
Date: 2022-10-23 21:41:18 +0100
URL: https://github.com/rspamd/rspamd/commit/96b94707c1c6fde1cc2aa06522587114c5c6c809

[Rework] Make http normalize path function a generic function

---
 src/controller.c                  |  12 +-
 src/libserver/http/http_router.c  |   6 +-
 src/libserver/http/http_util.c    | 224 --------------------------------------
 src/libserver/http/http_util.h    |   9 --
 src/libserver/hyperscan_tools.cxx |   2 +-
 src/libserver/url.c               |   4 +-
 src/libutil/util.c                | 224 ++++++++++++++++++++++++++++++++++++++
 src/libutil/util.h                |   9 ++
 test/lua/unit/url.lua             |   2 +-
 9 files changed, 246 insertions(+), 246 deletions(-)

diff --git a/src/controller.c b/src/controller.c
index e695d86a4..0ff7d64c0 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -3287,9 +3287,9 @@ rspamd_controller_handle_custom (struct rspamd_http_connection_entry *conn_ent,
 		lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
 		lookup.len = u.field_data[UF_PATH].len;
 
-		rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
-				lookup.len,
-				&unnorm_len);
+		rspamd_normalize_path_inplace((gchar *) lookup.begin,
+			lookup.len,
+			&unnorm_len);
 		lookup.len = unnorm_len;
 	}
 	else {
@@ -3494,9 +3494,9 @@ rspamd_controller_handle_lua_plugin (struct rspamd_http_connection_entry *conn_e
 		lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
 		lookup.len = u.field_data[UF_PATH].len;
 
-		rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
-				lookup.len,
-				&unnorm_len);
+		rspamd_normalize_path_inplace((gchar *) lookup.begin,
+			lookup.len,
+			&unnorm_len);
 		lookup.len = unnorm_len;
 	}
 	else {
diff --git a/src/libserver/http/http_router.c b/src/libserver/http/http_router.c
index 5c4990ab6..a70ea223f 100644
--- a/src/libserver/http/http_router.c
+++ b/src/libserver/http/http_router.c
@@ -302,9 +302,9 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn,
 				lookup.begin = pathbuf;
 				lookup.len = u.field_data[UF_PATH].len;
 
-				rspamd_http_normalize_path_inplace (pathbuf,
-						lookup.len,
-						&unnorm_len);
+				rspamd_normalize_path_inplace(pathbuf,
+					lookup.len,
+					&unnorm_len);
 				lookup.len = unnorm_len;
 			}
 			else {
diff --git a/src/libserver/http/http_util.c b/src/libserver/http/http_util.c
index fd5adb3c1..c9035375b 100644
--- a/src/libserver/http/http_util.c
+++ b/src/libserver/http/http_util.c
@@ -299,228 +299,4 @@ rspamd_http_date_format (gchar *buf, gsize len, time_t time)
 			http_week[tms.tm_wday], tms.tm_mday,
 			http_month[tms.tm_mon], tms.tm_year + 1900,
 			tms.tm_hour, tms.tm_min, tms.tm_sec);
-}
-
-void
-rspamd_http_normalize_path_inplace (gchar *path, guint len, gsize *nlen)
-{
-	const gchar *p, *end, *slash = NULL, *dot = NULL;
-	gchar *o;
-	enum {
-		st_normal = 0,
-		st_got_dot,
-		st_got_dot_dot,
-		st_got_slash,
-		st_got_slash_slash,
-	} state = st_normal;
-
-	p = path;
-	end = path + len;
-	o = path;
-
-	while (p < end) {
-		switch (state) {
-		case st_normal:
-			if (G_UNLIKELY (*p == '/')) {
-				state = st_got_slash;
-				slash = p;
-			}
-			else if (G_UNLIKELY (*p == '.')) {
-				state = st_got_dot;
-				dot = p;
-			}
-			else {
-				*o++ = *p;
-			}
-			p ++;
-			break;
-		case st_got_slash:
-			if (G_UNLIKELY (*p == '/')) {
-				/* Ignore double slash */
-				*o++ = *p;
-				state = st_got_slash_slash;
-			}
-			else if (G_UNLIKELY (*p == '.')) {
-				dot = p;
-				state = st_got_dot;
-			}
-			else {
-				*o++ = '/';
-				*o++ = *p;
-				slash = NULL;
-				dot = NULL;
-				state = st_normal;
-			}
-			p ++;
-			break;
-		case st_got_slash_slash:
-			if (G_LIKELY (*p != '/')) {
-				slash = p - 1;
-				dot = NULL;
-				state = st_normal;
-				continue;
-			}
-			p ++;
-			break;
-		case st_got_dot:
-			if (G_UNLIKELY (*p == '/')) {
-				/* Remove any /./ or ./ paths */
-				if (((o > path && *(o - 1) != '/') || (o == path)) && slash) {
-					/* Preserve one slash */
-					*o++ = '/';
-				}
-
-				slash = p;
-				dot = NULL;
-				/* Ignore last slash */
-				state = st_normal;
-			}
-			else if (*p == '.') {
-				/* Double dot character */
-				state = st_got_dot_dot;
-			}
-			else {
-				/* We have something like .some or /.some */
-				if (dot && p > dot) {
-					if (slash == dot - 1 && (o > path && *(o - 1) != '/')) {
-						/* /.blah */
-						memmove (o, slash, p - slash);
-						o += p - slash;
-					}
-					else {
-						memmove (o, dot, p - dot);
-						o += p - dot;
-					}
-				}
-
-				slash = NULL;
-				dot = NULL;
-				state = st_normal;
-				continue;
-			}
-
-			p ++;
-			break;
-		case st_got_dot_dot:
-			if (*p == '/') {
-				/* We have something like /../ or ../ */
-				if (slash) {
-					/* We need to remove the last component from o if it is there */
-					if (o > path + 2 && *(o - 1) == '/') {
-						slash = rspamd_memrchr (path, '/', o - path - 2);
-					}
-					else if (o > path + 1) {
-						slash = rspamd_memrchr (path, '/', o - path - 1);
-					}
-					else {
-						slash = NULL;
-					}
-
-					if (slash) {
-						o = (gchar *)slash;
-					}
-					/* Otherwise we keep these dots */
-					slash = p;
-					state = st_got_slash;
-				}
-				else {
-					/* We have something like bla../, so we need to copy it as is */
-					if (o > path && dot && p > dot) {
-						memmove (o, dot, p - dot);
-						o += p - dot;
-					}
-
-					slash = NULL;
-					dot = NULL;
-					state = st_normal;
-					continue;
-				}
-			}
-			else {
-				/* We have something like ..bla or ... */
-				if (slash) {
-					*o ++ = '/';
-				}
-
-				if (dot && p > dot) {
-					memmove (o, dot, p - dot);
-					o += p - dot;
-				}
-
-				slash = NULL;
-				dot = NULL;
-				state = st_normal;
-				continue;
-			}
-
-			p ++;
-			break;
-		}
-	}
-
-	/* Leftover */
-	switch (state) {
-	case st_got_dot_dot:
-		/* Trailing .. */
-		if (slash) {
-			/* We need to remove the last component from o if it is there */
-			if (o > path + 2 && *(o - 1) == '/') {
-				slash = rspamd_memrchr (path, '/', o - path - 2);
-			}
-			else if (o > path + 1) {
-				slash = rspamd_memrchr (path, '/', o - path - 1);
-			}
-			else {
-				if (o == path) {
-					/* Corner case */
-					*o++ = '/';
-				}
-
-				slash = NULL;
-			}
-
-			if (slash) {
-				/* Remove last / */
-				o = (gchar *)slash;
-			}
-		}
-		else {
-			/* Corner case */
-			if (o == path) {
-				*o++ = '/';
-			}
-			else {
-				if (dot && p > dot) {
-					memmove (o, dot, p - dot);
-					o += p - dot;
-				}
-			}
-		}
-		break;
-	case st_got_dot:
-		if (slash) {
-			/* /. -> must be / */
-			*o++ = '/';
-		}
-		else {
-			if (o > path) {
-				*o++ = '.';
-			}
-		}
-		break;
-	case st_got_slash:
-		*o++ = '/';
-		break;
-	default:
-#if 0
-		if (o > path + 1 && *(o - 1) == '/') {
-			o --;
-		}
-#endif
-		break;
-	}
-
-	if (nlen) {
-		*nlen = (o - path);
-	}
 }
\ No newline at end of file
diff --git a/src/libserver/http/http_util.h b/src/libserver/http/http_util.h
index 19b497f30..3d8356c6d 100644
--- a/src/libserver/http/http_util.h
+++ b/src/libserver/http/http_util.h
@@ -40,15 +40,6 @@ time_t rspamd_http_parse_date (const gchar *header, gsize len);
  */
 glong rspamd_http_date_format (gchar *buf, gsize len, time_t time);
 
-/**
- * Normalize HTTP path removing dot sequences and repeating '/' symbols as
- * per rfc3986#section-5.2
- * @param path
- * @param len
- * @param nlen
- */
-void rspamd_http_normalize_path_inplace (gchar *path, guint len, gsize *nlen);
-
 #ifdef  __cplusplus
 }
 #endif
diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx
index 6ec5f7c36..bb1c9ffbc 100644
--- a/src/libserver/hyperscan_tools.cxx
+++ b/src/libserver/hyperscan_tools.cxx
@@ -140,7 +140,7 @@ public:
 
 		auto mut_fname = std::string{fname};
 		std::size_t sz;
-		rspamd_http_normalize_path_inplace(mut_fname.data(), mut_fname.size(), &sz);
+		rspamd_normalize_path_inplace(mut_fname.data(), mut_fname.size(), &sz);
 		mut_fname.resize(sz);
 		auto dir = hs_known_files_cache::get_dir(mut_fname);
 		auto ext =  hs_known_files_cache::get_extension(mut_fname);
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 805e3d65d..7be9d020a 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2439,8 +2439,8 @@ rspamd_url_parse (struct rspamd_url *uri,
 
 		rspamd_url_shift (uri, unquoted_len, UF_PATH);
 		/* We now normalize path */
-		rspamd_http_normalize_path_inplace (rspamd_url_data_unsafe (uri),
-				uri->datalen, &unquoted_len);
+		rspamd_normalize_path_inplace(rspamd_url_data_unsafe (uri),
+			uri->datalen, &unquoted_len);
 		rspamd_url_shift (uri, unquoted_len, UF_PATH);
 	}
 
diff --git a/src/libutil/util.c b/src/libutil/util.c
index 547669536..bc62bb919 100644
--- a/src/libutil/util.c
+++ b/src/libutil/util.c
@@ -2471,3 +2471,227 @@ rspamd_sum_floats (float *ar, gsize *nelts)
 	*nelts = cnt;
 	return sum;
 }
+
+void
+rspamd_normalize_path_inplace (gchar *path, guint len, gsize *nlen)
+{
+	const gchar *p, *end, *slash = NULL, *dot = NULL;
+	gchar *o;
+	enum {
+		st_normal = 0, /* copying ordinary bytes */
+		st_got_dot, /* saw '.', not copied yet */
+		st_got_dot_dot, /* saw "..", not copied yet */
+		st_got_slash, /* saw '/', not written yet */
+		st_got_slash_slash, /* in a run of '/', one '/' already written */
+	} state = st_normal;
+	/* Rewrite path[0..len) in place: p reads the input, o writes the output */
+	p = path;
+	end = path + len;
+	o = path;
+
+	while (p < end) {
+		switch (state) {
+		case st_normal:
+			if (G_UNLIKELY (*p == '/')) {
+				state = st_got_slash;
+				slash = p; /* remembered, written later if needed */
+			}
+			else if (G_UNLIKELY (*p == '.')) {
+				state = st_got_dot;
+				dot = p; /* remembered, copied later if it is part of a name */
+			}
+			else {
+				*o++ = *p;
+			}
+			p ++;
+			break;
+		case st_got_slash:
+			if (G_UNLIKELY (*p == '/')) {
+				/* Second '/' in a row: write one '/' and skip the rest of the run */
+				*o++ = *p;
+				state = st_got_slash_slash;
+			}
+			else if (G_UNLIKELY (*p == '.')) {
+				dot = p;
+				state = st_got_dot;
+			}
+			else {
+				*o++ = '/'; /* flush the pending '/' before the ordinary byte */
+				*o++ = *p;
+				slash = NULL;
+				dot = NULL;
+				state = st_normal;
+			}
+			p ++;
+			break;
+		case st_got_slash_slash:
+			if (G_LIKELY (*p != '/')) {
+				slash = p - 1;
+				dot = NULL;
+				state = st_normal;
+				continue; /* reprocess this byte in st_normal */
+			}
+			p ++;
+			break;
+		case st_got_dot:
+			if (G_UNLIKELY (*p == '/')) {
+				/* "./" or "/./": drop the dot, keep at most one '/' */
+				if (((o > path && *(o - 1) != '/') || (o == path)) && slash) {
+					/* Preserve one slash */
+					*o++ = '/';
+				}
+
+				slash = p;
+				dot = NULL;
+				/* This '/' is only remembered in slash, it is not written here */
+				state = st_normal;
+			}
+			else if (*p == '.') {
+				/* Double dot character */
+				state = st_got_dot_dot;
+			}
+			else {
+				/* ".some" or "/.some": an ordinary name, copy the skipped bytes */
+				if (dot && p > dot) {
+					if (slash == dot - 1 && (o > path && *(o - 1) != '/')) {
+						/* Copy starting at the '/' right before the dot */
+						memmove (o, slash, p - slash);
+						o += p - slash;
+					}
+					else { /* NOTE(review): also taken when o == path, so "/.x" appears to lose its '/' - confirm intended */
+						memmove (o, dot, p - dot);
+						o += p - dot;
+					}
+				}
+
+				slash = NULL;
+				dot = NULL;
+				state = st_normal;
+				continue; /* reprocess this byte in st_normal */
+			}
+
+			p ++;
+			break;
+		case st_got_dot_dot:
+			if (*p == '/') {
+				/* Input looks like "/../" or "../" */
+				if (slash) {
+					/* Drop the last written component: look for the previous '/' in the output */
+					if (o > path + 2 && *(o - 1) == '/') {
+						slash = rspamd_memrchr (path, '/', o - path - 2); /* presumably last '/' within that length - confirm */
+					}
+					else if (o > path + 1) {
+						slash = rspamd_memrchr (path, '/', o - path - 1);
+					}
+					else {
+						slash = NULL;
+					}
+
+					if (slash) {
+						o = (gchar *)slash;
+					}
+					/* If no earlier '/' was found, the output is left as it is */
+					slash = p;
+					state = st_got_slash;
+				}
+				else {
+					/* No '/' before the dots (e.g. "bla../"): copy them unchanged if output is non-empty */
+					if (o > path && dot && p > dot) {
+						memmove (o, dot, p - dot);
+						o += p - dot;
+					}
+
+					slash = NULL;
+					dot = NULL;
+					state = st_normal;
+					continue; /* reprocess the '/' in st_normal */
+				}
+			}
+			else {
+				/* "..bla" or "...": not a parent reference, emit the pending '/' and the dots */
+				if (slash) {
+					*o ++ = '/';
+				}
+
+				if (dot && p > dot) {
+					memmove (o, dot, p - dot);
+					o += p - dot;
+				}
+
+				slash = NULL;
+				dot = NULL;
+				state = st_normal;
+				continue; /* reprocess this byte in st_normal */
+			}
+
+			p ++;
+			break;
+		}
+	}
+
+	/* Input ended: flush whatever the final state left pending */
+	switch (state) {
+	case st_got_dot_dot:
+		/* Input ended with ".." */
+		if (slash) {
+			/* Drop the last written component: look for the previous '/' in the output */
+			if (o > path + 2 && *(o - 1) == '/') {
+				slash = rspamd_memrchr (path, '/', o - path - 2);
+			}
+			else if (o > path + 1) {
+				slash = rspamd_memrchr (path, '/', o - path - 1);
+			}
+			else {
+				if (o == path) {
+					/* Nothing written yet: the result is just "/" */
+					*o++ = '/';
+				}
+
+				slash = NULL;
+			}
+
+			if (slash) {
+				/* Truncate the output at that '/' (the '/' itself is dropped) */
+				o = (gchar *)slash;
+			}
+		}
+		else {
+			/* Nothing written yet: the result is just "/" */
+			if (o == path) {
+				*o++ = '/';
+			}
+			else {
+				if (dot && p > dot) {
+					memmove (o, dot, p - dot);
+					o += p - dot;
+				}
+			}
+		}
+		break;
+	case st_got_dot:
+		if (slash) {
+			/* "/." at the end becomes "/" */
+			*o++ = '/';
+		}
+		else {
+			if (o > path) {
+				*o++ = '.'; /* trailing '.' is kept only after other output */
+			}
+		}
+		break;
+	case st_got_slash:
+		*o++ = '/'; /* write the pending trailing '/' */
+		break;
+	default: /* the disabled code below would strip a trailing '/' */
+#if 0
+		if (o > path + 1 && *(o - 1) == '/') {
+			o --;
+		}
+#endif
+		break;
+	}
+
+	if (nlen) { /* nlen is optional */
+		*nlen = (o - path); /* normalized length; no terminator is written */
+	}
+}
diff --git a/src/libutil/util.h b/src/libutil/util.h
index f9be15d28..f747bce5b 100644
--- a/src/libutil/util.h
+++ b/src/libutil/util.h
@@ -526,6 +526,15 @@ extern const struct rspamd_controller_pbkdf pbkdf_list[];
  */
 float rspamd_sum_floats (float *ar, gsize *nelts);
 
+/**
+ * Normalize a path in place, removing dot segments and collapsing repeated
+ * '/' symbols (cf. rfc3986#section-5.2); no terminator is written
+ * @param path buffer holding the path, it is rewritten in place
+ * @param len number of bytes of `path` to process
+ * @param nlen if not NULL, receives the length of the normalized path
+ */
+void rspamd_normalize_path_inplace (gchar *path, guint len, gsize *nlen);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/test/lua/unit/url.lua b/test/lua/unit/url.lua
index 2016cc6f4..46eeef277 100644
--- a/test/lua/unit/url.lua
+++ b/test/lua/unit/url.lua
@@ -10,7 +10,7 @@ context("URL check functions", function()
   local ffi = require("ffi")
 
   ffi.cdef[[
-  void rspamd_http_normalize_path_inplace(char *path, size_t len, size_t *nlen);
+  void rspamd_normalize_path_inplace(char *path, unsigned int len, size_t *nlen); /* len is guint in the C prototype */
   ]]
 
   test_helper.init_url_parser()


More information about the Commits mailing list