commit c85595e: [Project] Rework cleanup

Vsevolod Stakhov vsevolod at rspamd.com
Sun Oct 23 21:21:04 UTC 2022


Author: Vsevolod Stakhov
Date: 2022-10-23 21:37:38 +0100
URL: https://github.com/rspamd/rspamd/commit/c85595e6230a5563055eabd69135c8342d31b207

[Project] Rework cleanup

---
 src/libserver/hyperscan_tools.cxx | 115 +++++++++++++---------
 src/libserver/hyperscan_tools.h   |   5 +
 src/libserver/maps/map_helpers.c  | 194 ++++----------------------------------
 src/rspamd.c                      |   7 ++
 4 files changed, 98 insertions(+), 223 deletions(-)

diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx
index 3383915c3..6ec5f7c36 100644
--- a/src/libserver/hyperscan_tools.cxx
+++ b/src/libserver/hyperscan_tools.cxx
@@ -67,49 +67,9 @@ private:
 
 	virtual ~hs_known_files_cache() {
 		// Cleanup cache dir
-		/* We clean dir merely if we are running from the main process */
-		if (rspamd_current_worker == nullptr) {
-			auto cleanup_dir = [&](std::string_view dir) -> void {
-				for (const auto &ext : cache_extensions) {
-					glob_t globbuf;
-
-					auto glob_pattern = fmt::format("{}{}*.{}",
-						dir, G_DIR_SEPARATOR_S, ext);
-					memset(&globbuf, 0, sizeof(globbuf));
-
-					if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) {
-						for (auto i = 0; i < globbuf.gl_pathc; i++) {
-							const auto *path = globbuf.gl_pathv[i];
-							struct stat st;
-
-							if (stat(path, &st) == -1) {
-								msg_debug_hyperscan("cannot stat file %s: %s",
-									path, strerror(errno));
-								continue;
-							}
-
-							if (S_ISREG(st.st_mode)) {
-								if (!known_cached_files.contains(path)) {
-									msg_info_hyperscan("remove stale hyperscan file %s", path);
-									unlink(path);
-								}
-								else {
-									msg_debug_hyperscan("found known hyperscan file %s, size: %Hz",
-										path, st.st_size);
-								}
-							}
-						}
-					}
-
-					globfree(&globbuf);
-				}
-			};
-
-			for (const auto &dir: cache_dirs) {
-				cleanup_dir(dir);
-			}
-		}
+		cleanup_maybe();
 	}
+
 	/* Have to duplicate raii_file methods to use raw filenames */
 	static auto get_dir(std::string_view fname) -> std::string_view {
 		auto sep_pos = fname.rfind(G_DIR_SEPARATOR);
@@ -177,8 +137,13 @@ public:
 	}
 
 	void add_cached_file(const char *fname) {
-		auto dir = hs_known_files_cache::get_dir(fname);
-		auto ext =  hs_known_files_cache::get_extension(fname);
+
+		auto mut_fname = std::string{fname};
+		std::size_t sz;
+		rspamd_http_normalize_path_inplace(mut_fname.data(), mut_fname.size(), &sz);
+		mut_fname.resize(sz);
+		auto dir = hs_known_files_cache::get_dir(mut_fname);
+		auto ext =  hs_known_files_cache::get_extension(mut_fname);
 
 		if (std::find_if(cache_dirs.begin(), cache_dirs.end(),
 			[&](const auto& item){ return item == dir; }) == std::end(cache_dirs)) {
@@ -189,10 +154,60 @@ public:
 			cache_extensions.emplace_back(std::string{ext});
 		}
 
-		auto is_known = known_cached_files.insert(fname);
+		auto is_known = known_cached_files.insert(mut_fname);
 		msg_debug_hyperscan("added %s known hyperscan file: %s",
 			is_known.second ? "new" : "already",
-			fname);
+			mut_fname.c_str());
+	}
+
+	auto cleanup_maybe() -> void {
+		/* We clean dir merely if we are running from the main process */
+		if (rspamd_current_worker == nullptr) {
+			auto cleanup_dir = [&](std::string_view dir) -> void {
+				for (const auto &ext : cache_extensions) {
+					glob_t globbuf;
+
+					auto glob_pattern = fmt::format("{}{}*.{}",
+						dir, G_DIR_SEPARATOR_S, ext);
+					memset(&globbuf, 0, sizeof(globbuf));
+
+					if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) {
+						for (auto i = 0; i < globbuf.gl_pathc; i++) {
+							const auto *path = globbuf.gl_pathv[i];
+							struct stat st;
+
+							if (stat(path, &st) == -1) {
+								msg_debug_hyperscan("cannot stat file %s: %s",
+									path, strerror(errno));
+								continue;
+							}
+
+							if (S_ISREG(st.st_mode)) {
+								if (!known_cached_files.contains(path)) {
+									msg_info_hyperscan("remove stale hyperscan file %s", path);
+									unlink(path);
+								}
+								else {
+									msg_debug_hyperscan("found known hyperscan file %s, size: %Hz",
+										path, st.st_size);
+								}
+							}
+						}
+					}
+
+					globfree(&globbuf);
+				}
+			};
+
+			for (const auto &dir: cache_dirs) {
+				msg_debug_hyperscan("cleaning up directory %s", dir.c_str());
+				cleanup_dir(dir);
+			}
+
+			cache_dirs.clear();
+			cache_extensions.clear();
+			known_cached_files.clear();
+		}
 	}
 };
 
@@ -333,7 +348,6 @@ auto load_cached_hs_file(const char *fname) -> tl::expected<hs_shared_database,
 			if (unserialized_file.has_value()) {
 
 				auto &unserialized_checked = unserialized_file.value();
-				hs_cache.add_cached_file(unserialized_checked);
 
 				if (unserialized_checked.get_size() == 0) {
 					/*
@@ -344,6 +358,7 @@ auto load_cached_hs_file(const char *fname) -> tl::expected<hs_shared_database,
 					return hs_shared_from_serialized(std::forward<T>(cached_serialized));
 				}
 				else {
+					hs_cache.add_cached_file(unserialized_checked);
 					return raii_mmaped_file::mmap_shared(std::move(unserialized_checked), PROT_READ)
 						.and_then([&]<class U>(U &&mmapped_unserialized) -> auto {
 							return hs_shared_from_unserialized(std::forward<U>(mmapped_unserialized));
@@ -444,4 +459,10 @@ rspamd_hyperscan_notice_known(const char *fname)
 	}
 }
 
+void
+rspamd_hyperscan_cleanup_maybe(void)
+{
+	rspamd::util::hs_known_files_cache::get().cleanup_maybe();
+}
+
 #endif // WITH_HYPERSCAN
\ No newline at end of file
diff --git a/src/libserver/hyperscan_tools.h b/src/libserver/hyperscan_tools.h
index 50ca51543..5d50e07ec 100644
--- a/src/libserver/hyperscan_tools.h
+++ b/src/libserver/hyperscan_tools.h
@@ -60,6 +60,11 @@ void rspamd_hyperscan_free(rspamd_hyperscan_t *db);
  */
 void rspamd_hyperscan_notice_known(const char *fname);
 
+/**
+ * Cleans up old files. This method should be called on config free (in the main process)
+ */
+void rspamd_hyperscan_cleanup_maybe(void);
+
 G_END_DECLS
 
 #endif
diff --git a/src/libserver/maps/map_helpers.c b/src/libserver/maps/map_helpers.c
index 6381e6d51..8850d052c 100644
--- a/src/libserver/maps/map_helpers.c
+++ b/src/libserver/maps/map_helpers.c
@@ -26,6 +26,7 @@
 
 #ifdef WITH_HYPERSCAN
 #include "hs.h"
+#include "hyperscan_tools.h"
 #endif
 #ifndef WITH_PCRE2
 #include <pcre.h>
@@ -82,7 +83,7 @@ struct rspamd_regexp_map_helper {
 	khash_t(rspamd_map_hash) *htb;
 	enum rspamd_regexp_map_flags map_flags;
 #ifdef WITH_HYPERSCAN
-	hs_database_t *hs_db;
+	rspamd_hyperscan_t *hs_db;
 	hs_scratch_t *hs_scratch;
 	gchar **patterns;
 	gint *flags;
@@ -883,7 +884,7 @@ rspamd_map_helper_destroy_regexp (struct rspamd_regexp_map_helper *re_map)
 		hs_free_scratch (re_map->hs_scratch);
 	}
 	if (re_map->hs_db) {
-		hs_free_database (re_map->hs_db);
+		rspamd_hyperscan_free(re_map->hs_db);
 	}
 	if (re_map->patterns) {
 		for (i = 0; i < re_map->regexps->len; i ++) {
@@ -1055,112 +1056,11 @@ rspamd_radix_dtor (struct map_cb_data *data)
 }
 
 #ifdef WITH_HYPERSCAN
-struct rspamd_re_maps_cache_dtor_cbdata {
-	struct rspamd_config *cfg;
-	GHashTable *valid_re_hashes;
-	gchar *dirname;
-};
-
-static void
-rspamd_re_maps_cache_cleanup_dtor (gpointer ud)
-{
-	struct rspamd_re_maps_cache_dtor_cbdata *cbd =
-			(struct rspamd_re_maps_cache_dtor_cbdata *)ud;
-	GPtrArray *cache_files;
-	GError *err = NULL;
-	struct rspamd_config *cfg;
-
-	cfg = cbd->cfg;
-
-	if (cfg->cur_worker != NULL) {
-		/* Skip dtor, limit it to main process only */
-		return;
-	}
-
-	cache_files = rspamd_glob_path (cbd->dirname, "*.hsmc", FALSE, &err);
-
-	if (!cache_files) {
-		msg_err_config ("cannot glob files in %s: %e", cbd->dirname, err);
-		g_error_free (err);
-	}
-	else {
-		const gchar *fname;
-		guint i;
-
-		PTR_ARRAY_FOREACH (cache_files, i, fname) {
-			gchar *basename = g_path_get_basename (fname);
-
-			if (g_hash_table_lookup (cbd->valid_re_hashes, basename) == NULL) {
-				gchar *dir;
-
-				dir = g_path_get_dirname (fname);
-
-				/* Sanity check to avoid removal of something bad */
-				if (strcmp (dir, cbd->dirname) != 0) {
-					msg_err_config ("bogus file found: %s in %s, skip deleting",
-							fname, dir);
-				}
-				else {
-					if (unlink (fname) == -1) {
-						msg_err_config ("cannot delete obsolete file %s in %s: %s",
-								fname, dir, strerror (errno));
-					}
-					else {
-						msg_info_config ("deleted obsolete file %s in %s",
-								fname, dir);
-					}
-				}
-
-				g_free (dir);
-			}
-			else {
-				msg_debug_config ("valid re cache file %s", fname);
-			}
-
-			g_free (basename);
-		}
-
-		g_ptr_array_free (cache_files, TRUE);
-	}
-
-	g_hash_table_unref (cbd->valid_re_hashes);
-	g_free (cbd->dirname);
-}
-
-static void
-rspamd_re_map_cache_update (const gchar *fname, struct rspamd_config *cfg)
-{
-	GHashTable *valid_re_hashes;
-
-	valid_re_hashes = rspamd_mempool_get_variable (cfg->cfg_pool,
-			RSPAMD_MEMPOOL_RE_MAPS_CACHE);
-
-	if (!valid_re_hashes) {
-		valid_re_hashes = g_hash_table_new_full (g_str_hash, g_str_equal,
-				g_free, NULL);
-		rspamd_mempool_set_variable (cfg->cfg_pool,
-				RSPAMD_MEMPOOL_RE_MAPS_CACHE,
-				valid_re_hashes, (rspamd_mempool_destruct_t)g_hash_table_unref);
-
-		/* We also add a cleanup dtor for all hashes */
-		static struct rspamd_re_maps_cache_dtor_cbdata cbd;
-
-		cbd.valid_re_hashes = g_hash_table_ref (valid_re_hashes);
-		cbd.cfg = cfg;
-		cbd.dirname = g_path_get_dirname (fname);
-		rspamd_mempool_add_destructor (cfg->cfg_pool,
-				rspamd_re_maps_cache_cleanup_dtor, &cbd);
-	}
-
-	g_hash_table_insert (valid_re_hashes, g_path_get_basename (fname), "1");
-}
 
 static gboolean
 rspamd_try_load_re_map_cache (struct rspamd_regexp_map_helper *re_map)
 {
 	gchar fp[PATH_MAX];
-	gpointer data;
-	gsize len;
 	struct rspamd_map *map;
 
 	map = re_map->map;
@@ -1173,25 +1073,9 @@ rspamd_try_load_re_map_cache (struct rspamd_regexp_map_helper *re_map)
 			map->cfg->hs_cache_dir,
 			(gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest);
 
-	if ((data = rspamd_file_xmap (fp, PROT_READ, &len, TRUE)) != NULL) {
-		if (hs_deserialize_database (data, len, &re_map->hs_db) == HS_SUCCESS) {
-			rspamd_re_map_cache_update (fp, map->cfg);
-			munmap (data, len);
-
-			msg_info_map ("loaded hypersan cache from %s (%Hz length) for %s",
-					fp, len, map->name);
-
-			return TRUE;
-		}
-
-		msg_info_map ("invalid hypersan cache in %s (%Hz length) for %s, removing file",
-				fp, len, map->name);
-		munmap (data, len);
-		/* Remove stale file */
-		(void)unlink (fp);
-	}
+	re_map->hs_db = rspamd_hyperscan_maybe_load(fp);
 
-	return FALSE;
+	return re_map->hs_db != NULL;
 }
 
 static gboolean
@@ -1214,7 +1098,7 @@ rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map)
 			(gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest);
 
 	if ((fd = rspamd_file_xopen (fp, O_WRONLY | O_CREAT | O_EXCL, 00644, 0)) != -1) {
-		if (hs_serialize_database (re_map->hs_db, &bytes, &len) == HS_SUCCESS) {
+		if (hs_serialize_database (rspamd_hyperscan_get_database(re_map->hs_db), &bytes, &len) == HS_SUCCESS) {
 			if (write (fd, bytes, len) == -1) {
 				msg_warn_map ("cannot write hyperscan cache to %s: %s",
 						fp, strerror (errno));
@@ -1237,8 +1121,7 @@ rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map)
 				else {
 					msg_info_map ("written cached hyperscan data for %s to %s (%Hz length)",
 							map->name, np, len);
-
-					rspamd_re_map_cache_update (np, map->cfg);
+					rspamd_hyperscan_notice_known(np);
 				}
 			}
 		}
@@ -1255,43 +1138,6 @@ rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map)
 	return FALSE;
 }
 
-static gboolean
-rspamd_re_map_cache_cleanup_old (struct rspamd_regexp_map_helper *old_re_map)
-{
-	gchar fp[PATH_MAX];
-	struct rspamd_map *map;
-	gboolean ret = TRUE;
-
-	map = old_re_map->map;
-
-	if (!map->cfg->hs_cache_dir) {
-		return FALSE;
-	}
-
-	rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmc",
-			map->cfg->hs_cache_dir,
-			(gint)rspamd_cryptobox_HASHBYTES / 2, old_re_map->re_digest);
-
-	msg_info_map ("unlink stale cache file for %s: %s", map->name, fp);
-
-	if (unlink (fp) == -1) {
-		msg_warn_map ("cannot unlink stale cache file for %s (%s): %s",
-				map->name, fp, strerror (errno));
-		ret = FALSE;
-	}
-
-	GHashTable *valid_re_hashes;
-
-	valid_re_hashes = rspamd_mempool_get_variable (map->cfg->cfg_pool,
-			RSPAMD_MEMPOOL_RE_MAPS_CACHE);
-
-	if (valid_re_hashes) {
-		g_hash_table_remove (valid_re_hashes, fp);
-	}
-
-	return ret;
-}
-
 #endif
 
 static void
@@ -1376,6 +1222,7 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map)
 
 		if (!rspamd_try_load_re_map_cache (re_map)) {
 			gdouble ts1 = rspamd_get_ticks (FALSE);
+			hs_database_t *hs_db = NULL;
 
 			if (hs_compile_multi ((const gchar **) re_map->patterns,
 					re_map->flags,
@@ -1383,7 +1230,7 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map)
 					re_map->regexps->len,
 					HS_MODE_BLOCK,
 					&plt,
-					&re_map->hs_db,
+					&hs_db,
 					&err) != HS_SUCCESS) {
 
 				msg_err_map ("cannot create tree of regexp when processing '%s': %s",
@@ -1396,6 +1243,8 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map)
 				return;
 			}
 
+			re_map->hs_db = rspamd_hyperscan_from_raw_db(hs_db);
+
 			ts1 = (rspamd_get_ticks (FALSE) - ts1) * 1000.0;
 			msg_info_map ("hyperscan compiled %d regular expressions from %s in %.1f ms",
 					re_map->regexps->len, re_map->map->name, ts1);
@@ -1406,9 +1255,9 @@ rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map)
 					re_map->regexps->len, re_map->map->name);
 		}
 
-		if (hs_alloc_scratch (re_map->hs_db, &re_map->hs_scratch) != HS_SUCCESS) {
+		if (hs_alloc_scratch (rspamd_hyperscan_get_database(re_map->hs_db), &re_map->hs_scratch) != HS_SUCCESS) {
 			msg_err_map ("cannot allocate scratch space for hyperscan");
-			hs_free_database (re_map->hs_db);
+			rspamd_hyperscan_free(re_map->hs_db);
 			re_map->hs_db = NULL;
 		}
 	}
@@ -1547,15 +1396,6 @@ rspamd_regexp_list_fin (struct map_cb_data *data, void **target)
 
 		if (data->prev_data) {
 			old_re_map = data->prev_data;
-
-#ifdef WITH_HYPERSCAN
-			if (re_map && memcmp(re_map->re_digest, old_re_map->re_digest,
-					sizeof(re_map->re_digest)) != 0) {
-				/* Cleanup old stuff */
-				rspamd_re_map_cache_cleanup_old(old_re_map);
-			}
-#endif
-
 			rspamd_map_helper_destroy_regexp(old_re_map);
 		}
 	}
@@ -1614,8 +1454,9 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map,
 
 		if (validated) {
 
-			res = hs_scan (map->hs_db, in, len, 0, map->hs_scratch,
-					rspamd_match_hs_single_handler, (void *)&i);
+			res = hs_scan (rspamd_hyperscan_get_database(map->hs_db), in, len, 0,
+				map->hs_scratch,
+				rspamd_match_hs_single_handler, (void *)&i);
 
 			if (res == HS_SCAN_TERMINATED) {
 				res = 1;
@@ -1711,7 +1552,8 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map,
 			cbd.ar = ret;
 			cbd.map = map;
 
-			if (hs_scan (map->hs_db, in, len, 0, map->hs_scratch,
+			if (hs_scan (rspamd_hyperscan_get_database(map->hs_db), in, len,
+					0, map->hs_scratch,
 					rspamd_match_hs_multiple_handler, &cbd) == HS_SUCCESS) {
 				res = 1;
 			}
diff --git a/src/rspamd.c b/src/rspamd.c
index 3779e7f8e..d8371de55 100644
--- a/src/rspamd.c
+++ b/src/rspamd.c
@@ -53,6 +53,10 @@
 #include "sqlite3.h"
 #include "contrib/libev/ev.h"
 
+#ifdef WITH_HYPERSCAN
+#include "libserver/hyperscan_tools.h"
+#endif
+
 /* 2 seconds to fork new process in place of dead one */
 #define SOFT_FORK_TIME 2
 
@@ -1643,6 +1647,9 @@ main (gint argc, gchar **argv, gchar **env)
 
 	msg_info_main ("terminating...");
 
+#ifdef WITH_HYPERSCAN
+	rspamd_hyperscan_cleanup_maybe();
+#endif
 	REF_RELEASE (rspamd_main->cfg);
 	rspamd_log_close (rspamd_main->logger);
 	g_hash_table_unref (rspamd_main->spairs);


More information about the Commits mailing list