commit c4a6b58: [Minor] Remove fuzzy_merge tool

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Aug 6 18:14:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-08-06 19:10:21 +0100
URL: https://github.com/rspamd/rspamd/commit/c4a6b5863a33efebec372cec921e6cde9b754bd6

[Minor] Remove fuzzy_merge tool

---
 src/rspamadm/CMakeLists.txt |   1 -
 src/rspamadm/commands.c     |   2 -
 src/rspamadm/fuzzy_merge.c  | 589 --------------------------------------------
 3 files changed, 592 deletions(-)

diff --git a/src/rspamadm/CMakeLists.txt b/src/rspamadm/CMakeLists.txt
index 3d4f2f490..925471619 100644
--- a/src/rspamadm/CMakeLists.txt
+++ b/src/rspamadm/CMakeLists.txt
@@ -3,7 +3,6 @@ SET(RSPAMADMSRC rspamadm.c
         pw.c
         configtest.c
         fuzzy_convert.c
-        fuzzy_merge.c
         configdump.c
         control.c
         confighelp.c
diff --git a/src/rspamadm/commands.c b/src/rspamadm/commands.c
index 5b0b4bb5a..cf3143136 100644
--- a/src/rspamadm/commands.c
+++ b/src/rspamadm/commands.c
@@ -21,7 +21,6 @@
 
 extern struct rspamadm_command pw_command;
 extern struct rspamadm_command configtest_command;
-extern struct rspamadm_command fuzzy_merge_command;
 extern struct rspamadm_command configdump_command;
 extern struct rspamadm_command control_command;
 extern struct rspamadm_command confighelp_command;
@@ -35,7 +34,6 @@ const struct rspamadm_command *commands[] = {
 	&help_command,
 	&pw_command,
 	&configtest_command,
-	&fuzzy_merge_command,
 	&configdump_command,
 	&control_command,
 	&confighelp_command,
diff --git a/src/rspamadm/fuzzy_merge.c b/src/rspamadm/fuzzy_merge.c
deleted file mode 100644
index f5e6847fa..000000000
--- a/src/rspamadm/fuzzy_merge.c
+++ /dev/null
@@ -1,589 +0,0 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "config.h"
-#include "rspamadm.h"
-#include "logger.h"
-#include "sqlite_utils.h"
-
-static gchar *target = NULL;
-static gchar **sources = NULL;
-static gboolean quiet;
-
-static void rspamadm_fuzzy_merge (gint argc, gchar **argv,
-								  const struct rspamadm_command *cmd);
-static const char *rspamadm_fuzzy_merge_help (gboolean full_help,
-											  const struct rspamadm_command *cmd);
-
-struct rspamadm_command fuzzy_merge_command = {
-		.name = "fuzzy_merge",
-		.flags = 0,
-		.help = rspamadm_fuzzy_merge_help,
-		.run = rspamadm_fuzzy_merge,
-		.lua_subrs = NULL,
-};
-
-static GOptionEntry entries[] = {
-		{"source", 's', 0, G_OPTION_ARG_STRING_ARRAY, &sources,
-				"Source for merge (can be repeated)",                    NULL},
-		{"destination", 'd', 0, G_OPTION_ARG_STRING, &target,
-				"Destination db",     NULL},
-		{"quiet", 'q', 0, G_OPTION_ARG_NONE, &quiet,
-				"Suppress output", NULL},
-		{NULL,  0,   0, G_OPTION_ARG_NONE, NULL, NULL, NULL}
-};
-
-static const gchar *create_tables_sql =
-				"BEGIN;"
-				"CREATE TABLE digests("
-				"id INTEGER PRIMARY KEY,"
-				"flag INTEGER NOT NULL,"
-				"digest TEXT NOT NULL,"
-				"value INTEGER,"
-				"time INTEGER);"
-				"CREATE TABLE shingles("
-				"value INTEGER NOT NULL,"
-				"number INTEGER NOT NULL,"
-				"digest_id INTEGER REFERENCES digests(id) ON DELETE CASCADE "
-				"ON UPDATE CASCADE);"
-				"CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);"
-				"CREATE INDEX IF NOT EXISTS t ON digests(time);"
-				"CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);"
-				"COMMIT;";
-static const gchar *select_digests_sql =
-				"SELECT * FROM digests;";
-static const gchar *select_shingles_sql =
-				"SELECT * FROM shingles;";
-
-enum statement_idx {
-	TRANSACTION_START = 0,
-	TRANSACTION_COMMIT,
-	TRANSACTION_ROLLBACK,
-	INSERT,
-	UPDATE,
-	INSERT_SHINGLE,
-	CHECK,
-	CHECK_DIGEST_ID,
-	COUNT,
-	STMAX
-};
-
-static struct rspamd_sqlite3_prstmt prepared_stmts[STMAX] = {
-		[TRANSACTION_START] = {
-				.idx = TRANSACTION_START,
-				.sql = "BEGIN IMMEDIATE TRANSACTION;",
-				.args = "",
-				.stmt = NULL,
-				.result = SQLITE_DONE,
-				.ret = ""
-		},
-		[TRANSACTION_COMMIT] = {
-				.idx = TRANSACTION_COMMIT,
-				.sql = "COMMIT;",
-				.args = "",
-				.stmt = NULL,
-				.result = SQLITE_DONE,
-				.ret = ""
-		},
-		[TRANSACTION_ROLLBACK] = {
-				.idx = TRANSACTION_ROLLBACK,
-				.sql = "ROLLBACK;",
-				.args = "",
-				.stmt = NULL,
-				.result = SQLITE_DONE,
-				.ret = ""
-		},
-		[INSERT] = {
-				.idx = INSERT,
-				.sql = "INSERT INTO digests(flag, digest, value, time) VALUES"
-						"(?1, ?2, ?3, ?4);",
-				.args = "SBII",
-				.stmt = NULL,
-				.result = SQLITE_DONE,
-				.ret = ""
-		},
-		[INSERT_SHINGLE] = {
-				.idx = INSERT_SHINGLE,
-				.sql = "INSERT OR REPLACE INTO shingles(value, number, digest_id) "
-						"VALUES (?1, ?2, ?3);",
-				.args = "III",
-				.stmt = NULL,
-				.result = SQLITE_DONE,
-				.ret = ""
-		},
-		[UPDATE] = {
-				.idx = UPDATE,
-				.sql = "UPDATE digests SET value=?1, time=?2 WHERE "
-						"digest==?3;",
-				.args = "IIB",
-				.stmt = NULL,
-				.result = SQLITE_DONE,
-				.ret = ""
-		},
-		[CHECK] = {
-				.idx = CHECK,
-				.sql = "SELECT value, time, flag FROM digests WHERE digest==?1;",
-				.args = "B",
-				.stmt = NULL,
-				.result = SQLITE_ROW,
-				.ret = "III"
-		},
-		[CHECK_DIGEST_ID] = {
-				.idx = CHECK_DIGEST_ID,
-				.sql = "SELECT id FROM digests WHERE digest==?1",
-				.args = "B",
-				.stmt = NULL,
-				.result = SQLITE_ROW,
-				.ret = "I"
-		},
-		[COUNT] = {
-				.idx = COUNT,
-				.sql = "SELECT COUNT(*) FROM digests;",
-				.args = "",
-				.stmt = NULL,
-				.result = SQLITE_ROW,
-				.ret = "I"
-		},
-};
-
-static const char *
-rspamadm_fuzzy_merge_help (gboolean full_help, const struct rspamadm_command *cmd)
-{
-	const char *help_str;
-
-	if (full_help) {
-		help_str = "Merge multiple sources of fuzzy hashes db into a single destination\n\n"
-				"Usage: rspamadm fuzzy_merge -s source1 [-s source2 ...] -d destination\n"
-				"Where options are:\n\n"
-				"-s: source db for merge\n"
-				"-d: destination db for merge\n"
-				"--help: shows available options and commands";
-	}
-	else {
-		help_str = "Merge fuzzy databases";
-	}
-
-	return help_str;
-}
-
-enum op_type {
-	OP_INSERT = 0,
-	OP_UPDATE,
-	OP_INSERT_SHINGLE,
-};
-struct fuzzy_merge_op {
-	enum op_type op;
-	guchar digest[64];
-	union {
-		struct {
-			guint flag;
-			gint64 value;
-			gint64 tm;
-			gint64 id;
-		} dgst;
-		struct {
-			guint number;
-			gint64 value;
-		} shgl;
-	} data;
-};
-
-static guint
-rspamadm_op_hash (gconstpointer p)
-{
-	const struct fuzzy_merge_op *op = p;
-	guint res;
-
-	/* Uniformly distributed */
-	memcpy (&res, op->digest, sizeof (res));
-	return res;
-}
-
-static gboolean
-rspamadm_op_equal (gconstpointer a, gconstpointer b)
-{
-	const struct fuzzy_merge_op *op1 = a, *op2 = b;
-
-	return memcmp (op1->digest, op2->digest, sizeof (op1->digest)) == 0;
-}
-
-static void
-rspamadm_fuzzy_merge (gint argc, gchar **argv, const struct rspamadm_command *cmd)
-{
-	GOptionContext *context;
-	GError *error = NULL;
-	sqlite3 *dest_db;
-	GPtrArray *source_dbs;
-	GArray *prstmt;
-	GPtrArray *ops;
-	GHashTable *unique_ops, *digests_id;
-	rspamd_mempool_t *pool;
-	guint i, nsrc;
-	guint64 old_count, inserted = 0, updated = 0, shingles_inserted = 0;
-	gint64 value, flag, tm, dig_id, src_value, src_flag;
-	sqlite3 *src;
-	sqlite3_stmt *stmt, *shgl_stmt;
-	struct fuzzy_merge_op *nop, *op;
-
-	context = g_option_context_new (
-			"fuzzy_merge - merge fuzzy databases");
-	g_option_context_set_summary (context,
-			"Summary:\n  Rspamd administration utility version "
-					RVERSION
-					"\n  Release id: "
-					RID);
-	g_option_context_add_main_entries (context, entries, NULL);
-
-	if (!g_option_context_parse (context, &argc, &argv, &error)) {
-		rspamd_fprintf(stderr, "option parsing failed: %s\n", error->message);
-		g_error_free (error);
-		exit (1);
-	}
-
-	if (target == NULL || sources == NULL || sources[0] == NULL) {
-		rspamd_fprintf(stderr, "no sources or no destination has been specified\n");
-		exit (1);
-	}
-
-	pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), "fuzzy_merge");
-	dest_db = rspamd_sqlite3_open_or_create (pool, target, create_tables_sql,
-			0, &error);
-
-	if (dest_db == NULL) {
-		rspamd_fprintf(stderr, "cannot open destination: %s\n", error->message);
-		g_error_free (error);
-		exit (1);
-	}
-
-	prstmt = rspamd_sqlite3_init_prstmt (dest_db, prepared_stmts,
-			STMAX, &error);
-
-	if (prstmt == NULL) {
-		rspamd_fprintf(stderr, "cannot init prepared statements: %s\n", error->message);
-		g_error_free (error);
-		exit (1);
-	}
-
-	rspamd_sqlite3_run_prstmt (pool, dest_db, prstmt, COUNT, &old_count);
-
-	nsrc = g_strv_length (sources);
-	source_dbs = g_ptr_array_sized_new (nsrc);
-	ops = g_ptr_array_new ();
-	unique_ops = g_hash_table_new (rspamadm_op_hash, rspamadm_op_equal);
-
-	for (i = 0; i < nsrc; i++) {
-		src = rspamd_sqlite3_open_or_create (pool, sources[i], NULL, 0, &error);
-
-		if (src == NULL) {
-			rspamd_fprintf(stderr, "cannot open source %s: %s\n", sources[i],
-					error->message);
-			g_error_free (error);
-			exit (1);
-		}
-
-		g_ptr_array_add (source_dbs, src);
-	}
-
-	for (i = 0; i < nsrc; i++) {
-		const guchar *digest;
-		guint64 nsrc_ops = 0, ndup_dst = 0, ndup_other = 0, nupdated = 0,
-				nsrc_shingles = 0;
-
-		src = g_ptr_array_index (source_dbs, i);
-
-		if (!quiet) {
-			rspamd_printf ("reading data from %s\n", sources[i]);
-		}
-
-		if (sqlite3_prepare_v2 (src, select_digests_sql, -1, &stmt, NULL) !=
-					SQLITE_OK) {
-			rspamd_fprintf(stderr, "cannot prepare statement %s: %s\n",
-					select_digests_sql, sqlite3_errmsg (src));
-			exit (1);
-		}
-
-		/* Temporary index for inserted IDs */
-		digests_id = g_hash_table_new (g_int64_hash, g_int64_equal);
-
-		while (sqlite3_step (stmt) == SQLITE_ROW) {
-			/* id, flag, digest, value, time */
-			digest = sqlite3_column_text (stmt, 2);
-			src_value = sqlite3_column_int64 (stmt, 3);
-			src_flag = sqlite3_column_int64 (stmt, 1);
-
-			/* Now search for this digest in the destination */
-			if (rspamd_sqlite3_run_prstmt (pool,
-					dest_db,
-					prstmt,
-					CHECK,
-					(gint64)sqlite3_column_bytes (stmt, 2), digest,
-					&value, &tm, &flag) == SQLITE_OK) {
-				/*
-				 * We compare values and if src value is bigger than
-				 * local one then we replace dest value with the src value
-				 */
-				if (src_value > value && src_flag == flag) {
-					nop = g_malloc0 (sizeof (*nop));
-					nop->op = OP_UPDATE;
-					memcpy (nop->digest, digest,
-							sizeof (nop->digest));
-					nop->data.dgst.flag = flag;
-					/* Update time as well */
-					nop->data.dgst.tm = sqlite3_column_int64 (stmt, 4);
-					nop->data.dgst.id = sqlite3_column_int64 (stmt, 0);
-
-					if ((op = g_hash_table_lookup (unique_ops, nop)) == NULL) {
-						g_ptr_array_add (ops, nop);
-						g_hash_table_insert (unique_ops, nop, nop);
-						nupdated ++;
-					}
-					else {
-						if (op->data.dgst.value < nop->data.dgst.value) {
-							op->data.dgst.value = nop->data.dgst.value;
-							op->data.dgst.tm = nop->data.dgst.tm;
-							nupdated ++;
-						}
-						else {
-							ndup_other ++;
-						}
-						g_free (nop);
-					}
-				}
-				else {
-					ndup_dst ++;
-				}
-			}
-			else {
-				/* Digest has not been found, but maybe we have the same in other
-				 * sources ?
-				 */
-				nop = g_malloc0 (sizeof (*nop));
-				nop->op = OP_INSERT;
-				memcpy (nop->digest, digest,
-						sizeof (nop->digest));
-				nop->data.dgst.flag = src_flag;
-				nop->data.dgst.value = src_value;
-				/* Update time as well */
-				nop->data.dgst.tm = sqlite3_column_int64 (stmt, 4);
-				nop->data.dgst.id = sqlite3_column_int64 (stmt, 0);
-
-				if ((op = g_hash_table_lookup (unique_ops, nop)) == NULL) {
-					g_ptr_array_add (ops, nop);
-					g_hash_table_insert (unique_ops, nop, nop);
-					g_hash_table_insert (digests_id, &nop->data.dgst.id,
-							nop);
-					nsrc_ops ++;
-				}
-				else {
-					if (op->data.dgst.value < nop->data.dgst.value) {
-						op->data.dgst.value = nop->data.dgst.value;
-						op->data.dgst.tm = nop->data.dgst.tm;
-						op->data.dgst.tm = nop->data.dgst.tm;
-						nupdated++;
-					}
-					else {
-						ndup_other++;
-					}
-					g_free (nop);
-				}
-			}
-		}
-
-		/* We also need to scan all shingles and select those that
-		 * are to be inserted
-		 */
-		if (sqlite3_prepare_v2 (src,
-				select_shingles_sql,
-				-1,
-				&shgl_stmt,
-				NULL) == SQLITE_OK) {
-			sqlite3_bind_int64 (shgl_stmt,
-					sqlite3_column_int64 (stmt, 0), 1);
-
-			while (sqlite3_step (shgl_stmt) == SQLITE_ROW) {
-				gint64 id = sqlite3_column_int64 (shgl_stmt, 2);
-
-				if ((op = g_hash_table_lookup (digests_id, &id)) != NULL) {
-					/* value, number, digest_id */
-					nop = g_malloc0 (sizeof (*nop));
-					nop->op = OP_INSERT_SHINGLE;
-					memcpy (nop->digest, op->digest, sizeof (nop->digest));
-					nop->data.shgl.number = sqlite3_column_int64 (shgl_stmt, 1);
-					nop->data.shgl.value = sqlite3_column_int64 (shgl_stmt,
-							0);
-					g_ptr_array_add (ops, nop);
-					nsrc_shingles ++;
-				}
-			}
-
-			sqlite3_finalize (shgl_stmt);
-		}
-		else {
-			rspamd_fprintf (stderr, "cannot prepare statement %s: %s\n",
-					select_shingles_sql, sqlite3_errmsg (src));
-			exit (1);
-		}
-
-		if (!quiet) {
-			rspamd_printf ("processed %s: %L new hashes, %L duplicate hashes (other sources), "
-							"%L duplicate hashes (destination), %L hashes to update, "
-							"%L shingles to insert\n\n",
-					sources[i],
-					nsrc_ops,
-					ndup_other,
-					ndup_dst,
-					nupdated,
-					nsrc_shingles);
-		}
-		/* Cleanup */
-		g_hash_table_unref (digests_id);
-		sqlite3_finalize (stmt);
-		sqlite3_close (src);
-	}
-
-	if (!quiet) {
-		rspamd_printf ("start writing to %s, %ud ops pending\n", target, ops->len);
-	}
-
-	/* Start transaction */
-	if (rspamd_sqlite3_run_prstmt (pool,
-			dest_db,
-			prstmt,
-			TRANSACTION_START) != SQLITE_OK) {
-		rspamd_fprintf (stderr, "cannot start transaction in destination: %s\n",
-				sqlite3_errmsg (dest_db));
-		exit (1);
-	}
-
-	/* Now all ops are inside ops array, so we just iterate over it */
-	for (i = 0; i < ops->len; i ++) {
-		op = g_ptr_array_index (ops, i);
-
-		switch (op->op) {
-		case OP_INSERT:
-			/* flag, digest, value, time */
-			if (rspamd_sqlite3_run_prstmt (pool,
-					dest_db,
-					prstmt,
-					INSERT,
-					(gint64)op->data.dgst.flag,
-					(gint64)sizeof (op->digest), op->digest,
-					op->data.dgst.value,
-					op->data.dgst.tm) != SQLITE_OK) {
-				rspamd_fprintf(stderr, "cannot insert digest: %s\n",
-						sqlite3_errmsg (dest_db));
-				goto err;
-			}
-
-			inserted ++;
-			break;
-		case OP_UPDATE:
-			if (rspamd_sqlite3_run_prstmt (pool,
-					dest_db,
-					prstmt,
-					UPDATE,
-					(gint64) op->data.dgst.value,
-					op->data.dgst.tm,
-					(gint64) sizeof (op->digest),
-					op->digest) != SQLITE_OK) {
-				rspamd_fprintf(stderr, "cannot update digest: %s\n",
-						sqlite3_errmsg (dest_db));
-				goto err;
-			}
-
-			updated ++;
-			break;
-		case OP_INSERT_SHINGLE:
-			/* First select the appropriate digest */
-			if (rspamd_sqlite3_run_prstmt (pool,
-					dest_db,
-					prstmt,
-					CHECK_DIGEST_ID,
-					(gint64) sizeof (op->digest),
-					op->digest,
-					&dig_id) == SQLITE_OK) {
-				if (rspamd_sqlite3_run_prstmt (pool,
-						dest_db,
-						prstmt,
-						INSERT_SHINGLE,
-						(gint64)op->data.shgl.value,
-						(gint64)op->data.shgl.number,
-						dig_id) != SQLITE_OK) {
-					rspamd_fprintf(stderr, "cannot insert shingle: %s\n",
-							sqlite3_errmsg (dest_db));
-					goto err;
-				}
-
-				shingles_inserted ++;
-			}
-			else {
-				msg_warn_pool ("cannot find digest id for shingle");
-			}
-
-			break;
-		}
-	}
-
-	/* Normal closing */
-	if (rspamd_sqlite3_run_prstmt (pool,
-			dest_db,
-			prstmt,
-			TRANSACTION_COMMIT) != SQLITE_OK) {
-		rspamd_fprintf (stderr, "cannot commit transaction: %s\n",
-				sqlite3_errmsg (dest_db));
-		goto err;
-	}
-
-	rspamd_sqlite3_close_prstmt (dest_db, prstmt);
-	sqlite3_close (dest_db);
-	for (i = 0; i < ops->len; i++) {
-		op = g_ptr_array_index (ops, i);
-		g_free (op);
-	}
-	g_ptr_array_free (ops, TRUE);
-	rspamd_mempool_delete (pool);
-
-	if (!quiet) {
-		rspamd_printf ("Successfully merged data into %s\n%L hashes added, "
-				"%L hashes updated, %L shingles inserted\nhashes count before update: "
-				"%L\nhashes count after update: %L\n",
-				target,
-				inserted, updated, shingles_inserted,
-				old_count, old_count + inserted);
-	}
-
-	exit (EXIT_SUCCESS);
-
-err:
-	rspamd_sqlite3_run_prstmt (pool,
-		dest_db,
-		prstmt,
-		TRANSACTION_ROLLBACK);
-	rspamd_sqlite3_close_prstmt (dest_db, prstmt);
-	sqlite3_close (dest_db);
-	for (i = 0; i < ops->len; i++) {
-		op = g_ptr_array_index (ops, i);
-		g_free (op);
-	}
-	g_ptr_array_free (ops, TRUE);
-	rspamd_mempool_delete (pool);
-
-
-	if (!quiet) {
-		rspamd_printf ("Merge failed, rolled back\n");
-	}
-
-	exit (EXIT_FAILURE);
-}


More information about the Commits mailing list