commit c4a6b58: [Minor] Remove fuzzy_merge tool
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Aug 6 18:14:04 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-08-06 19:10:21 +0100
URL: https://github.com/rspamd/rspamd/commit/c4a6b5863a33efebec372cec921e6cde9b754bd6
[Minor] Remove fuzzy_merge tool
---
src/rspamadm/CMakeLists.txt | 1 -
src/rspamadm/commands.c | 2 -
src/rspamadm/fuzzy_merge.c | 589 --------------------------------------------
3 files changed, 592 deletions(-)
diff --git a/src/rspamadm/CMakeLists.txt b/src/rspamadm/CMakeLists.txt
index 3d4f2f490..925471619 100644
--- a/src/rspamadm/CMakeLists.txt
+++ b/src/rspamadm/CMakeLists.txt
@@ -3,7 +3,6 @@ SET(RSPAMADMSRC rspamadm.c
pw.c
configtest.c
fuzzy_convert.c
- fuzzy_merge.c
configdump.c
control.c
confighelp.c
diff --git a/src/rspamadm/commands.c b/src/rspamadm/commands.c
index 5b0b4bb5a..cf3143136 100644
--- a/src/rspamadm/commands.c
+++ b/src/rspamadm/commands.c
@@ -21,7 +21,6 @@
extern struct rspamadm_command pw_command;
extern struct rspamadm_command configtest_command;
-extern struct rspamadm_command fuzzy_merge_command;
extern struct rspamadm_command configdump_command;
extern struct rspamadm_command control_command;
extern struct rspamadm_command confighelp_command;
@@ -35,7 +34,6 @@ const struct rspamadm_command *commands[] = {
&help_command,
&pw_command,
&configtest_command,
- &fuzzy_merge_command,
&configdump_command,
&control_command,
&confighelp_command,
diff --git a/src/rspamadm/fuzzy_merge.c b/src/rspamadm/fuzzy_merge.c
deleted file mode 100644
index f5e6847fa..000000000
--- a/src/rspamadm/fuzzy_merge.c
+++ /dev/null
@@ -1,589 +0,0 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "config.h"
-#include "rspamadm.h"
-#include "logger.h"
-#include "sqlite_utils.h"
-
-static gchar *target = NULL;
-static gchar **sources = NULL;
-static gboolean quiet;
-
-static void rspamadm_fuzzy_merge (gint argc, gchar **argv,
- const struct rspamadm_command *cmd);
-static const char *rspamadm_fuzzy_merge_help (gboolean full_help,
- const struct rspamadm_command *cmd);
-
-struct rspamadm_command fuzzy_merge_command = {
- .name = "fuzzy_merge",
- .flags = 0,
- .help = rspamadm_fuzzy_merge_help,
- .run = rspamadm_fuzzy_merge,
- .lua_subrs = NULL,
-};
-
-static GOptionEntry entries[] = {
- {"source", 's', 0, G_OPTION_ARG_STRING_ARRAY, &sources,
- "Source for merge (can be repeated)", NULL},
- {"destination", 'd', 0, G_OPTION_ARG_STRING, &target,
- "Destination db", NULL},
- {"quiet", 'q', 0, G_OPTION_ARG_NONE, &quiet,
- "Suppress output", NULL},
- {NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL}
-};
-
-static const gchar *create_tables_sql =
- "BEGIN;"
- "CREATE TABLE digests("
- "id INTEGER PRIMARY KEY,"
- "flag INTEGER NOT NULL,"
- "digest TEXT NOT NULL,"
- "value INTEGER,"
- "time INTEGER);"
- "CREATE TABLE shingles("
- "value INTEGER NOT NULL,"
- "number INTEGER NOT NULL,"
- "digest_id INTEGER REFERENCES digests(id) ON DELETE CASCADE "
- "ON UPDATE CASCADE);"
- "CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);"
- "CREATE INDEX IF NOT EXISTS t ON digests(time);"
- "CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);"
- "COMMIT;";
-static const gchar *select_digests_sql =
- "SELECT * FROM digests;";
-static const gchar *select_shingles_sql =
- "SELECT * FROM shingles;";
-
-enum statement_idx {
- TRANSACTION_START = 0,
- TRANSACTION_COMMIT,
- TRANSACTION_ROLLBACK,
- INSERT,
- UPDATE,
- INSERT_SHINGLE,
- CHECK,
- CHECK_DIGEST_ID,
- COUNT,
- STMAX
-};
-
-static struct rspamd_sqlite3_prstmt prepared_stmts[STMAX] = {
- [TRANSACTION_START] = {
- .idx = TRANSACTION_START,
- .sql = "BEGIN IMMEDIATE TRANSACTION;",
- .args = "",
- .stmt = NULL,
- .result = SQLITE_DONE,
- .ret = ""
- },
- [TRANSACTION_COMMIT] = {
- .idx = TRANSACTION_COMMIT,
- .sql = "COMMIT;",
- .args = "",
- .stmt = NULL,
- .result = SQLITE_DONE,
- .ret = ""
- },
- [TRANSACTION_ROLLBACK] = {
- .idx = TRANSACTION_ROLLBACK,
- .sql = "ROLLBACK;",
- .args = "",
- .stmt = NULL,
- .result = SQLITE_DONE,
- .ret = ""
- },
- [INSERT] = {
- .idx = INSERT,
- .sql = "INSERT INTO digests(flag, digest, value, time) VALUES"
- "(?1, ?2, ?3, ?4);",
- .args = "SBII",
- .stmt = NULL,
- .result = SQLITE_DONE,
- .ret = ""
- },
- [INSERT_SHINGLE] = {
- .idx = INSERT_SHINGLE,
- .sql = "INSERT OR REPLACE INTO shingles(value, number, digest_id) "
- "VALUES (?1, ?2, ?3);",
- .args = "III",
- .stmt = NULL,
- .result = SQLITE_DONE,
- .ret = ""
- },
- [UPDATE] = {
- .idx = UPDATE,
- .sql = "UPDATE digests SET value=?1, time=?2 WHERE "
- "digest==?3;",
- .args = "IIB",
- .stmt = NULL,
- .result = SQLITE_DONE,
- .ret = ""
- },
- [CHECK] = {
- .idx = CHECK,
- .sql = "SELECT value, time, flag FROM digests WHERE digest==?1;",
- .args = "B",
- .stmt = NULL,
- .result = SQLITE_ROW,
- .ret = "III"
- },
- [CHECK_DIGEST_ID] = {
- .idx = CHECK_DIGEST_ID,
- .sql = "SELECT id FROM digests WHERE digest==?1",
- .args = "B",
- .stmt = NULL,
- .result = SQLITE_ROW,
- .ret = "I"
- },
- [COUNT] = {
- .idx = COUNT,
- .sql = "SELECT COUNT(*) FROM digests;",
- .args = "",
- .stmt = NULL,
- .result = SQLITE_ROW,
- .ret = "I"
- },
-};
-
-static const char *
-rspamadm_fuzzy_merge_help (gboolean full_help, const struct rspamadm_command *cmd)
-{
- const char *help_str;
-
- if (full_help) {
- help_str = "Merge multiple sources of fuzzy hashes db into a single destination\n\n"
- "Usage: rspamadm fuzzy_merge -s source1 [-s source2 ...] -d destination\n"
- "Where options are:\n\n"
- "-s: source db for merge\n"
- "-d: destination db for merge\n"
- "--help: shows available options and commands";
- }
- else {
- help_str = "Merge fuzzy databases";
- }
-
- return help_str;
-}
-
-enum op_type {
- OP_INSERT = 0,
- OP_UPDATE,
- OP_INSERT_SHINGLE,
-};
-struct fuzzy_merge_op {
- enum op_type op;
- guchar digest[64];
- union {
- struct {
- guint flag;
- gint64 value;
- gint64 tm;
- gint64 id;
- } dgst;
- struct {
- guint number;
- gint64 value;
- } shgl;
- } data;
-};
-
-static guint
-rspamadm_op_hash (gconstpointer p)
-{
- const struct fuzzy_merge_op *op = p;
- guint res;
-
- /* Uniformly distributed */
- memcpy (&res, op->digest, sizeof (res));
- return res;
-}
-
-static gboolean
-rspamadm_op_equal (gconstpointer a, gconstpointer b)
-{
- const struct fuzzy_merge_op *op1 = a, *op2 = b;
-
- return memcmp (op1->digest, op2->digest, sizeof (op1->digest)) == 0;
-}
-
-static void
-rspamadm_fuzzy_merge (gint argc, gchar **argv, const struct rspamadm_command *cmd)
-{
- GOptionContext *context;
- GError *error = NULL;
- sqlite3 *dest_db;
- GPtrArray *source_dbs;
- GArray *prstmt;
- GPtrArray *ops;
- GHashTable *unique_ops, *digests_id;
- rspamd_mempool_t *pool;
- guint i, nsrc;
- guint64 old_count, inserted = 0, updated = 0, shingles_inserted = 0;
- gint64 value, flag, tm, dig_id, src_value, src_flag;
- sqlite3 *src;
- sqlite3_stmt *stmt, *shgl_stmt;
- struct fuzzy_merge_op *nop, *op;
-
- context = g_option_context_new (
- "fuzzy_merge - merge fuzzy databases");
- g_option_context_set_summary (context,
- "Summary:\n Rspamd administration utility version "
- RVERSION
- "\n Release id: "
- RID);
- g_option_context_add_main_entries (context, entries, NULL);
-
- if (!g_option_context_parse (context, &argc, &argv, &error)) {
- rspamd_fprintf(stderr, "option parsing failed: %s\n", error->message);
- g_error_free (error);
- exit (1);
- }
-
- if (target == NULL || sources == NULL || sources[0] == NULL) {
- rspamd_fprintf(stderr, "no sources or no destination has been specified\n");
- exit (1);
- }
-
- pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), "fuzzy_merge");
- dest_db = rspamd_sqlite3_open_or_create (pool, target, create_tables_sql,
- 0, &error);
-
- if (dest_db == NULL) {
- rspamd_fprintf(stderr, "cannot open destination: %s\n", error->message);
- g_error_free (error);
- exit (1);
- }
-
- prstmt = rspamd_sqlite3_init_prstmt (dest_db, prepared_stmts,
- STMAX, &error);
-
- if (prstmt == NULL) {
- rspamd_fprintf(stderr, "cannot init prepared statements: %s\n", error->message);
- g_error_free (error);
- exit (1);
- }
-
- rspamd_sqlite3_run_prstmt (pool, dest_db, prstmt, COUNT, &old_count);
-
- nsrc = g_strv_length (sources);
- source_dbs = g_ptr_array_sized_new (nsrc);
- ops = g_ptr_array_new ();
- unique_ops = g_hash_table_new (rspamadm_op_hash, rspamadm_op_equal);
-
- for (i = 0; i < nsrc; i++) {
- src = rspamd_sqlite3_open_or_create (pool, sources[i], NULL, 0, &error);
-
- if (src == NULL) {
- rspamd_fprintf(stderr, "cannot open source %s: %s\n", sources[i],
- error->message);
- g_error_free (error);
- exit (1);
- }
-
- g_ptr_array_add (source_dbs, src);
- }
-
- for (i = 0; i < nsrc; i++) {
- const guchar *digest;
- guint64 nsrc_ops = 0, ndup_dst = 0, ndup_other = 0, nupdated = 0,
- nsrc_shingles = 0;
-
- src = g_ptr_array_index (source_dbs, i);
-
- if (!quiet) {
- rspamd_printf ("reading data from %s\n", sources[i]);
- }
-
- if (sqlite3_prepare_v2 (src, select_digests_sql, -1, &stmt, NULL) !=
- SQLITE_OK) {
- rspamd_fprintf(stderr, "cannot prepare statement %s: %s\n",
- select_digests_sql, sqlite3_errmsg (src));
- exit (1);
- }
-
- /* Temporary index for inserted IDs */
- digests_id = g_hash_table_new (g_int64_hash, g_int64_equal);
-
- while (sqlite3_step (stmt) == SQLITE_ROW) {
- /* id, flag, digest, value, time */
- digest = sqlite3_column_text (stmt, 2);
- src_value = sqlite3_column_int64 (stmt, 3);
- src_flag = sqlite3_column_int64 (stmt, 1);
-
- /* Now search for this digest in the destination */
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- CHECK,
- (gint64)sqlite3_column_bytes (stmt, 2), digest,
- &value, &tm, &flag) == SQLITE_OK) {
- /*
- * We compare values and if src value is bigger than
- * local one then we replace dest value with the src value
- */
- if (src_value > value && src_flag == flag) {
- nop = g_malloc0 (sizeof (*nop));
- nop->op = OP_UPDATE;
- memcpy (nop->digest, digest,
- sizeof (nop->digest));
- nop->data.dgst.flag = flag;
- /* Update time as well */
- nop->data.dgst.tm = sqlite3_column_int64 (stmt, 4);
- nop->data.dgst.id = sqlite3_column_int64 (stmt, 0);
-
- if ((op = g_hash_table_lookup (unique_ops, nop)) == NULL) {
- g_ptr_array_add (ops, nop);
- g_hash_table_insert (unique_ops, nop, nop);
- nupdated ++;
- }
- else {
- if (op->data.dgst.value < nop->data.dgst.value) {
- op->data.dgst.value = nop->data.dgst.value;
- op->data.dgst.tm = nop->data.dgst.tm;
- nupdated ++;
- }
- else {
- ndup_other ++;
- }
- g_free (nop);
- }
- }
- else {
- ndup_dst ++;
- }
- }
- else {
- /* Digest has not been found, but maybe we have the same in other
- * sources ?
- */
- nop = g_malloc0 (sizeof (*nop));
- nop->op = OP_INSERT;
- memcpy (nop->digest, digest,
- sizeof (nop->digest));
- nop->data.dgst.flag = src_flag;
- nop->data.dgst.value = src_value;
- /* Update time as well */
- nop->data.dgst.tm = sqlite3_column_int64 (stmt, 4);
- nop->data.dgst.id = sqlite3_column_int64 (stmt, 0);
-
- if ((op = g_hash_table_lookup (unique_ops, nop)) == NULL) {
- g_ptr_array_add (ops, nop);
- g_hash_table_insert (unique_ops, nop, nop);
- g_hash_table_insert (digests_id, &nop->data.dgst.id,
- nop);
- nsrc_ops ++;
- }
- else {
- if (op->data.dgst.value < nop->data.dgst.value) {
- op->data.dgst.value = nop->data.dgst.value;
- op->data.dgst.tm = nop->data.dgst.tm;
- op->data.dgst.tm = nop->data.dgst.tm;
- nupdated++;
- }
- else {
- ndup_other++;
- }
- g_free (nop);
- }
- }
- }
-
- /* We also need to scan all shingles and select those that
- * are to be inserted
- */
- if (sqlite3_prepare_v2 (src,
- select_shingles_sql,
- -1,
- &shgl_stmt,
- NULL) == SQLITE_OK) {
- sqlite3_bind_int64 (shgl_stmt,
- sqlite3_column_int64 (stmt, 0), 1);
-
- while (sqlite3_step (shgl_stmt) == SQLITE_ROW) {
- gint64 id = sqlite3_column_int64 (shgl_stmt, 2);
-
- if ((op = g_hash_table_lookup (digests_id, &id)) != NULL) {
- /* value, number, digest_id */
- nop = g_malloc0 (sizeof (*nop));
- nop->op = OP_INSERT_SHINGLE;
- memcpy (nop->digest, op->digest, sizeof (nop->digest));
- nop->data.shgl.number = sqlite3_column_int64 (shgl_stmt, 1);
- nop->data.shgl.value = sqlite3_column_int64 (shgl_stmt,
- 0);
- g_ptr_array_add (ops, nop);
- nsrc_shingles ++;
- }
- }
-
- sqlite3_finalize (shgl_stmt);
- }
- else {
- rspamd_fprintf (stderr, "cannot prepare statement %s: %s\n",
- select_shingles_sql, sqlite3_errmsg (src));
- exit (1);
- }
-
- if (!quiet) {
- rspamd_printf ("processed %s: %L new hashes, %L duplicate hashes (other sources), "
- "%L duplicate hashes (destination), %L hashes to update, "
- "%L shingles to insert\n\n",
- sources[i],
- nsrc_ops,
- ndup_other,
- ndup_dst,
- nupdated,
- nsrc_shingles);
- }
- /* Cleanup */
- g_hash_table_unref (digests_id);
- sqlite3_finalize (stmt);
- sqlite3_close (src);
- }
-
- if (!quiet) {
- rspamd_printf ("start writing to %s, %ud ops pending\n", target, ops->len);
- }
-
- /* Start transaction */
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- TRANSACTION_START) != SQLITE_OK) {
- rspamd_fprintf (stderr, "cannot start transaction in destination: %s\n",
- sqlite3_errmsg (dest_db));
- exit (1);
- }
-
- /* Now all ops are inside ops array, so we just iterate over it */
- for (i = 0; i < ops->len; i ++) {
- op = g_ptr_array_index (ops, i);
-
- switch (op->op) {
- case OP_INSERT:
- /* flag, digest, value, time */
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- INSERT,
- (gint64)op->data.dgst.flag,
- (gint64)sizeof (op->digest), op->digest,
- op->data.dgst.value,
- op->data.dgst.tm) != SQLITE_OK) {
- rspamd_fprintf(stderr, "cannot insert digest: %s\n",
- sqlite3_errmsg (dest_db));
- goto err;
- }
-
- inserted ++;
- break;
- case OP_UPDATE:
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- UPDATE,
- (gint64) op->data.dgst.value,
- op->data.dgst.tm,
- (gint64) sizeof (op->digest),
- op->digest) != SQLITE_OK) {
- rspamd_fprintf(stderr, "cannot update digest: %s\n",
- sqlite3_errmsg (dest_db));
- goto err;
- }
-
- updated ++;
- break;
- case OP_INSERT_SHINGLE:
- /* First select the appropriate digest */
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- CHECK_DIGEST_ID,
- (gint64) sizeof (op->digest),
- op->digest,
- &dig_id) == SQLITE_OK) {
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- INSERT_SHINGLE,
- (gint64)op->data.shgl.value,
- (gint64)op->data.shgl.number,
- dig_id) != SQLITE_OK) {
- rspamd_fprintf(stderr, "cannot insert shingle: %s\n",
- sqlite3_errmsg (dest_db));
- goto err;
- }
-
- shingles_inserted ++;
- }
- else {
- msg_warn_pool ("cannot find digest id for shingle");
- }
-
- break;
- }
- }
-
- /* Normal closing */
- if (rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- TRANSACTION_COMMIT) != SQLITE_OK) {
- rspamd_fprintf (stderr, "cannot commit transaction: %s\n",
- sqlite3_errmsg (dest_db));
- goto err;
- }
-
- rspamd_sqlite3_close_prstmt (dest_db, prstmt);
- sqlite3_close (dest_db);
- for (i = 0; i < ops->len; i++) {
- op = g_ptr_array_index (ops, i);
- g_free (op);
- }
- g_ptr_array_free (ops, TRUE);
- rspamd_mempool_delete (pool);
-
- if (!quiet) {
- rspamd_printf ("Successfully merged data into %s\n%L hashes added, "
- "%L hashes updated, %L shingles inserted\nhashes count before update: "
- "%L\nhashes count after update: %L\n",
- target,
- inserted, updated, shingles_inserted,
- old_count, old_count + inserted);
- }
-
- exit (EXIT_SUCCESS);
-
-err:
- rspamd_sqlite3_run_prstmt (pool,
- dest_db,
- prstmt,
- TRANSACTION_ROLLBACK);
- rspamd_sqlite3_close_prstmt (dest_db, prstmt);
- sqlite3_close (dest_db);
- for (i = 0; i < ops->len; i++) {
- op = g_ptr_array_index (ops, i);
- g_free (op);
- }
- g_ptr_array_free (ops, TRUE);
- rspamd_mempool_delete (pool);
-
-
- if (!quiet) {
- rspamd_printf ("Merge failed, rolled back\n");
- }
-
- exit (EXIT_FAILURE);
-}
More information about the Commits
mailing list