commit 33cf745: [Rework] Use strings for int64_t
Vsevolod Stakhov
vsevolod at rspamd.com
Mon Jul 29 17:50:02 UTC 2024
Author: Vsevolod Stakhov
Date: 2023-12-08 09:33:57 +0000
URL: https://github.com/rspamd/rspamd/commit/33cf745fb1c772c57f45e14de15dc706ed5284d5
[Rework] Use strings for int64_t
It seems there is no easy way to use int64 in Redis Lua; hence, we have
to use strings. It's much more expensive, but it still has some advantage
over the previous schema.
---
lualib/redis_scripts/bayes_classify.lua | 6 +++---
lualib/redis_scripts/bayes_learn.lua | 4 ++--
.../redis_scripts/bayes_stat.lua | 0
src/libstat/backends/redis_backend.cxx | 20 +++++++++++---------
4 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/lualib/redis_scripts/bayes_classify.lua b/lualib/redis_scripts/bayes_classify.lua
index c999609e5..9bef96f14 100644
--- a/lualib/redis_scripts/bayes_classify.lua
+++ b/lualib/redis_scripts/bayes_classify.lua
@@ -1,10 +1,9 @@
-- Lua script to perform bayes classification
-- This script accepts the following parameters:
-- key1 - prefix for bayes tokens (e.g. for per-user classification)
--- key2 - set of tokens encoded in messagepack array of int64_t
+-- key2 - set of tokens encoded in messagepack array of strings
local prefix = KEYS[1]
-local input_tokens = cmsgpack.unpack(KEYS[2])
local output_spam = {}
local output_ham = {}
@@ -17,8 +16,9 @@ local prefix_underscore = prefix .. '_'
-- This optimisation will save a lot of space for sparse tokens, and in Bayes that assumption is normally held
if learned_ham > 0 and learned_spam > 0 then
+ local input_tokens = cmsgpack.unpack(KEYS[2])
for i, token in ipairs(input_tokens) do
- local token_data = redis.call('HMGET', prefix_underscore .. tostring(token), 'H', 'S')
+ local token_data = redis.call('HMGET', prefix_underscore .. token, 'H', 'S')
if token_data then
local ham_count = token_data[1]
diff --git a/lualib/redis_scripts/bayes_learn.lua b/lualib/redis_scripts/bayes_learn.lua
index 638254706..7536f6808 100644
--- a/lualib/redis_scripts/bayes_learn.lua
+++ b/lualib/redis_scripts/bayes_learn.lua
@@ -4,7 +4,7 @@
-- key2 - boolean is_spam
-- key3 - string symbol
-- key4 - boolean is_unlearn
--- key5 - set of tokens encoded in messagepack array of int64_t
+-- key5 - set of tokens encoded in messagepack array of strings
local prefix = KEYS[1]
local is_spam = KEYS[2] == 'true' and true or false
@@ -21,5 +21,5 @@ redis.call('HSET', prefix, 'version', '2') -- new schema
redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count
for _, token in ipairs(input_tokens) do
- redis.call('HINCRBY', prefix_underscore .. tostring(token), hash_key, 1)
+ redis.call('HINCRBY', prefix_underscore .. token, hash_key, 1)
end
\ No newline at end of file
diff --git a/src/libcryptobox/AsmOpt.cmake b/lualib/redis_scripts/bayes_stat.lua
similarity index 100%
copy from src/libcryptobox/AsmOpt.cmake
copy to lualib/redis_scripts/bayes_stat.lua
diff --git a/src/libstat/backends/redis_backend.cxx b/src/libstat/backends/redis_backend.cxx
index 342fa0273..0eddf26cb 100644
--- a/src/libstat/backends/redis_backend.cxx
+++ b/src/libstat/backends/redis_backend.cxx
@@ -657,13 +657,13 @@ void rspamd_redis_close(gpointer p)
static char *
rspamd_redis_serialize_tokens(struct rspamd_task *task, GPtrArray *tokens, gsize *ser_len)
{
- /* Each token is int64_t that requires 9 bytes + 4 bytes array len + 1 byte array magic */
- gsize req_len = tokens->len * 9 + 5, i;
- gchar *buf, *p;
+ /* Each token is int64_t that requires 10 bytes (2 int32_t) + 4 bytes array len + 1 byte array magic */
+ char max_int64_str[] = "18446744073709551615";
+ auto req_len = tokens->len * sizeof(max_int64_str) + 5;
rspamd_token_t *tok;
- buf = (gchar *) rspamd_mempool_alloc(task->task_pool, req_len);
- p = buf;
+ auto *buf = (gchar *) rspamd_mempool_alloc(task->task_pool, req_len);
+ auto *p = buf;
/* Array */
*p++ = (gchar) 0xdd;
@@ -673,13 +673,15 @@ rspamd_redis_serialize_tokens(struct rspamd_task *task, GPtrArray *tokens, gsize
*p++ = (gchar) ((tokens->len >> 8) & 0xff);
*p++ = (gchar) (tokens->len & 0xff);
+ int i;
PTR_ARRAY_FOREACH(tokens, i, tok)
{
- *p++ = (gchar) 0xd3;
+ char numbuf[sizeof(max_int64_str)];
+ auto r = rspamd_snprintf(numbuf, sizeof(numbuf), "%uL", tok->data);
+ *p++ = (gchar) ((r & 0xff) | 0xa0);
- guint64 val = GUINT64_TO_BE(tok->data);
- memcpy(p, &val, sizeof(val));
- p += sizeof(val);
+ memcpy(p, &numbuf, r);
+ p += r;
}
*ser_len = p - buf;
More information about the Commits
mailing list