commit 8f18989: [Minor] Lua_text: Add oneline method

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jan 22 12:21:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-22 11:01:18 +0000
URL: https://github.com/rspamd/rspamd/commit/8f1898962c717ba0be3016eab22883d8c352c57f

[Minor] Lua_text: Add oneline method

---
 src/lua/lua_text.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

diff --git a/src/lua/lua_text.c b/src/lua/lua_text.c
index ed1771feb..de8fa9b93 100644
--- a/src/lua/lua_text.c
+++ b/src/lua/lua_text.c
@@ -16,6 +16,7 @@
 
 #include "lua_common.h"
 #include "libcryptobox/cryptobox.h"
+#include "contrib/fastutf8/fastutf8.h"
 #include "unix-std.h"
 
 /***
@@ -120,6 +121,20 @@ LUA_FUNCTION_DEF (text, take_ownership);
  * @return {tspamd_text} modified or copied text
  */
 LUA_FUNCTION_DEF (text, exclude_chars);
+/***
+ * @method rspamd_text:oneline([always_copy])
+ * Returns a text (if owned, then the original text is modified, if not, then it is copied and owned)
+ * where the following transformations are made:
+ * - All spaces sequences are replaced with a single space
+ * - All newlines sequences are replaced with a single space
+ * - Trailing and leading spaces are removed
+ * - Control characters are excluded
+ * - UTF8 sequences are normalised
+ *
+ * @param {boolean} always_copy always copy the source text
+ * @return {tspamd_text} modified or copied text
+ */
+LUA_FUNCTION_DEF (text, oneline);
 LUA_FUNCTION_DEF (text, gc);
 LUA_FUNCTION_DEF (text, eq);
 
@@ -141,6 +156,7 @@ static const struct luaL_reg textlib_m[] = {
 		LUA_INTERFACE_DEF (text, at),
 		LUA_INTERFACE_DEF (text, bytes),
 		LUA_INTERFACE_DEF (text, exclude_chars),
+		LUA_INTERFACE_DEF (text, oneline),
 		{"write", lua_text_save_in_file},
 		{"__len", lua_text_len},
 		{"__tostring", lua_text_str},
@@ -1021,6 +1037,146 @@ lua_text_exclude_chars (lua_State *L)
 	return 1;
 }
 
+static gint
+lua_text_oneline (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t = lua_check_text (L, 1);
+	const gchar *p, *end;
+	gchar *dest, *d;
+	gsize byteset[32 / sizeof(gsize)]; /* Bitset for ascii */
+	gboolean copy = TRUE, seen_8bit = FALSE;
+	guint *plen;
+
+	if (t != NULL) {
+		if (lua_isboolean (L, 2)) {
+			copy = lua_toboolean (L, 2);
+		}
+		else if (t->flags & RSPAMD_TEXT_FLAG_OWN) {
+			copy = FALSE;
+		}
+
+		if (!copy) {
+			dest = (gchar *)t->start;
+			plen = &t->len;
+			lua_pushvalue (L, 1); /* Push text as a result */
+		}
+		else {
+			/* We need to copy read only text */
+			struct rspamd_lua_text *nt;
+
+			dest = g_malloc (t->len);
+			nt = lua_newuserdata (L, sizeof (*nt));
+			rspamd_lua_setclass (L, "rspamd{text}", -1);
+			nt->len = t->len;
+			nt->flags = RSPAMD_TEXT_FLAG_OWN;
+			memcpy (dest, t->start, t->len);
+			nt->start = dest;
+			plen = &nt->len;
+		}
+
+		/* Fill pattern bitset */
+		memset (byteset, 0, sizeof byteset);
+		/* All spaces */
+		byteset[0] |= GSIZE_FROM_LE (0x100003600);
+		/* Control characters */
+		byteset[0] |= GSIZE_FROM_LE (0xffffffff);
+		/* Del character */
+		byteset[1] |= GSIZE_FROM_LE (0x8000000000000000);
+		/* 8 bit characters */
+		byteset[2] |= GSIZE_FROM_LE (0xffffffffffffffffLLU);
+		byteset[3] |= GSIZE_FROM_LE (0xffffffffffffffffLLU);
+
+		p = t->start;
+		end = t->start + t->len;
+		d = dest;
+
+		while (p < end) {
+			if (!BITOP (byteset, *(guchar *)p, &)) {
+				*d++ = *p;
+			}
+			else {
+				if ((*(guchar *)p) & 0x80) {
+					seen_8bit = TRUE;
+					*d++ = *p;
+				}
+				else {
+					if (*p == ' ') {
+						if (d != dest) {
+							*d++ = *p++;
+						}
+
+						while (p < end && g_ascii_isspace (*p)) {
+							p ++;
+						}
+
+						continue; /* To avoid p++ */
+					}
+					else if (*p == '\r' || *p == '\n') {
+						if (d != dest) {
+							*d++ = ' ';
+							p ++;
+						}
+
+						while (p < end && g_ascii_isspace (*p)) {
+							p ++;
+						}
+
+						continue; /* To avoid p++ */
+					}
+				}
+			}
+
+			p ++;
+		}
+
+		while (d > dest && g_ascii_isspace (*(d - 1))) {
+			d --;
+		}
+
+		if (seen_8bit) {
+			if (rspamd_fast_utf8_validate (dest, d - dest) != 0) {
+				/* Need to make it valid :( */
+				UChar32 uc;
+				goffset err_offset;
+				gsize remain = d - dest;
+				gchar *nd = dest;
+
+				while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (nd, remain)) > 0) {
+					gint i = 0;
+
+					err_offset --; /* As it returns it 1 indexed */
+					nd += err_offset;
+					remain -= err_offset;
+
+					/* Each invalid character of input requires 3 bytes of output (+2 bytes) */
+					while (i < remain) {
+						gint old_pos = i;
+						U8_NEXT (nd, i, remain, uc);
+
+						if (uc < 0) {
+							nd[old_pos] = '?';
+						}
+						else {
+							break;
+						}
+					}
+
+					nd += i;
+					remain -= i;
+				}
+			}
+		}
+
+		*(plen) = d - dest;
+	}
+	else {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return 1;
+}
+
 static gint
 lua_load_text (lua_State * L)
 {


More information about the Commits mailing list