commit 15f7926: [Feature] Lua_text: Add regexp split iterator method

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Jan 1 19:49:07 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-01 19:46:01 +0000
URL: https://github.com/rspamd/rspamd/commit/15f792603b88a09a3d84dccc2db1c253cf90a5b8 (HEAD -> master)

[Feature] Lua_text: Add regexp split iterator method

---
 src/lua/lua_text.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)

diff --git a/src/lua/lua_text.c b/src/lua/lua_text.c
index 55dcb8a88..3f024d236 100644
--- a/src/lua/lua_text.c
+++ b/src/lua/lua_text.c
@@ -81,6 +81,14 @@ LUA_FUNCTION_DEF (text, span);
  * @return {iterator} iterator triplet
  */
 LUA_FUNCTION_DEF (text, lines);
+/***
+ * @method rspamd_text:split(regexp, [stringify])
+ * Returns an iter over all encounters of the specific regexp as rspamd_text objects or as strings if `stringify` is true
+ * @param {rspamd_regexp} regexp regexp (pcre syntax) used for splitting
+ * @param {boolean} stringify stringify lines
+ * @return {iterator} iterator triplet
+ */
+LUA_FUNCTION_DEF (text, split);
 /***
  * @method rspamd_text:at(pos)
  * Returns a byte at the position `pos`
@@ -112,6 +120,7 @@ static const struct luaL_reg textlib_m[] = {
 		LUA_INTERFACE_DEF (text, save_in_file),
 		LUA_INTERFACE_DEF (text, span),
 		LUA_INTERFACE_DEF (text, lines),
+		LUA_INTERFACE_DEF (text, split),
 		LUA_INTERFACE_DEF (text, at),
 		LUA_INTERFACE_DEF (text, bytes),
 		{"write", lua_text_save_in_file},
@@ -493,6 +502,163 @@ lua_text_lines (lua_State *L)
 	return 1;
 }
 
+static gint
+rspamd_lua_text_regexp_split (lua_State *L) {
+	struct rspamd_lua_text *t = lua_touserdata (L, lua_upvalueindex (1)),
+			*new_t;
+	struct rspamd_lua_regexp *re = *(struct rspamd_lua_regexp **)
+			lua_touserdata (L, lua_upvalueindex (2));
+	gboolean stringify = lua_toboolean (L, lua_upvalueindex (3));
+	gint64 pos = lua_tointeger (L, lua_upvalueindex (4));
+	gboolean matched;
+
+	if (pos < 0) {
+		return luaL_error (L, "invalid pos: %d", (gint) pos);
+	}
+
+	if (pos >= t->len) {
+		/* We are done */
+		return 0;
+	}
+
+	const gchar *start, *end, *old_start;
+
+	end = t->start + pos;
+
+	for (;;) {
+		old_start = end;
+
+		matched = rspamd_regexp_search (re->re, t->start, t->len, &start, &end, FALSE,
+				NULL);
+
+		if (matched) {
+			if (start - old_start > 0) {
+				if (stringify) {
+					lua_pushlstring (L, old_start, start - old_start);
+				}
+				else {
+					new_t = lua_newuserdata (L, sizeof (*t));
+					rspamd_lua_setclass (L, "rspamd{text}", -1);
+					new_t->start = old_start;
+					new_t->len = start - old_start;
+					new_t->flags = 0;
+				}
+
+				break;
+			}
+			else {
+				if (start == end) {
+					matched = FALSE;
+					break;
+				}
+				/*
+				 * All match separators (e.g. starting separator,
+				 * we need to skip it). Continue iterations.
+				 */
+			}
+		}
+		else {
+			/* No match, stop */
+			break;
+		}
+	}
+
+	if (!matched && (t->len > 0 && (end == NULL || end < t->start + t->len))) {
+		/* No more matches, but we might need to push the last element */
+		if (end == NULL) {
+			end = t->start;
+		}
+		/* No separators, need to push the whole remaining part */
+		if (stringify) {
+			lua_pushlstring (L, end, (t->start + t->len) - end);
+		}
+		else {
+			new_t = lua_newuserdata (L, sizeof (*t));
+			rspamd_lua_setclass (L, "rspamd{text}", -1);
+			new_t->start = end;
+			new_t->len = (t->start + t->len) - end;
+			new_t->flags = 0;
+		}
+
+		pos = t->len;
+	}
+	else {
+
+		pos = end - t->start;
+	}
+
+	/* Update pos */
+	lua_pushinteger (L, pos);
+	lua_replace (L, lua_upvalueindex (4));
+
+	return 1;
+}
+
+static gint
+lua_text_split (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t = lua_check_text (L, 1);
+	struct rspamd_lua_regexp *re;
+	gboolean stringify = FALSE, own_re = FALSE;
+
+	if (lua_type (L, 2) == LUA_TUSERDATA) {
+		re = lua_check_regexp (L, 2);
+	}
+	else {
+		rspamd_regexp_t *c_re;
+		GError *err = NULL;
+
+		c_re = rspamd_regexp_new (lua_tostring (L, 2), NULL, &err);
+		if (c_re == NULL) {
+
+			gint ret = luaL_error (L, "cannot parse regexp: %s, error: %s",
+					lua_tostring (L, 2),
+					err == NULL ? "undefined" : err->message);
+			if (err) {
+				g_error_free (err);
+			}
+
+			return ret;
+		}
+
+		re = g_malloc0 (sizeof (struct rspamd_lua_regexp));
+		re->re = c_re;
+		re->re_pattern = g_strdup (lua_tostring (L, 2));
+		re->module = rspamd_lua_get_module_name (L);
+		own_re = TRUE;
+	}
+
+	if (t && re) {
+		if (lua_isboolean (L, 3)) {
+			stringify = lua_toboolean (L, 3);
+		}
+
+		/* Upvalues */
+		lua_pushvalue (L, 1); /* text */
+
+		if (own_re) {
+			struct rspamd_lua_regexp **pre;
+			pre = lua_newuserdata (L, sizeof (struct rspamd_lua_regexp *));
+			rspamd_lua_setclass (L, "rspamd{regexp}", -1);
+			*pre = re;
+		}
+		else {
+			lua_pushvalue (L, 2); /* regexp */
+		}
+
+		lua_pushboolean (L, stringify);
+		lua_pushinteger (L, 0); /* Current pos */
+		lua_pushcclosure (L, rspamd_lua_text_regexp_split, 4);
+	}
+	else {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return 1;
+}
+
+
 static gint
 lua_text_at (lua_State *L)
 {


More information about the Commits mailing list