commit 6c18226: [Rework] Html: Rework lua bindings

Vsevolod Stakhov vsevolod at highsecure.ru
Thu May 27 14:07:13 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-05-27 15:05:01 +0100
URL: https://github.com/rspamd/rspamd/commit/6c1822676cb7b41bed1c76713bb53d187c28a27a (HEAD -> master)

[Rework] Html: Rework lua bindings

---
 doc/Makefile                         |   4 +-
 src/libserver/html/html.cxx          |  12 ++++
 src/libserver/html/html.h            |   8 +--
 src/lua/CMakeLists.txt               |   2 +-
 src/lua/{lua_html.c => lua_html.cxx} | 125 ++++++++++++++++-------------------
 5 files changed, 76 insertions(+), 75 deletions(-)

diff --git a/doc/Makefile b/doc/Makefile
index e93481bb7..548c6b292 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -73,8 +73,8 @@ rspamd_tcp: ../src/lua/lua_tcp.c
 	$(LUADOC) < ../src/lua/lua_tcp.c > markdown/lua/rspamd_tcp.md
 rspamd_mempool: ../src/lua/lua_mempool.c
 	$(LUADOC) < ../src/lua/lua_mempool.c > markdown/lua/rspamd_mempool.md
-rspamd_html: ../src/lua/lua_html.c
-	$(LUADOC) < ../src/lua/lua_html.c > markdown/lua/rspamd_html.md
+rspamd_html: ../src/lua/lua_html.cxx
+	$(LUADOC) < ../src/lua/lua_html.cxx > markdown/lua/rspamd_html.md
 rspamd_util: ../src/lua/lua_util.c
 	$(LUADOC) < ../src/lua/lua_util.c > markdown/lua/rspamd_util.md
 rspamd_sqlite3: ../src/lua/lua_sqlite3.c
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 2ebf46305..cfafbf736 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -2386,4 +2386,16 @@ rspamd_html_tag_by_id(gint id)
 	}
 
 	return nullptr;
+}
+
+const gchar *
+rspamd_html_tag_name(void *p, gsize *len)
+{
+	auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p);
+
+	if (len) {
+		*len = tag->name.size();
+	}
+
+	return tag->name.data();
 }
\ No newline at end of file
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 46ae99d48..6106688f3 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -152,13 +152,13 @@ const gchar *rspamd_html_tag_by_id(gint id);
 gint rspamd_html_tag_by_name(const gchar *name);
 
 /**
- * Extract URL from HTML tag component and sets component elements if needed
- * @param pool
- * @param start
+ * Returns the name of an HTML tag (its length is stored in `len` when `len` is non-NULL)
+ * @param tag opaque pointer to the tag object
  * @param len
- * @param comp
  * @return
  */
+const gchar *rspamd_html_tag_name(void *tag, gsize *len);
+
 
 #ifdef  __cplusplus
 }
diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt
index 4782d3f8e..a504f99f8 100644
--- a/src/lua/CMakeLists.txt
+++ b/src/lua/CMakeLists.txt
@@ -21,7 +21,7 @@ SET(LUASRC			  ${CMAKE_CURRENT_SOURCE_DIR}/lua_common.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_url.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_util.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_tcp.c
-					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_html.c
+					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_html.cxx
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_sqlite3.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_cryptobox.c
 					  ${CMAKE_CURRENT_SOURCE_DIR}/lua_map.c
diff --git a/src/lua/lua_html.c b/src/lua/lua_html.cxx
similarity index 84%
rename from src/lua/lua_html.c
rename to src/lua/lua_html.cxx
index 5b1c9c452..629b4f66f 100644
--- a/src/lua/lua_html.c
+++ b/src/lua/lua_html.cxx
@@ -16,8 +16,12 @@
 #include "lua_common.h"
 #include "message.h"
 #include "libserver/html/html.h"
+#include "libserver/html/html_tag.hxx"
 #include "images.h"
-#include "contrib/mumhash/mum.h"
+
+#include <contrib/robin-hood/robin_hood.h>
+#include <frozen/string.h>
+#include <frozen/unordered_map.h>
 
 /***
  * @module rspamd_html
@@ -186,7 +190,7 @@ lua_check_html (lua_State * L, gint pos)
 
 struct lua_html_tag {
 	struct html_content *html;
-	struct html_tag *tag;
+	rspamd::html::html_tag *tag;
 };
 
 static struct lua_html_tag *
@@ -216,6 +220,20 @@ lua_html_has_tag (lua_State *L)
 	return 1;
 }
 
+constexpr const auto prop_map = frozen::make_unordered_map<frozen::string, int>({
+		{"no_html", RSPAMD_HTML_FLAG_BAD_START},
+		{"bad_start", RSPAMD_HTML_FLAG_BAD_START},
+		{"bad_element", RSPAMD_HTML_FLAG_BAD_ELEMENTS},
+		{"bad_elements", RSPAMD_HTML_FLAG_BAD_ELEMENTS},
+		{"xml", RSPAMD_HTML_FLAG_XML},
+		{"unknown_element", RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS},
+		{"unknown_elements", RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS},
+		{"duplicate_element", RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS},
+		{"duplicate_elements", RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS},
+		{"unbalanced", RSPAMD_HTML_FLAG_UNBALANCED},
+		{"data_urls", RSPAMD_HTML_FLAG_HAS_DATA_URLS},
+});
+
 static gint
 lua_html_has_property (lua_State *L)
 {
@@ -225,35 +243,10 @@ lua_html_has_property (lua_State *L)
 	gboolean ret = FALSE;
 
 	if (hc && propname) {
-		/*
-		 * - `no_html`
-		 * - `bad_element`
-		 * - `xml`
-		 * - `unknown_element`
-		 * - `duplicate_element`
-		 * - `unbalanced`
-		 * - `data_urls`
-		 */
-		if (strcmp (propname, "no_html") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_BAD_START;
-		}
-		else if (strcmp (propname, "bad_element") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-		}
-		else if (strcmp (propname, "xml") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_XML;
-		}
-		else if (strcmp (propname, "unknown_element") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
-		}
-		else if (strcmp (propname, "duplicate_element") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
-		}
-		else if (strcmp (propname, "unbalanced") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_UNBALANCED;
-		}
-		else if (strcmp (propname, "data_urls") == 0) {
-			ret = hc->flags & RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+		auto found_prop = prop_map.find(frozen::string(propname));
+
+		if (found_prop != prop_map.end()) {
+			ret = hc->flags & found_prop->second;
 		}
 	}
 
@@ -277,7 +270,7 @@ lua_html_push_image (lua_State *L, struct html_image *img)
 		if (img->flags & RSPAMD_HTML_FLAG_IMAGE_DATA) {
 			struct rspamd_lua_text *t;
 
-			t = lua_newuserdata (L, sizeof (*t));
+			t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
 			t->start = img->src;
 			t->len = strlen (img->src);
 			t->flags = 0;
@@ -293,7 +286,7 @@ lua_html_push_image (lua_State *L, struct html_image *img)
 
 	if (img->url) {
 		lua_pushstring (L, "url");
-		purl = lua_newuserdata (L, sizeof (gpointer));
+		purl = static_cast<rspamd_url **>(lua_newuserdata(L, sizeof(gpointer)));
 		*purl = img->url;
 		rspamd_lua_setclass (L, "rspamd{url}", -1);
 		lua_settable (L, -3);
@@ -301,8 +294,8 @@ lua_html_push_image (lua_State *L, struct html_image *img)
 
 	if (img->tag) {
 		lua_pushstring (L, "tag");
-		ltag = lua_newuserdata (L, sizeof (struct lua_html_tag));
-		ltag->tag = img->tag;
+		ltag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(struct lua_html_tag)));
+		ltag->tag = static_cast<rspamd::html::html_tag *>(img->tag);
 		ltag->html = NULL;
 		rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
 		lua_settable (L, -3);
@@ -360,8 +353,10 @@ lua_html_push_block (lua_State *L, struct html_block *bl)
 	lua_createtable (L, 0, 6);
 
 	if (bl->tag) {
+		gsize name_len;
+		const gchar *name = rspamd_html_tag_name(bl->tag, &name_len);
 		lua_pushstring (L, "tag");
-		lua_pushlstring (L, bl->tag->name.start, bl->tag->name.len);
+		lua_pushlstring (L, name, name_len);
 		lua_settable (L, -3);
 	}
 
@@ -394,9 +389,9 @@ lua_html_push_block (lua_State *L, struct html_block *bl)
 
 	if (bl->style.len > 0) {
 		lua_pushstring (L, "style");
-		t = lua_newuserdata (L, sizeof (*t));
+		t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
 		rspamd_lua_setclass (L, "rspamd{text}", -1);
-		t->start = bl->style.start;
+		t->start = bl->style.begin;
 		t->len = bl->style.len;
 		t->flags = 0;
 		lua_settable (L, -3);
@@ -425,7 +420,7 @@ lua_html_get_blocks (lua_State *L)
 			lua_createtable (L, hc->blocks->len, 0);
 
 			for (i = 0; i < hc->blocks->len; i ++) {
-				bl = g_ptr_array_index (hc->blocks, i);
+				bl = static_cast<decltype(bl)>(g_ptr_array_index (hc->blocks, i));
 				lua_html_push_block (L, bl);
 				lua_rawseti (L, -2, i + 1);
 			}
@@ -445,23 +440,22 @@ struct lua_html_traverse_ud {
 	lua_State *L;
 	struct html_content *html;
 	gint cbref;
-	GHashTable *tags;
+	robin_hood::unordered_flat_set<int> tags;
 	gboolean any;
 };
 
 static gboolean
 lua_html_node_foreach_cb (GNode *n, gpointer d)
 {
-	struct lua_html_traverse_ud *ud = d;
-	struct html_tag *tag = n->data;
+	struct lua_html_traverse_ud *ud = (struct lua_html_traverse_ud *)d;
+	auto *tag = (rspamd::html::html_tag *)n->data;
 	struct lua_html_tag *ltag;
 
-	if (tag && (ud->any || g_hash_table_lookup (ud->tags,
-			GSIZE_TO_POINTER (mum_hash64 (tag->id, 0))))) {
+	if (tag && (ud->any || ud->tags.contains(tag->id))) {
 
 		lua_rawgeti (ud->L, LUA_REGISTRYINDEX, ud->cbref);
 
-		ltag = lua_newuserdata (ud->L, sizeof (*ltag));
+		ltag = static_cast<lua_html_tag *>(lua_newuserdata(ud->L, sizeof(*ltag)));
 		ltag->tag = tag;
 		ltag->html = ud->html;
 		rspamd_lua_setclass (ud->L, "rspamd{html_tag}", -1);
@@ -501,7 +495,6 @@ lua_html_foreach_tag (lua_State *L)
 	const gchar *tagname;
 	gint id;
 
-	ud.tags = g_hash_table_new (g_direct_hash, g_direct_equal);
 	ud.any = FALSE;
 	ud.html = hc;
 
@@ -511,14 +504,14 @@ lua_html_foreach_tag (lua_State *L)
 			ud.any = TRUE;
 		}
 		else {
-			id = rspamd_html_tag_by_name (tagname);
+			id = rspamd_html_tag_by_name(tagname);
 
 			if (id == -1) {
-				g_hash_table_unref (ud.tags);
 				return luaL_error (L, "invalid tagname: %s", tagname);
 			}
-			g_hash_table_insert (ud.tags, GSIZE_TO_POINTER (mum_hash64 (id, 0)),
-					"1");
+
+
+			ud.tags.insert(id);
 		}
 	}
 	else if (lua_type (L, 2) == LUA_TTABLE) {
@@ -533,18 +526,16 @@ lua_html_foreach_tag (lua_State *L)
 				id = rspamd_html_tag_by_name (tagname);
 
 				if (id == -1) {
-					g_hash_table_unref (ud.tags);
 					return luaL_error (L, "invalid tagname: %s", tagname);
 				}
-				g_hash_table_insert (ud.tags,
-						GSIZE_TO_POINTER (mum_hash64 (id, 0)), "1");
+				ud.tags.insert(id);
 			}
 		}
 
 		lua_pop (L, 1);
 	}
 
-	if (hc && (ud.any || g_hash_table_size (ud.tags) > 0) && lua_isfunction (L, 3)) {
+	if (hc && (ud.any || !ud.tags.empty()) && lua_isfunction (L, 3)) {
 		if (hc->html_tags) {
 
 			lua_pushvalue (L, 3);
@@ -558,12 +549,9 @@ lua_html_foreach_tag (lua_State *L)
 		}
 	}
 	else {
-		g_hash_table_unref (ud.tags);
 		return luaL_error (L, "invalid arguments");
 	}
 
-	g_hash_table_unref (ud.tags);
-
 	return 0;
 }
 
@@ -602,8 +590,8 @@ lua_html_tag_get_parent (lua_State *L)
 		node = ltag->tag->parent;
 
 		if (node && node->data) {
-			ptag = lua_newuserdata (L, sizeof (*ptag));
-			ptag->tag = node->data;
+			ptag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(*ptag)));
+			ptag->tag = static_cast<rspamd::html::html_tag *>(node->data);
 			ptag->html = ltag->html;
 			rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
 		}
@@ -670,9 +658,10 @@ lua_html_tag_get_content (lua_State *L)
 	if (ltag) {
 		if (ltag->html && ltag->tag->content_length &&
 				ltag->html->parsed->len >= ltag->tag->content_offset + ltag->tag->content_length) {
-			t = lua_newuserdata (L, sizeof (*t));
+			t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
 			rspamd_lua_setclass (L, "rspamd{text}", -1);
-			t->start = ltag->html->parsed->data + ltag->tag->content_offset;
+			t->start = reinterpret_cast<const char *>(ltag->html->parsed->data) +
+					ltag->tag->content_offset;
 			t->len = ltag->tag->content_length;
 			t->flags = 0;
 		}
@@ -712,19 +701,19 @@ lua_html_tag_get_extra (lua_State *L)
 	struct rspamd_url **purl;
 
 	if (ltag) {
-		if (ltag->tag->extra) {
-			if (ltag->tag->flags & FL_IMAGE) {
-				img = ltag->tag->extra;
+		if (!std::holds_alternative<std::monostate>(ltag->tag->extra)) {
+			if (std::holds_alternative<struct html_image *>(ltag->tag->extra)) {
+				img = std::get<struct html_image *>(ltag->tag->extra);
 				lua_html_push_image (L, img);
 			}
-			else if (ltag->tag->flags & FL_HREF) {
+			else if (std::holds_alternative<struct rspamd_url *>(ltag->tag->extra)) {
 				/* For A that's URL */
-				purl = lua_newuserdata (L, sizeof (gpointer));
-				*purl = ltag->tag->extra;
+				purl = static_cast<rspamd_url **>(lua_newuserdata(L, sizeof(gpointer)));
+				*purl = std::get<struct rspamd_url *>(ltag->tag->extra);
 				rspamd_lua_setclass (L, "rspamd{url}", -1);
 			}
 			else if (ltag->tag->flags & FL_BLOCK) {
-				lua_html_push_block (L, ltag->tag->extra);
+				lua_html_push_block (L, ltag->tag->block);
 			}
 			else {
 				/* Unknown extra ? */


More information about the Commits mailing list