commit 58fafcd: [Rework] Rework HTML tags content attachment
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Jan 6 17:14:07 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-01-06 17:08:02 +0000
URL: https://github.com/rspamd/rspamd/commit/58fafcd653930ec374aba9dc6a1876052d9a1881 (HEAD -> master)
[Rework] Rework HTML tags content attachment
---
src/libserver/html.c | 70 ++++++++++++++++++++++++--------------
src/libserver/html.h | 5 +--
src/lua/lua_html.c | 94 +++++++++++++++++++++++++++++-----------------------
3 files changed, 101 insertions(+), 68 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index d9cddb468..502fa42fa 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -815,8 +815,6 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
return TRUE;
}
}
-
- parent->content_length += tag->content_length;
}
if (hc->total_tags < max_tags) {
@@ -2774,13 +2772,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
p ++;
}
else {
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
- }
-
- content_tag->content_length += p - c;
- }
state = tag_begin;
}
break;
@@ -2798,24 +2789,35 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
if (need_decode) {
goffset old_offset = dest->len;
+ if (content_tag) {
+ if (content_tag->content_offset == 0) {
+ content_tag->content_offset = old_offset;
+ }
+ }
+
g_byte_array_append (dest, c, (p - c));
len = rspamd_html_decode_entitles_inplace (
dest->data + old_offset,
p - c);
dest->len = dest->len + len - (p - c);
+
+ if (content_tag) {
+ content_tag->content_length += len;
+ }
}
else {
len = p - c;
- g_byte_array_append (dest, c, len);
- }
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
+ if (content_tag) {
+ if (content_tag->content_offset == 0) {
+ content_tag->content_offset = dest->len;
+ }
+
+ content_tag->content_length += len;
}
- content_tag->content_length += p - c + 1;
+ g_byte_array_append (dest, c, len);
}
}
@@ -2828,6 +2830,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
if (dest->len > 0 &&
!g_ascii_isspace (dest->data[dest->len - 1])) {
g_byte_array_append (dest, " ", 1);
+ if (content_tag) {
+ content_tag->content_length ++;
+ }
}
save_space = FALSE;
}
@@ -2839,24 +2844,34 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
if (need_decode) {
goffset old_offset = dest->len;
+ if (content_tag) {
+ if (content_tag->content_offset == 0) {
+ content_tag->content_offset = dest->len;
+ }
+ }
+
g_byte_array_append (dest, c, (p - c));
len = rspamd_html_decode_entitles_inplace (
dest->data + old_offset,
p - c);
dest->len = dest->len + len - (p - c);
+
+ if (content_tag) {
+ content_tag->content_length += len;
+ }
}
else {
len = p - c;
- g_byte_array_append (dest, c, len);
- }
+ if (content_tag) {
+ if (content_tag->content_offset == 0) {
+ content_tag->content_offset = dest->len;
+ }
- if (content_tag) {
- if (content_tag->content == NULL) {
- content_tag->content = c;
+ content_tag->content_length += len;
}
- content_tag->content_length += p - c;
+ g_byte_array_append (dest, c, len);
}
}
@@ -2876,10 +2891,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
continue;
}
- if (content_tag) {
- content_tag->content_length ++;
- }
-
p ++;
break;
@@ -2949,6 +2960,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
g_byte_array_append (dest, "\r\n", 2);
+
+ if (content_tag) {
+ content_tag->content_length += 2;
+ }
}
save_space = FALSE;
}
@@ -2958,6 +2973,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
cur_tag->id == Tag_DIV)) {
if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
g_byte_array_append (dest, "\r\n", 2);
+
+ if (content_tag) {
+ content_tag->content_length += 2;
+ }
}
save_space = FALSE;
}
@@ -3106,6 +3125,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
}
g_queue_free (styles_blocks);
+ hc->parsed = dest;
return dest;
}
diff --git a/src/libserver/html.h b/src/libserver/html.h
index 86a266a62..b369bd890 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -107,9 +107,9 @@ struct html_block {
struct html_tag {
gint id;
gint flags;
- guint content_length;
struct html_tag_component name;
- const gchar *content;
+ guint content_length;
+ goffset content_offset;
GQueue *params;
gpointer extra; /** Additional data associated with tag (e.g. image) */
GNode *parent;
@@ -127,6 +127,7 @@ struct html_content {
guchar *tags_seen;
GPtrArray *images;
GPtrArray *blocks;
+ GByteArray *parsed;
};
/*
diff --git a/src/lua/lua_html.c b/src/lua/lua_html.c
index 0af0457da..43c34797c 100644
--- a/src/lua/lua_html.c
+++ b/src/lua/lua_html.c
@@ -186,12 +186,17 @@ lua_check_html (lua_State * L, gint pos)
return ud ? *((struct html_content **)ud) : NULL;
}
-static struct html_tag *
+struct lua_html_tag {
+ struct html_content *html;
+ struct html_tag *tag;
+};
+
+static struct lua_html_tag *
lua_check_html_tag (lua_State * L, gint pos)
{
void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html_tag}");
luaL_argcheck (L, ud != NULL, pos, "'html_tag' expected");
- return ud ? *((struct html_tag **)ud) : NULL;
+ return ud ? ((struct lua_html_tag *)ud) : NULL;
}
static gint
@@ -263,7 +268,7 @@ static void
lua_html_push_image (lua_State *L, struct html_image *img)
{
LUA_TRACE_POINT;
- struct html_tag **ptag;
+ struct lua_html_tag *ltag;
struct rspamd_url **purl;
lua_newtable (L);
@@ -298,8 +303,9 @@ lua_html_push_image (lua_State *L, struct html_image *img)
if (img->tag) {
lua_pushstring (L, "tag");
- ptag = lua_newuserdata (L, sizeof (gpointer));
- *ptag = img->tag;
+ ltag = lua_newuserdata (L, sizeof (struct lua_html_tag));
+ ltag->tag = img->tag;
+ ltag->html = NULL;
rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
lua_settable (L, -3);
}
@@ -440,6 +446,7 @@ lua_html_get_blocks (lua_State *L)
struct lua_html_traverse_ud {
lua_State *L;
+ struct html_content *html;
gint cbref;
GHashTable *tags;
gboolean any;
@@ -449,15 +456,17 @@ static gboolean
lua_html_node_foreach_cb (GNode *n, gpointer d)
{
struct lua_html_traverse_ud *ud = d;
- struct html_tag *tag = n->data, **ptag;
+ struct html_tag *tag = n->data;
+ struct lua_html_tag *ltag;
if (tag && (ud->any || g_hash_table_lookup (ud->tags,
GSIZE_TO_POINTER (mum_hash64 (tag->id, 0))))) {
lua_rawgeti (ud->L, LUA_REGISTRYINDEX, ud->cbref);
- ptag = lua_newuserdata (ud->L, sizeof (*ptag));
- *ptag = tag;
+ ltag = lua_newuserdata (ud->L, sizeof (*ltag));
+ ltag->tag = tag;
+ ltag->html = ud->html;
rspamd_lua_setclass (ud->L, "rspamd{html_tag}", -1);
lua_pushinteger (ud->L, tag->content_length);
@@ -489,6 +498,7 @@ lua_html_foreach_tag (lua_State *L)
ud.tags = g_hash_table_new (g_direct_hash, g_direct_equal);
ud.any = FALSE;
+ ud.html = hc;
if (lua_type (L, 2) == LUA_TSTRING) {
tagname = luaL_checkstring (L, 2);
@@ -556,11 +566,11 @@ static gint
lua_html_tag_get_type (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_tag *tag = lua_check_html_tag (L, 1);
+ struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
const gchar *tagname;
- if (tag != NULL) {
- tagname = rspamd_html_tag_by_id (tag->id);
+ if (ltag != NULL) {
+ tagname = rspamd_html_tag_by_id (ltag->tag->id);
if (tagname) {
lua_pushstring (L, tagname);
@@ -580,15 +590,16 @@ static gint
lua_html_tag_get_parent (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_tag *tag = lua_check_html_tag (L, 1), **ptag;
+ struct lua_html_tag *ltag = lua_check_html_tag (L, 1), *ptag;
GNode *node;
- if (tag != NULL) {
- node = tag->parent;
+ if (ltag != NULL) {
+ node = ltag->tag->parent;
if (node && node->data) {
- ptag = lua_newuserdata (L, sizeof (gpointer));
- *ptag = node->data;
+ ptag = lua_newuserdata (L, sizeof (*ptag));
+ ptag->tag = node->data;
+ ptag->html = ltag->html;
rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
}
else {
@@ -606,33 +617,33 @@ static gint
lua_html_tag_get_flags (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_tag *tag = lua_check_html_tag (L, 1);
+ struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
gint i = 1;
- if (tag) {
+ if (ltag->tag) {
/* Push flags */
lua_createtable (L, 4, 0);
- if (tag->flags & FL_CLOSING) {
+ if (ltag->tag->flags & FL_CLOSING) {
lua_pushstring (L, "closing");
lua_rawseti (L, -2, i++);
}
- if (tag->flags & FL_HREF) {
+ if (ltag->tag->flags & FL_HREF) {
lua_pushstring (L, "href");
lua_rawseti (L, -2, i++);
}
- if (tag->flags & FL_CLOSED) {
+ if (ltag->tag->flags & FL_CLOSED) {
lua_pushstring (L, "closed");
lua_rawseti (L, -2, i++);
}
- if (tag->flags & FL_BROKEN) {
+ if (ltag->tag->flags & FL_BROKEN) {
lua_pushstring (L, "broken");
lua_rawseti (L, -2, i++);
}
- if (tag->flags & FL_XML) {
+ if (ltag->tag->flags & FL_XML) {
lua_pushstring (L, "xml");
lua_rawseti (L, -2, i++);
}
- if (tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) {
+ if (ltag->tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) {
lua_pushstring (L, "unbalanced");
lua_rawseti (L, -2, i++);
}
@@ -648,15 +659,16 @@ static gint
lua_html_tag_get_content (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_tag *tag = lua_check_html_tag (L, 1);
+ struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
struct rspamd_lua_text *t;
- if (tag) {
- if (tag->content && tag->content_length) {
+ if (ltag) {
+ if (ltag->html && ltag->tag->content_offset && ltag->tag->content_length &&
+ ltag->html->parsed->len >= ltag->tag->content_offset + ltag->tag->content_length) {
t = lua_newuserdata (L, sizeof (*t));
rspamd_lua_setclass (L, "rspamd{text}", -1);
- t->start = tag->content;
- t->len = tag->content_length;
+ t->start = ltag->html->parsed->data + ltag->tag->content_offset;
+ t->len = ltag->tag->content_length;
t->flags = 0;
}
else {
@@ -674,10 +686,10 @@ static gint
lua_html_tag_get_content_length (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_tag *tag = lua_check_html_tag (L, 1);
+ struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
- if (tag) {
- lua_pushinteger (L, tag->content_length);
+ if (ltag) {
+ lua_pushinteger (L, ltag->tag->content_length);
}
else {
return luaL_error (L, "invalid arguments");
@@ -690,24 +702,24 @@ static gint
lua_html_tag_get_extra (lua_State *L)
{
LUA_TRACE_POINT;
- struct html_tag *tag = lua_check_html_tag (L, 1);
+ struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
struct html_image *img;
struct rspamd_url **purl;
- if (tag) {
- if (tag->extra) {
- if ((tag->flags & FL_HREF) || tag->id == Tag_BASE) {
+ if (ltag) {
+ if (ltag->tag->extra) {
+ if ((ltag->tag->flags & FL_HREF) || ltag->tag->id == Tag_BASE) {
/* For A that's URL */
purl = lua_newuserdata (L, sizeof (gpointer));
- *purl = tag->extra;
+ *purl = ltag->tag->extra;
rspamd_lua_setclass (L, "rspamd{url}", -1);
}
- else if (tag->id == Tag_IMG) {
- img = tag->extra;
+ else if (ltag->tag->id == Tag_IMG) {
+ img = ltag->tag->extra;
lua_html_push_image (L, img);
}
- else if (tag->flags & FL_BLOCK) {
- lua_html_push_block (L, tag->extra);
+ else if (ltag->tag->flags & FL_BLOCK) {
+ lua_html_push_block (L, ltag->tag->extra);
}
else {
/* Unknown extra ? */
More information about the Commits mailing list