commit 198e242: [Feature] Treat all tags with HREF as potential hyperlinks

Vsevolod Stakhov vsevolod at highsecure.ru
Wed Mar 20 14:49:04 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-03-20 14:44:06 +0000
URL: https://github.com/rspamd/rspamd/commit/198e242157ed81b871671f6a77e3d525a57350a5 (HEAD -> master)

[Feature] Treat all tags with HREF as potential hyperlinks

---
 src/libserver/html.c | 17 +++++++----------
 src/libserver/html.h |  1 +
 src/lua/lua_html.c   |  6 +++++-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/libserver/html.c b/src/libserver/html.c
index 1f11f0cb2..6df545f00 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -53,12 +53,12 @@ INIT_LOG_MODULE(html)
 
 static struct html_tag_def tag_defs[] = {
 	/* W3C defined elements */
-	TAG_DEF(Tag_A, "a", 0),
+	TAG_DEF(Tag_A, "a", FL_HREF),
 	TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
 	TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
 	TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
 	TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
-	TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY)),
+	TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
 	TAG_DEF(Tag_B, "b", (CM_INLINE|FL_BLOCK)),
 	TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
 	TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
@@ -85,7 +85,7 @@ static struct html_tag_def tag_defs[] = {
 	TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
 	TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
 	TAG_DEF(Tag_FORM, "form", (CM_BLOCK)),
-	TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY)),
+	TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
 	TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
 	TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
 	TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
@@ -97,7 +97,7 @@ static struct html_tag_def tag_defs[] = {
 	TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
 	TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
 	TAG_DEF(Tag_I, "i", (CM_INLINE)),
-	TAG_DEF(Tag_IFRAME, "iframe", (0)),
+	TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
 	TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
 	TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
 	TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
@@ -106,9 +106,9 @@ static struct html_tag_def tag_defs[] = {
 	TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
 	TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
 	TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
-	TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY)),
+	TAG_DEF(Tag_LINK, "link", (CM_HEAD | CM_EMPTY|FL_HREF)),
 	TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
-	TAG_DEF(Tag_MAP, "map", (CM_INLINE)),
+	TAG_DEF(Tag_MAP, "map", (CM_INLINE|FL_HREF)),
 	TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
 	TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
 	TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
@@ -2942,7 +2942,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 					save_space = FALSE;
 				}
 
-				if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
+				if (cur_tag->flags & FL_HREF) {
 					if (!(cur_tag->flags & (FL_CLOSING))) {
 						url = rspamd_html_process_url_tag (pool, cur_tag, hc);
 
@@ -3012,9 +3012,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 						}
 					}
 				}
-				else if (cur_tag->id == Tag_LINK) {
-					url = rspamd_html_process_url_tag (pool, cur_tag, hc);
-				}
 				else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
 					struct html_tag *prev_tag = NULL;
 
diff --git a/src/libserver/html.h b/src/libserver/html.h
index f816567bd..f3515d627 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -98,6 +98,7 @@ struct html_block {
 #define FL_BROKEN       (1 << 26)
 #define FL_IGNORE       (1 << 27)
 #define FL_BLOCK        (1 << 28)
+#define FL_HREF         (1 << 29)
 
 struct html_tag {
 	gint id;
diff --git a/src/lua/lua_html.c b/src/lua/lua_html.c
index 1b5828564..39a4a77a0 100644
--- a/src/lua/lua_html.c
+++ b/src/lua/lua_html.c
@@ -616,6 +616,10 @@ lua_html_tag_get_flags (lua_State *L)
 			lua_pushstring (L, "closing");
 			lua_rawseti (L, -2, i++);
 		}
+		if (tag->flags & FL_HREF) {
+			lua_pushstring (L, "href");
+			lua_rawseti (L, -2, i++);
+		}
 		if (tag->flags & FL_CLOSED) {
 			lua_pushstring (L, "closed");
 			lua_rawseti (L, -2, i++);
@@ -692,7 +696,7 @@ lua_html_tag_get_extra (lua_State *L)
 
 	if (tag) {
 		if (tag->extra) {
-			if (tag->id == Tag_A || tag->id == Tag_IFRAME || tag->id == Tag_LINK) {
+			if (tag->flags & FL_HREF) {
 				/* For A that's URL */
 				purl = lua_newuserdata (L, sizeof (gpointer));
 				*purl = tag->extra;


More information about the Commits mailing list