commit 3c926ff: [Minor] Some rework for old html tags flags

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jul 12 15:56:04 UTC 2021


Author: Vsevolod Stakhov
Date: 2021-07-12 16:20:27 +0100
URL: https://github.com/rspamd/rspamd/commit/3c926ff8cc3b0cad0131e993c809c49640b7cf8c

[Minor] Some rework for old html tags flags

---
 src/libserver/html/html_tag.hxx      | 18 +++++-----
 src/libserver/html/html_tag_defs.hxx | 64 +++++++++++++-----------------------
 src/libserver/html/html_tags.h       | 44 +++++--------------------
 3 files changed, 40 insertions(+), 86 deletions(-)

diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index fab1b3867..d7e80f41b 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -46,15 +46,15 @@ enum class html_component_type : std::uint8_t {
 
 /* Public tags flags */
 /* XML tag */
-#define FL_XML          (1 << 22)
+#define FL_XML          (1u << CM_USER_SHIFT)
 /* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED       (1 << 23)
-#define FL_BROKEN       (1 << 24)
-#define FL_IGNORE       (1 << 25)
-#define FL_BLOCK        (1 << 26)
-#define FL_HREF         (1 << 27)
-#define FL_COMMENT      (1 << 28)
-#define FL_VIRTUAL      (1 << 29)
+#define FL_CLOSED       (1 << (CM_USER_SHIFT + 1))
+#define FL_BROKEN       (1 << (CM_USER_SHIFT + 2))
+#define FL_IGNORE       (1 << (CM_USER_SHIFT + 3))
+#define FL_BLOCK        (1 << (CM_USER_SHIFT + 4))
+#define FL_HREF         (1 << (CM_USER_SHIFT + 5))
+#define FL_COMMENT      (1 << (CM_USER_SHIFT + 6))
+#define FL_VIRTUAL      (1 << (CM_USER_SHIFT + 7))
 
 /**
  * Returns component type from a string
@@ -128,6 +128,8 @@ struct html_tag {
 	}
 };
 
+static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY);
+
 }
 
 #endif //RSPAMD_HTML_TAG_HXX
diff --git a/src/libserver/html/html_tag_defs.hxx b/src/libserver/html/html_tag_defs.hxx
index 5854d447b..7e6cc9bf6 100644
--- a/src/libserver/html/html_tag_defs.hxx
+++ b/src/libserver/html/html_tag_defs.hxx
@@ -39,7 +39,7 @@ static const auto html_tag_defs_array = rspamd::array_of<html_tag_def>(
 		TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
 		TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
 		TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
-		TAG_DEF(Tag_APPLET, "applet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
+		TAG_DEF(Tag_APPLET, "applet", (CM_IMG | CM_INLINE | CM_PARAM)),
 		TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
 		TAG_DEF(Tag_B, "b", (CM_INLINE | FL_BLOCK)),
 		TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
@@ -57,9 +57,9 @@ static const auto html_tag_defs_array = rspamd::array_of<html_tag_def>(
 		TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
 		TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
 		TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
-		TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK | CM_MIXED)),
+		TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK)),
 		TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
-		TAG_DEF(Tag_DIR, "dir", (CM_BLOCK | CM_OBSOLETE)),
+		TAG_DEF(Tag_DIR, "dir", (CM_BLOCK)),
 		TAG_DEF(Tag_DIV, "div", (CM_BLOCK | FL_BLOCK)),
 		TAG_DEF(Tag_DL, "dl", (CM_BLOCK | FL_BLOCK)),
 		TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
@@ -67,14 +67,14 @@ static const auto html_tag_defs_array = rspamd::array_of<html_tag_def>(
 		TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
 		TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
 		TAG_DEF(Tag_FORM, "form", (CM_BLOCK | FL_HREF)),
-		TAG_DEF(Tag_FRAME, "frame", (CM_FRAMES | CM_EMPTY | FL_HREF)),
-		TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML | CM_FRAMES)),
-		TAG_DEF(Tag_H1, "h1", (CM_BLOCK | CM_HEADING)),
-		TAG_DEF(Tag_H2, "h2", (CM_BLOCK | CM_HEADING)),
-		TAG_DEF(Tag_H3, "h3", (CM_BLOCK | CM_HEADING)),
-		TAG_DEF(Tag_H4, "h4", (CM_BLOCK | CM_HEADING)),
-		TAG_DEF(Tag_H5, "h5", (CM_BLOCK | CM_HEADING)),
-		TAG_DEF(Tag_H6, "h6", (CM_BLOCK | CM_HEADING)),
+		TAG_DEF(Tag_FRAME, "frame", (CM_EMPTY | FL_HREF)),
+		TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML)),
+		TAG_DEF(Tag_H1, "h1", (CM_BLOCK)),
+		TAG_DEF(Tag_H2, "h2", (CM_BLOCK)),
+		TAG_DEF(Tag_H3, "h3", (CM_BLOCK)),
+		TAG_DEF(Tag_H4, "h4", (CM_BLOCK)),
+		TAG_DEF(Tag_H5, "h5", (CM_BLOCK)),
+		TAG_DEF(Tag_H6, "h6", (CM_BLOCK)),
 		TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
 		TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
 		TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
@@ -82,26 +82,26 @@ static const auto html_tag_defs_array = rspamd::array_of<html_tag_def>(
 		TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
 		TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
 		TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
-		TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK | CM_MIXED)),
+		TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK)),
 		TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
 		TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
 		TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
 		TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
 		TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
 		TAG_DEF(Tag_LINK, "link", (CM_EMPTY | FL_HREF)),
-		TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK | CM_OBSOLETE)),
+		TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK)),
 		TAG_DEF(Tag_MAP, "map", (CM_INLINE | FL_HREF)),
-		TAG_DEF(Tag_MENU, "menu", (CM_BLOCK | CM_OBSOLETE)),
+		TAG_DEF(Tag_MENU, "menu", (CM_BLOCK)),
 		TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
-		TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)),
-		TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)),
-		TAG_DEF(Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
+		TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK)),
+		TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_RAW)),
+		TAG_DEF(Tag_OBJECT, "object", (CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
 		TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
 		TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
 		TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
 		TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
 		TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
-		TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK | CM_OBSOLETE)),
+		TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK)),
 		TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
 		TAG_DEF(Tag_Q, "q", (CM_INLINE)),
 		TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
@@ -112,13 +112,13 @@ static const auto html_tag_defs_array = rspamd::array_of<html_tag_def>(
 		TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
 		TAG_DEF(Tag_S, "s", (CM_INLINE)),
 		TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
-		TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_MIXED)),
+		TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_RAW)),
 		TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
 		TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
 		TAG_DEF(Tag_SPAN, "span", (CM_NO_INDENT | FL_BLOCK)),
 		TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
 		TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
-		TAG_DEF(Tag_STYLE, "style", (CM_HEAD)),
+		TAG_DEF(Tag_STYLE, "style", (CM_HEAD | CM_RAW)),
 		TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
 		TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
 		TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
@@ -134,28 +134,8 @@ static const auto html_tag_defs_array = rspamd::array_of<html_tag_def>(
 		TAG_DEF(Tag_U, "u", (CM_INLINE)),
 		TAG_DEF(Tag_UL, "ul", (CM_BLOCK | FL_BLOCK)),
 		TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
-		TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK | CM_OBSOLETE)),
-		TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)),
-
-		/* proprietary elements */
-		TAG_DEF(Tag_ALIGN, "align", (CM_BLOCK)),
-		TAG_DEF(Tag_BGSOUND, "bgsound", (CM_HEAD | CM_EMPTY)),
-		TAG_DEF(Tag_BLINK, "blink", (CM_INLINE)),
-		TAG_DEF(Tag_COMMENT, "comment", (CM_INLINE)),
-		TAG_DEF(Tag_EMBED, "embed", (CM_INLINE | CM_IMG | CM_EMPTY)),
-		TAG_DEF(Tag_ILAYER, "ilayer", (CM_INLINE)),
-		TAG_DEF(Tag_KEYGEN, "keygen", (CM_INLINE | CM_EMPTY)),
-		TAG_DEF(Tag_LAYER, "layer", (CM_BLOCK)),
-		TAG_DEF(Tag_MARQUEE, "marquee", (CM_INLINE | CM_OPT)),
-		TAG_DEF(Tag_MULTICOL, "multicol", (CM_BLOCK)),
-		TAG_DEF(Tag_NOBR, "nobr", (CM_INLINE)),
-		TAG_DEF(Tag_NOEMBED, "noembed", (CM_INLINE)),
-		TAG_DEF(Tag_NOLAYER, "nolayer", (CM_BLOCK | CM_INLINE | CM_MIXED)),
-		TAG_DEF(Tag_NOSAVE, "nosave", (CM_BLOCK)),
-		TAG_DEF(Tag_SERVER, "server", (CM_HEAD | CM_MIXED | CM_BLOCK | CM_INLINE)),
-		TAG_DEF(Tag_SERVLET, "servlet", (CM_OBJECT | CM_IMG | CM_INLINE | CM_PARAM)),
-		TAG_DEF(Tag_SPACER, "spacer", (CM_INLINE | CM_EMPTY)),
-		TAG_DEF(Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY))
+		TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK)),
+		TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY))
 );
 
 class html_tags_storage {
diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h
index e94dd6a9a..0048a28fc 100644
--- a/src/libserver/html/html_tags.h
+++ b/src/libserver/html/html_tags.h
@@ -27,16 +27,13 @@ typedef enum {
 	Tag_ABBR,   /**< ABBR */
 	Tag_ACRONYM, /**< ACRONYM */
 	Tag_ADDRESS, /**< ADDRESS */
-	Tag_ALIGN,  /**< ALIGN */
 	Tag_APPLET, /**< APPLET */
 	Tag_AREA,   /**< AREA */
 	Tag_B,      /**< B */
 	Tag_BASE,   /**< BASE */
 	Tag_BASEFONT, /**< BASEFONT */
 	Tag_BDO,    /**< BDO */
-	Tag_BGSOUND, /**< BGSOUND */
 	Tag_BIG,    /**< BIG */
-	Tag_BLINK,  /**< BLINK */
 	Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
 	Tag_BODY,   /**< BODY */
 	Tag_BR,     /**< BR */
@@ -47,7 +44,6 @@ typedef enum {
 	Tag_CODE,   /**< CODE */
 	Tag_COL,    /**< COL */
 	Tag_COLGROUP, /**< COLGROUP */
-	Tag_COMMENT, /**< COMMENT */
 	Tag_DD,     /**< DD */
 	Tag_DEL,    /**< DEL */
 	Tag_DFN,    /**< DFN */
@@ -56,7 +52,6 @@ typedef enum {
 	Tag_DL,     /**< DL */
 	Tag_DT,     /**< DT */
 	Tag_EM,     /**< EM */
-	Tag_EMBED,  /**< EMBED */
 	Tag_FIELDSET, /**< FIELDSET */
 	Tag_FONT,   /**< FONT */
 	Tag_FORM,   /**< FORM */
@@ -73,7 +68,6 @@ typedef enum {
 	Tag_HTML,   /**< HTML */
 	Tag_I,      /**< I */
 	Tag_IFRAME, /**< IFRAME */
-	Tag_ILAYER, /**< ILAYER */
 	Tag_IMG,    /**< IMG */
 	Tag_INPUT,  /**< INPUT */
 	Tag_INS,    /**< INS */
@@ -81,21 +75,14 @@ typedef enum {
 	Tag_KBD,    /**< KBD */
 	Tag_KEYGEN, /**< KEYGEN */
 	Tag_LABEL,  /**< LABEL */
-	Tag_LAYER,  /**< LAYER */
 	Tag_LEGEND, /**< LEGEND */
 	Tag_LI,     /**< LI */
 	Tag_LINK,   /**< LINK */
 	Tag_LISTING, /**< LISTING */
 	Tag_MAP,    /**< MAP */
-	Tag_MARQUEE, /**< MARQUEE */
 	Tag_MENU,   /**< MENU */
 	Tag_META,   /**< META */
-	Tag_MULTICOL, /**< MULTICOL */
-	Tag_NOBR,   /**< NOBR */
-	Tag_NOEMBED, /**< NOEMBED */
 	Tag_NOFRAMES, /**< NOFRAMES */
-	Tag_NOLAYER, /**< NOLAYER */
-	Tag_NOSAVE, /**< NOSAVE */
 	Tag_NOSCRIPT, /**< NOSCRIPT */
 	Tag_OBJECT, /**< OBJECT */
 	Tag_OL,     /**< OL */
@@ -116,10 +103,7 @@ typedef enum {
 	Tag_SAMP,   /**< SAMP */
 	Tag_SCRIPT, /**< SCRIPT */
 	Tag_SELECT, /**< SELECT */
-	Tag_SERVER, /**< SERVER */
-	Tag_SERVLET, /**< SERVLET */
 	Tag_SMALL,  /**< SMALL */
-	Tag_SPACER, /**< SPACER */
 	Tag_SPAN,   /**< SPAN */
 	Tag_STRIKE, /**< STRIKE */
 	Tag_STRONG, /**< STRONG */
@@ -139,9 +123,7 @@ typedef enum {
 	Tag_U,      /**< U */
 	Tag_UL,     /**< UL */
 	Tag_VAR,    /**< VAR */
-	Tag_WBR,    /**< WBR */
 	Tag_XMP,    /**< XMP */
-	Tag_XML,    /**< XML */
 	Tag_NEXTID, /**< NEXTID */
 	Tag_MAX,
 
@@ -172,30 +154,20 @@ typedef enum {
 /* Elements whose content must be protected against white space movement.
    Includes some elements that can found in forms. */
 #define CM_FIELD        (1 << 10)
-/* Used to avoid propagating inline emphasis inside some elements
-   such as OBJECT or APPLET. */
-#define CM_OBJECT       (1 << 11)
+#define CM_RAW          (1 << 11)
 /* Elements that allows "PARAM". */
 #define CM_PARAM        (1 << 12)
-/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
-#define CM_FRAMES       (1 << 13)
-/* Heading elements (h1, h2, ...). */
-#define CM_HEADING      (1 << 14)
 /* Elements with an optional end tag. */
-#define CM_OPT          (1 << 15)
+#define CM_OPT          (1 << 13)
 /* Elements that use "align" attribute for vertical position. */
-#define CM_IMG          (1 << 16)
-/* Elements with inline and block model. Used to avoid calling InlineDup. */
-#define CM_MIXED        (1 << 17)
-/* Elements whose content needs to be indented only if containing one
-   CM_BLOCK element. */
-#define CM_NO_INDENT    (1 << 18)
-/* Elements that are obsolete (such as "dir", "menu"). */
-#define CM_OBSOLETE     (1 << 19)
+#define CM_IMG          (1 << 14)
+#define CM_NO_INDENT    (1 << 15)
 /* Elements that cannot be omitted. */
-#define CM_OMITST       (1 << 20)
+#define CM_OMITST       (1 << 16)
 /* Unique elements */
-#define CM_UNIQUE       (1 << 21)
+#define CM_UNIQUE       (1 << 17)
+
+#define CM_USER_SHIFT   (18)
 
 #ifdef  __cplusplus
 }


More information about the Commits mailing list