commit 4afbf86: [Project] Html: Add more test cases and fix some more corner issues
Vsevolod Stakhov
vsevolod at highsecure.ru
Sat Jun 19 10:42:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-06-19 11:18:33 +0100
URL: https://github.com/rspamd/rspamd/commit/4afbf86ea8b4c184873c5e9df64f4589cff2a913
[Project] Html: Add more test cases and fix some more corner issues
---
src/libserver/html/html_entities.cxx | 81 +++++++++++++++++++-----------------
1 file changed, 43 insertions(+), 38 deletions(-)
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index 056dba60e..144de5d99 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2241,34 +2241,27 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
}
else {
/* Try heuristic */
- /* Try 4 letters replacements */
- if (h - e > 4) {
- entity_def = html_entities_defs.by_name({entity, 4});
+ auto heuristic_lookup_func = [&](std::size_t lookup_len) -> bool {
+ if (!entity_def && h - e > lookup_len) {
+ entity_def = html_entities_defs.by_name({entity, lookup_len});
- if (entity_def) {
- replace_entity();
- /* Rewind h by 5 for & character and entity */
- h = e + 4;
- }
- }
- /* Try 3 letters replacements */
- if (!entity_def && h - e > 3) {
- entity_def = html_entities_defs.by_name({entity, 3});
+ if (entity_def) {
+ replace_entity();
+ /* Adjust h back */
+ h = e + lookup_len;
- if (entity_def) {
- replace_entity();
- h = e + 3;
+ return true;
+ }
}
- }
- /* Try 2 letters replacements */
- if (!entity_def && h - e > 2) {
- entity_def = html_entities_defs.by_name({entity, 2});
- if (entity_def) {
- replace_entity();
- h = e + 2;
- }
- }
+ return false;
+ };
+
+ heuristic_lookup_func(5);
+ heuristic_lookup_func(4);
+ heuristic_lookup_func(3);
+ heuristic_lookup_func(2);
+
/* Leave undecoded */
if (!entity_def && (end - t > h - e + 1)) {
memmove(t, e, h - e + 1);
@@ -2386,20 +2379,19 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
goffset off = t - s;
UBool is_error = 0;
- if (uc > 0 && u_isprint(uc)) {
+ if (uc > 0) {
U8_APPEND (s, off, len, uc, is_error);
if (!is_error) {
t = s + off;
}
- else {
- /* Leave invalid entities as is */
- if (end - t > h - e + 1) {
- memmove(t, e, h - e + 1);
- t += h - e + 1;
- }
+ else if (end - t > 3) {
+ /* Not printable code point replace with 0xFFFD */
+ *t++ = '\357';
+ *t++ = '\277';
+ *t++ = '\275';
- return false;
+ return true;
}
}
else if (end - t > 3) {
@@ -2417,13 +2409,15 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
};
auto replace_entity = [&]() -> bool {
- const auto *entity_start = e + 1;
+ if (e + 1 < end) {
+ const auto *entity_start = e + 1;
- if (*entity_start != '#') {
- return replace_named_entity(entity_start, (h - entity_start));
- }
- else if (entity_start + 1 < h) {
- return replace_numeric_entity(entity_start + 1);
+ if (*entity_start != '#') {
+ return replace_named_entity(entity_start, (h - entity_start));
+ }
+ else if (entity_start + 1 < h) {
+ return replace_numeric_entity(entity_start + 1);
+ }
}
return false;
@@ -2514,6 +2508,9 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
/* To follow FSM semantics */
h ++;
}
+ else {
+ h = e; /* Include the last & */
+ }
/* Leftover after replacement */
if (h < end && t + (end - h) <= end) {
@@ -2552,6 +2549,14 @@ TEST_SUITE("html") {
{"FOO&#xBAR", "FOOºR"},
{"FOO&#x41BAR", "FOO䆺R"},
{"FOO�ZOO", "FOO�ZOO"},
+ {"FOO&#x0081;ZOO", "FOO\u0081ZOO"},
+ {"FOO�ZOO", "FOO�ZOO"},
+ {"FOO�ZOO", "FOO�ZOO"},
+ {"ZZ&pound_id=23", "ZZ£_id=23"},
+ {"ZZ&prod_id=23", "ZZ&prod_id=23"},
+ {"ZZ&gt", "ZZ>"},
+ {"ZZ&", "ZZ&"},
+ {"ZZ&AElig=", "ZZÆ="},
};
for (const auto &c : cases) {
More information about the Commits
mailing list