commit c62f291: [Fix] Fix normalisation flags propagation
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue May 11 14:28:04 UTC 2021
Author: Vsevolod Stakhov
Date: 2021-05-11 15:13:15 +0100
URL: https://github.com/rspamd/rspamd/commit/c62f291c138c795eb6f4ec8ce0e59204f5de3ca2
[Fix] Fix normalisation flags propagation
---
src/libserver/html.c | 19 ++------
src/libserver/url.c | 30 ++++++-------
src/libserver/url.h | 125 ++++++++++++++++++++++++++++++---------------------
3 files changed, 91 insertions(+), 83 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 4cb46445f..c373bb115 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1593,21 +1593,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
url = rspamd_mempool_alloc0 (pool, sizeof (*url));
- enum rspamd_normalise_result norm_res;
-
- norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
-
- if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
- saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
-
- if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
- saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
-
- if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
- saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
- }
- }
+ rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
@@ -2644,6 +2630,9 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
}
+ if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) {
+ saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES;
+ }
rspamd_html_url_is_phished (pool, url,
url->visible_part,
diff --git a/src/libserver/url.c b/src/libserver/url.c
index d36704e73..eb663519d 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1339,7 +1339,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
if (!u_isalnum (uc)) {
/* Bad symbol */
if (IS_ZERO_WIDTH_SPACE (uc)) {
- (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
+ (*flags) |= RSPAMD_URL_FLAG_OBSCURED|RSPAMD_URL_FLAG_ZW_SPACES;
}
else {
if (!u_isgraph (uc)) {
@@ -2308,10 +2308,8 @@ rspamd_url_parse (struct rspamd_url *uri,
unquoted_len = rspamd_url_decode (rspamd_url_host_unsafe (uri),
rspamd_url_host_unsafe (uri), uri->hostlen);
- if (rspamd_normalise_unicode_inplace (pool,
- rspamd_url_host_unsafe (uri), &unquoted_len)) {
- uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
+ rspamd_url_normalise_propagate_flags (pool, rspamd_url_host_unsafe (uri),
+ &unquoted_len, uri->flags);
rspamd_url_shift (uri, unquoted_len, UF_HOST);
@@ -2380,10 +2378,10 @@ rspamd_url_parse (struct rspamd_url *uri,
if (uri->datalen) {
unquoted_len = rspamd_url_decode (rspamd_url_data_unsafe (uri),
rspamd_url_data_unsafe (uri), uri->datalen);
- if (rspamd_normalise_unicode_inplace (pool, rspamd_url_data_unsafe (uri),
- &unquoted_len)) {
- uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
+
+ rspamd_url_normalise_propagate_flags (pool, rspamd_url_data_unsafe (uri),
+ &unquoted_len, uri->flags);
+
rspamd_url_shift (uri, unquoted_len, UF_PATH);
/* We now normalize path */
rspamd_http_normalize_path_inplace (rspamd_url_data_unsafe (uri),
@@ -2395,10 +2393,9 @@ rspamd_url_parse (struct rspamd_url *uri,
unquoted_len = rspamd_url_decode (rspamd_url_query_unsafe (uri),
rspamd_url_query_unsafe (uri),
uri->querylen);
- if (rspamd_normalise_unicode_inplace (pool, rspamd_url_query_unsafe (uri),
- &unquoted_len)) {
- uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
+
+ rspamd_url_normalise_propagate_flags (pool, rspamd_url_query_unsafe (uri),
+ &unquoted_len, uri->flags);
rspamd_url_shift (uri, unquoted_len, UF_QUERY);
}
@@ -2406,10 +2403,9 @@ rspamd_url_parse (struct rspamd_url *uri,
unquoted_len = rspamd_url_decode (rspamd_url_fragment_unsafe (uri),
rspamd_url_fragment_unsafe (uri),
uri->fragmentlen);
- if (rspamd_normalise_unicode_inplace (pool, rspamd_url_fragment_unsafe (uri),
- &unquoted_len)) {
- uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
- }
+
+ rspamd_url_normalise_propagate_flags (pool, rspamd_url_fragment_unsafe (uri),
+ &unquoted_len, uri->flags);
rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 249c316e4..72fce5f9e 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -127,9 +127,9 @@ enum rspamd_url_find_type {
* Initialize url library
* @param cfg
*/
-void rspamd_url_init (const gchar *tld_file);
+void rspamd_url_init(const gchar *tld_file);
-void rspamd_url_deinit (void);
+void rspamd_url_deinit(void);
/*
* Parse urls inside text
@@ -138,10 +138,10 @@ void rspamd_url_deinit (void);
* @param part current text part
* @param is_html turn on html euristic
*/
-void rspamd_url_text_extract (rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- enum rspamd_url_find_type how);
+void rspamd_url_text_extract(rspamd_mempool_t *pool,
+ struct rspamd_task *task,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_url_find_type how);
/*
* Parse a single url into an uri structure
@@ -149,11 +149,11 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool,
* @param uristring text form of url
* @param uri url object, must be pre allocated
*/
-enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
- gchar *uristring,
- gsize len,
- rspamd_mempool_t *pool,
- enum rspamd_url_parse_flags flags);
+enum uri_errno rspamd_url_parse(struct rspamd_url *uri,
+ gchar *uristring,
+ gsize len,
+ rspamd_mempool_t *pool,
+ enum rspamd_url_parse_flags flags);
/*
* Try to extract url from a text
@@ -165,17 +165,17 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
* @param url_str storage for url string(or NULL)
* @return TRUE if url is found in specified text
*/
-gboolean rspamd_url_find (rspamd_mempool_t *pool,
- const gchar *begin, gsize len,
- gchar **url_str,
- enum rspamd_url_find_type how,
- goffset *url_pos,
- gboolean *prefix_added);
+gboolean rspamd_url_find(rspamd_mempool_t *pool,
+ const gchar *begin, gsize len,
+ gchar **url_str,
+ enum rspamd_url_find_type how,
+ goffset *url_pos,
+ gboolean *prefix_added);
/*
* Return text representation of url parsing error
*/
-const gchar *rspamd_url_strerror (int err);
+const gchar *rspamd_url_strerror(int err);
/**
@@ -185,10 +185,10 @@ const gchar *rspamd_url_strerror (int err);
* @param out output rspamd_ftok_t with tld position
* @return TRUE if tld has been found
*/
-gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
+gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out);
-typedef gboolean (*url_insert_function) (struct rspamd_url *url,
- gsize start_offset, gsize end_offset, void *ud);
+typedef gboolean (*url_insert_function)(struct rspamd_url *url,
+ gsize start_offset, gsize end_offset, void *ud);
/**
* Search for multiple urls in text and call `func` for each url found
@@ -199,12 +199,12 @@ typedef gboolean (*url_insert_function) (struct rspamd_url *url,
* @param func
* @param ud
*/
-void rspamd_url_find_multiple (rspamd_mempool_t *pool,
- const gchar *in, gsize inlen,
- enum rspamd_url_find_type how,
- GPtrArray *nlines,
- url_insert_function func,
- gpointer ud);
+void rspamd_url_find_multiple(rspamd_mempool_t *pool,
+ const gchar *in, gsize inlen,
+ enum rspamd_url_find_type how,
+ GPtrArray *nlines,
+ url_insert_function func,
+ gpointer ud);
/**
* Search for a single url in text and call `func` for each url found
@@ -215,11 +215,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool,
* @param func
* @param ud
*/
-void rspamd_url_find_single (rspamd_mempool_t *pool,
- const gchar *in, gsize inlen,
- enum rspamd_url_find_type how,
- url_insert_function func,
- gpointer ud);
+void rspamd_url_find_single(rspamd_mempool_t *pool,
+ const gchar *in, gsize inlen,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud);
/**
* Generic callback to insert URLs into rspamd_task
@@ -228,9 +228,9 @@ void rspamd_url_find_single (rspamd_mempool_t *pool,
* @param end_offset
* @param ud
*/
-gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
- gsize start_offset,
- gsize end_offset, gpointer ud);
+gboolean rspamd_url_task_subject_callback(struct rspamd_url *url,
+ gsize start_offset,
+ gsize end_offset, gpointer ud);
/**
* Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
@@ -239,7 +239,7 @@ gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
* @param size
* @return
*/
-gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
+gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size);
/**
* Encode url if needed. In this case, memory is allocated from the specific pool.
@@ -248,8 +248,8 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
* @param pool
* @return
*/
-const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
- rspamd_mempool_t *pool);
+const gchar *rspamd_url_encode(struct rspamd_url *url, gsize *dlen,
+ rspamd_mempool_t *pool);
/**
@@ -257,14 +257,14 @@ const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
* @param c
* @return
*/
-gboolean rspamd_url_is_domain (int c);
+gboolean rspamd_url_is_domain(int c);
/**
* Returns symbolic name for protocol
* @param proto
* @return
*/
-const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto);
+const gchar *rspamd_url_protocol_name(enum rspamd_url_protocol proto);
/**
@@ -272,7 +272,7 @@ const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto);
* @param str
* @return
*/
-enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str);
+enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str);
/**
* Converts string to a url flag
@@ -280,14 +280,14 @@ enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str);
* @param flag
* @return
*/
-bool rspamd_url_flag_from_string (const gchar *str, gint *flag);
+bool rspamd_url_flag_from_string(const gchar *str, gint *flag);
/**
* Converts url flag to a string
* @param flag
* @return
*/
-const gchar * rspamd_url_flag_to_string (int flag);
+const gchar *rspamd_url_flag_to_string(int flag);
/* Defines sets of urls indexed by url as is */
KHASH_DECLARE (rspamd_url_hash, struct rspamd_url *, char);
@@ -310,24 +310,25 @@ bool rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set,
* @param u
* @return
*/
-struct rspamd_url * rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
- struct rspamd_url *u);
+struct rspamd_url *rspamd_url_set_add_or_return(khash_t (rspamd_url_hash) *set,
+ struct rspamd_url *u);
/**
* Helper for url host set
* @param set
* @param u
* @return
*/
-bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
- struct rspamd_url *u);
+bool rspamd_url_host_set_add(khash_t (rspamd_url_host_hash) *set,
+ struct rspamd_url *u);
/**
* Checks if a url is in set
* @param set
* @param u
* @return
*/
-bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
-bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
+bool rspamd_url_set_has(khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
+
+bool rspamd_url_host_set_has(khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);
/**
* Compares two urls (similar to C comparison functions) lexicographically
@@ -335,15 +336,37 @@ bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd
* @param u2
* @return
*/
-int rspamd_url_cmp (const struct rspamd_url *u1, const struct rspamd_url *u2);
+int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2);
+
/**
* Same but used for qsort to sort `struct rspamd_url *[]` array
* @param u1
* @param u2
* @return
*/
-int rspamd_url_cmp_qsort (const void *u1, const void *u2);
+int rspamd_url_cmp_qsort(const void *u1, const void *u2);
+/**
+ * Run rspamd_normalise_unicode_inplace on input and OR the matching url flags into url_flags_out: NORM_UNNORMAL -> FLAG_UNNORMALISED, NORM_ZERO_SPACES -> FLAG_ZW_SPACES, NORM_ERROR -> FLAG_OBSCURED
+ * @param pool memory pool forwarded to rspamd_normalise_unicode_inplace
+ * @param input text buffer forwarded to the normaliser (presumably rewritten in place, judging by the callee's name)
+ * @param len_out pointer to the length of input (must be &var); forwarded to the normaliser
+ * @param url_flags_out lvalue that receives the flags (e.g. saved_flags or uri->flags); pass the lvalue itself, not its address, and keep it free of side effects as it is expanded up to three times
+ */
+#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
+ do { \
+ enum rspamd_normalise_result norm_res; \
+ norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out)); \
+ if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \
+ url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \
+ } \
+ if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { \
+ url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES; \
+ } \
+ if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) { \
+ url_flags_out |= RSPAMD_URL_FLAG_OBSCURED; \
+ } \
+ } while(0)
#ifdef __cplusplus
}
#endif
More information about the Commits
mailing list