commit f1e6d84: [Minor] More heuristics in HTML urls detection
Vsevolod Stakhov
vsevolod at highsecure.ru
Thu Feb 21 13:49:03 UTC 2019
Author: Vsevolod Stakhov
Date: 2019-02-21 13:42:05 +0000
URL: https://github.com/rspamd/rspamd/commit/f1e6d84387a1faf4048b2faf5dc268bc255b1152 (HEAD -> master)
[Minor] More heuristics in HTML urls detection
---
src/libserver/html.c | 72 ++++++++++++++++++++++++++++------------------------
1 file changed, 39 insertions(+), 33 deletions(-)
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 8ade5a61e..3353db7b7 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1301,7 +1301,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
gchar *decoded;
gint rc;
gsize decoded_len;
- const gchar *p, *s;
+ const gchar *p, *s, *prefix = "http://";
gchar *d;
guint i, dlen;
gboolean has_bad_chars = FALSE, no_prefix = FALSE;
@@ -1347,44 +1347,50 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
}
if (rspamd_substring_search (start, len, "://", 3) == -1) {
- /* We have no prefix */
- dlen += sizeof ("http://") - 1;
- no_prefix = TRUE;
+ if (len >= sizeof ("mailto:") &&
+ (memcmp (start, "mailto:", sizeof ("mailto:") - 1) == 0 ||
+ memcmp (start, "tel:", sizeof ("tel:") - 1) == 0)) {
+ /* Exclusion, has valid but 'strange' prefix */
+ }
+ else {
+ for (i = 0; i < len; i ++) {
+ if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
+ if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
+ prefix = "http:";
+ dlen += sizeof ("http:") - 1;
+ no_prefix = TRUE;
+ }
+ else if (s[i] == '@') {
+ /* Likely email prefix */
+ prefix = "mailto://";
+ dlen += sizeof ("mailto://") - 1;
+ no_prefix = TRUE;
+ }
+ else {
+ if (i == 0) {
+ /* No valid data */
+ return NULL;
+ }
+ }
+
+ break;
+ }
+ }
+
+ if (!no_prefix) {
+ no_prefix = TRUE;
+ dlen += strlen (prefix);
+ }
+ }
}
decoded = rspamd_mempool_alloc (pool, dlen + 1);
d = decoded;
if (no_prefix) {
- if (s[0] == '/' && (len > 2 && s[1] == '/')) {
- /* //bla case */
- memcpy (d, "http:", sizeof ("http:") - 1);
- d += sizeof ("http:") - 1;
- }
- else if (s[0] == '\\' && (len > 2 && s[1] == '\\')) {
- /* Likely SMB share, ignore */
- return NULL;
- }
- else {
- if (s[0] == '.') {
- /*
- * We have relative URL without base URL:
- * the former is covered by caller function which
- * checks for the base URL.
- *
- * In the most cases, it is caused by a broken client
- */
- return NULL;
- }
- else if ((s[0] & 0x80) || g_ascii_isalnum (s[0])) {
- memcpy (d, "http://", sizeof ("http://") - 1);
- d += sizeof ("http://") - 1;
- }
- else {
- /* Some crap */
- return NULL;
- }
- }
+ gsize plen = strlen (prefix);
+ memcpy (d, prefix, plen);
+ d += plen;
}
/*
More information about the Commits mailing list