# [An Extremely Opinionated Annotated List of My Favourite Mechanistic Interpretability Papers v2 by Neel Nanda 7th Jul 2024](https://www.lesswrong.com/posts/NfFST5Mio7BCAQHPA/an-extremely-opinionated-annotated-list-of-my-favourite)

### Foundational Work
* [A Mathematical Framework for Transformer Circuits](https://transformer-circuits.pub/2021/framework/index.html)
    * [my YouTube tutorial](https://www.youtube.com/watch?v=KV5gbOmHbjU&list=PL7m7hLIqA0hpsJYYhlt1WbHHgdfRLM2eY&index=1&pp=gAQBiAQB)
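
If the QK/OV framing is new to you, here is a toy numpy sketch of the paper's central factorisation (dimensions and names are my own illustrative choices, causal masking omitted): the attention pattern depends only on the low-rank product W_Q W_K^T, and what gets moved depends only on the low-rank product W_V W_O.

```python
import numpy as np

rng = np.random.default_rng(0)
n_ctx, d_model, d_head = 4, 8, 2

x = rng.normal(size=(n_ctx, d_model))                      # residual stream
W_Q, W_K, W_V = (rng.normal(size=(d_model, d_head)) for _ in range(3))
W_O = rng.normal(size=(d_head, d_model))

# QK circuit: attention scores depend only on the product W_Q @ W_K.T
scores = x @ (W_Q @ W_K.T) @ x.T / np.sqrt(d_head)
pattern = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)

# OV circuit: what the head writes depends only on the product W_V @ W_O
head_out = pattern @ x @ (W_V @ W_O)

# Identical to the usual q/k/v formulation of the same head:
q, k, v = x @ W_Q, x @ W_K, x @ W_V
assert np.allclose(scores, q @ k.T / np.sqrt(d_head))
assert np.allclose(head_out, pattern @ v @ W_O)
```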

### Superposition
* [Toy models of superposition (Nelson Elhage et al, Anthropic)](https://transformer-circuits.pub/2022/toy_model/index.html)
* [Finding Neurons In A Haystack (Wes Gurnee et al, during my MATS program)](https://arxiv.org/pdf/2305.01610.pdf)
* [Fact Finding](https://www.lesswrong.com/posts/iGuwZTHWb6DFY3sKB/fact-finding-attempting-to-reverse-engineer-factual-recall)
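
To make "superposition" concrete, here is a minimal PyTorch sketch of the Toy Models setup (all hyperparameters are my own illustrative choices): 20 sparse features squeezed through a 5-dimensional bottleneck, reconstructed as ReLU(WᵀWx + b). At high sparsity the model learns to store more features than it has dimensions.

```python
import torch

torch.manual_seed(0)
n_features, d_hidden, batch = 20, 5, 256
W = torch.nn.Parameter(0.1 * torch.randn(d_hidden, n_features))
b = torch.nn.Parameter(torch.zeros(n_features))
opt = torch.optim.Adam([W, b], lr=1e-3)

for step in range(3000):
    # Synthetic data: each feature is active only ~5% of the time.
    x = torch.rand(batch, n_features) * (torch.rand(batch, n_features) < 0.05)
    x_hat = torch.relu(x @ W.T @ W + b)          # reconstruct through the bottleneck
    loss = ((x - x_hat) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

# The model stores >5 features as almost-orthogonal, mutually interfering
# directions: look at the off-diagonal entries of the Gram matrix.
print((W.T @ W).detach().round(decimals=2))
```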

### Sparse Autoencoders
* [Towards Monosemanticity (Trenton Bricken et al, Anthropic)](https://transformer-circuits.pub/2023/monosemantic-features/index.html)
* [Sparse Feature Circuits (Sam Marks et al, David Bau's group)](https://arxiv.org/abs/2403.19647)
* [Transcoders Find Interpretable LLM Feature Circuits (Jacob Dunefsky, Philippe Chlenski et al, during my MATS program)](https://arxiv.org/abs/2406.11944)
* [Interpreting Attention Layer Outputs with Sparse Autoencoders (Connor Kissane & Rob Krzyzanowski et al, during my MATS program)](https://arxiv.org/abs/2406.17759)
* [Towards principled evaluations of sparse autoencoders for interpretability and control (Alex Makelov & Georg Lange et al, during my MATS program).](https://arxiv.org/abs/2405.08366)
* [Gated SAEs (Sen Rajamanoharan et al, from my team at DeepMind)](https://arxiv.org/abs/2404.16014)
* [Scaling and evaluating sparse autoencoders (Leo Gao et al, from the OpenAI superalignment team, RIP 😢)](https://cdn.openai.com/papers/sparse-autoencoders.pdf)
* [Scaling monosemanticity (Adly Templeton et al, Anthropic)](https://transformer-circuits.pub/2024/scaling-monosemanticity/index.html)
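
For orientation, the core recipe behind all of these is small: a wide ReLU autoencoder trained to reconstruct a model's activations, with an L1 penalty pushing the feature activations to be sparse (Gated and TopK SAEs swap out the ReLU + L1 combination to reduce shrinkage). A minimal PyTorch sketch of the Towards Monosemanticity-style loss, with sizes, names and coefficients that are my own illustrative choices:

```python
import torch

d_model, d_sae, l1_coeff = 512, 4096, 1e-3  # dictionary is ~8x overcomplete

W_enc = torch.nn.Parameter(0.01 * torch.randn(d_model, d_sae))
b_enc = torch.nn.Parameter(torch.zeros(d_sae))
W_dec = torch.nn.Parameter(0.01 * torch.randn(d_sae, d_model))
b_dec = torch.nn.Parameter(torch.zeros(d_model))

def sae_loss(x):
    """x: (batch, d_model) activations cached from one site in the model."""
    f = torch.relu((x - b_dec) @ W_enc + b_enc)  # sparse feature activations
    x_hat = f @ W_dec + b_dec                    # reconstruction from features
    mse = ((x - x_hat) ** 2).sum(-1).mean()      # reconstruction term
    l1 = f.abs().sum(-1).mean()                  # sparsity penalty
    return mse + l1_coeff * l1
```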

### Activation Patching
* [How to use and interpret activation patching (Stefan Heimersheim & me)](https://arxiv.org/abs/2404.15255)
* [Causal scrubbing (Redwood)](https://www.lesswrong.com/posts/JvZhhzycHu2Yd57RN/causal-scrubbing-a-method-for-rigorously-testing)
* [Attribution Patching (me, work mostly done at Anthropic)](https://www.neelnanda.io/mechanistic-interpretability/attribution-patching)
* [Automated Circuit Discovery (Arthur Conmy et al)](https://arxiv.org/abs/2304.14997)
* [Distributed Alignment Search (Atticus Geiger et al)](https://arxiv.org/abs/2303.02536)
* [An Interpretability Illusion for Subspace Activation Patching (Aleksandar Makelov & Georg Lange, during my MATS program)](https://arxiv.org/abs/2311.17030)
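
If you've never written an activation patch, here is a minimal denoising-style sketch on the IOI task using TransformerLens (illustrative code of mine, not taken from any paper above): cache a clean run, re-run the corrupted prompt while splicing the clean residual stream back in at one layer, and measure how much of the clean logit difference is restored.

```python
from functools import partial
from transformer_lens import HookedTransformer, utils

model = HookedTransformer.from_pretrained("gpt2")
clean = model.to_tokens("When John and Mary went to the store, John gave a drink to")
corrupt = model.to_tokens("When John and Mary went to the store, Mary gave a drink to")
answer = model.to_single_token(" Mary")
distractor = model.to_single_token(" John")

_, clean_cache = model.run_with_cache(clean)

def patch_resid(resid, hook, pos):
    # Overwrite the corrupted run's residual stream with the clean run's value.
    resid[:, pos] = clean_cache[hook.name][:, pos]
    return resid

def logit_diff(logits):
    return (logits[0, -1, answer] - logits[0, -1, distractor]).item()

# Patch the final position at each layer; a real experiment sweeps positions too.
for layer in range(model.cfg.n_layers):
    hook_name = utils.get_act_name("resid_pre", layer)
    patched = model.run_with_hooks(
        corrupt, fwd_hooks=[(hook_name, partial(patch_resid, pos=-1))]
    )
    print(layer, logit_diff(patched))
```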

### Narrow Circuits
* [Indirect Object Identification (Kevin Wang et al, Redwood)](https://arxiv.org/abs/2211.00593)
* [A Greater-Than Circuit (Michael Hanna et al, Redwood)](https://arxiv.org/abs/2305.00586)
* [Does Circuit Analysis Interpretability Scale? (Tom Lieberum et al, DeepMind)](https://arxiv.org/abs/2307.09458)

### Extra
* [The induction heads paper (Catherine Olsson et al, Anthropic)](https://transformer-circuits.pub/2022/in-context-learning-and-induction-heads/index.html)
* [Progress Measures for Grokking via Mechanistic Interpretability (Neel Nanda et al)](https://arxiv.org/abs/2301.05217)
* [Logit Lens (nostalgebraist)](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens) (see the code sketch at the end of this list)
* [Activation Addition (Alex Turner et al, done with his MATS scholars before joining Google DeepMind)](https://arxiv.org/abs/2308.10248)
* [Inference-Time Interventions (Kenneth Li et al, Harvard)](https://arxiv.org/abs/2306.03341)
* [Representation Engineering (Andy Zou et al, a CAIS project)](https://arxiv.org/abs/2310.01405)
* [Refusal in Language Models Is Mediated by a Single Direction (Andy Arditi et al, during my MATS program)](https://arxiv.org/abs/2406.11717)
* [The Hydra Effect (Tom McGrath et al, Google DeepMind)](https://arxiv.org/abs/2307.15771)
* [Explorations of Self-Repair in Language Models (Cody Rushing et al, during my MATS program)](https://arxiv.org/abs/2402.15390v1)
* [Copy Suppression (Callum McDougall, Arthur Conmy & Cody Rushing et al, during my MATS program)](https://arxiv.org/abs/2310.04625)
* [Linear Representations of Sentiment (Curt Tigges & Oskar Hollinsworth et al, during my MATS program)](https://arxiv.org/pdf/2310.15154.pdf)
* [Softmax Linear Units (Nelson Elhage et al, Anthropic)](https://transformer-circuits.pub/2022/solu/index.html)
* [Language models can explain neurons in language models (Steven Bills et al, OpenAI)](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html)
* [An Interpretability Illusion for BERT (Tolga Bolukbasi et al, Google)](https://arxiv.org/pdf/2104.07143.pdf)
* [Multimodal Neurons in Artificial Neural Networks (Gabriel Goh et al, OpenAI)](https://distill.pub/2021/multimodal-neurons/)
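
As promised above, a minimal logit lens sketch via TransformerLens (illustrative code of mine): project each layer's residual stream through the final LayerNorm and the unembedding, and watch the model's running guess for the next token take shape.

```python
from transformer_lens import HookedTransformer

model = HookedTransformer.from_pretrained("gpt2")
tokens = model.to_tokens("The Eiffel Tower is in the city of")
_, cache = model.run_with_cache(tokens)

for layer in range(model.cfg.n_layers):
    resid = cache["resid_post", layer][:, -1]    # (1, d_model) at the final position
    logits = model.ln_final(resid) @ model.W_U   # final LayerNorm, then unembed
    guess = logits.argmax(-1).item()
    print(layer, repr(model.tokenizer.decode([guess])))  # later layers: " Paris"
```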