commit 78ce6c3: [Minor] Lua_content: Make text/urls extraction optional

Vsevolod Stakhov vsevolod at highsecure.ru
Sun Jan 19 09:56:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-19 09:39:06 +0000
URL: https://github.com/rspamd/rspamd/commit/78ce6c313d9b8d1a104ba15d27363b5303cdc6c8

[Minor] Lua_content: Make text/urls extraction optional

---
 lualib/lua_content/pdf.lua | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 85f939869..b577677e8 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -122,6 +122,8 @@ local exports = {}
 local config = {
   max_extraction_size = 512 * 1024,
   max_processing_size = 32 * 1024,
+  text_extraction = false, -- NYI feature
+  url_extraction = true,
   enabled = true,
 }
 
@@ -626,7 +628,11 @@ local function process_dict(task, pdf, obj, dict)
 
         if obj.fonts[k] then
           local font = obj.fonts[k]
-          process_font(task, pdf, font, k)
+
+          if config.text_extraction then
+            process_font(task, pdf, font, k)
+          end
+
           lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
               k, obj.major, obj.minor, font)
         end
@@ -1047,8 +1053,12 @@ local function process_pdf(input, _, task)
     if pdf_output.start_objects and pdf_output.end_objects then
       -- Postprocess objects
       postprocess_pdf_objects(task, input, pdf_output)
-      search_text(task, pdf_output)
-      search_urls(task, pdf_output)
+      if config.text_extraction then
+        search_text(task, pdf_output)
+      end
+      if config.url_extraction then
+        search_urls(task, pdf_output)
+      end
     else
       pdf_output.flags.no_objects = true
     end


More information about the Commits mailing list