commit 69b1c2e: [Project] Lua_content: Add some pdf support

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Nov 26 17:35:08 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-11-26 16:30:48 +0000
URL: https://github.com/rspamd/rspamd/commit/69b1c2e53f9427a6529f4a1b296de8f8f1ee7322

[Project] Lua_content: Add some pdf support

---
 lualib/lua_content/init.lua |   6 +++
 lualib/lua_content/pdf.lua  | 126 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)

diff --git a/lualib/lua_content/init.lua b/lualib/lua_content/init.lua
index 06d509e00..c23ca9d09 100644
--- a/lualib/lua_content/init.lua
+++ b/lualib/lua_content/init.lua
@@ -31,6 +31,12 @@ local content_modules = {
     extensions = {'ical'},
     output = "text"
   },
+  pdf = {
+    mime_type = "application/pdf",
+    module = require "lua_content/pdf",
+    extensions = {'pdf'},
+    output = "table"
+  },
 }
 
 local modules_by_mime_type
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
new file mode 100644
index 000000000..9ff3d0260
--- /dev/null
+++ b/lualib/lua_content/pdf.lua
@@ -0,0 +1,126 @@
+--[[
+Copyright (c) 2019, Vsevolod Stakhov <vsevolod at highsecure.ru>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+--[[[
+-- @module lua_content/pdf
+-- This module contains some heuristics for PDF files
+--]]
+
+local rspamd_trie = require "rspamd_trie"
+local bit = require "bit"
+local pdf_trie
+local N = "lua_content"
+local lua_util = require "lua_util"
+-- Pattern groups that are compiled into the PDF trie.
+-- The key of each group (e.g. `trailer`) is also used to look up the
+-- handler for its matches in the `processors` table.
+local pdf_patterns = {
+  trailer = {
+    patterns = {
+      -- newline, the literal `trailer`, an optional CR and a newline
+      [[\ntrailer\r?\n]]
+    }
+  }
+}
+
+-- index[n] ->
+--  t[1] - key in patterns table,
+--  t[2] - value in patterns table,
+--  t[3] - pattern,
+--  t[4] - local pattern index
+local pdf_indexes = {}
+local exports = {}
+
+-- Used to process patterns found in PDF
+-- positions for functional processors should be an iter/table from trie matcher in form
+---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
+---- pat_idxn is pattern index and n1 ... nn are match positions
+local processors = {}
+
+--- Builds the module-wide PDF trie unless it has been built already.
+-- Every pattern of every group in `pdf_patterns` is appended to one flat
+-- list; for each of them a `{key, value, pattern, local index}` entry is
+-- appended to `pdf_indexes`. The trie created from the flat list is stored
+-- in `pdf_trie`.
+local function compile_tries()
+  local trie_flags = rspamd_trie.flags
+  local default_compile_flags = bit.bor(trie_flags.re, trie_flags.dot_all,
+      trie_flags.single_match, trie_flags.no_start)
+
+  if pdf_trie then
+    -- Already compiled
+    return
+  end
+
+  -- Flattens `patterns` into an array of pattern strings, appends one
+  -- descriptor per pattern to `indexes` and creates a trie from the array.
+  -- `compile_flags` falls back to `default_compile_flags` when omitted.
+  local function compile_pats(patterns, indexes, compile_flags)
+    local flat = {}
+
+    for group_name,group in pairs(patterns) do
+      for local_idx,pattern in ipairs(group.patterns) do
+        table.insert(flat, pattern)
+        table.insert(indexes, {group_name, group, pattern, local_idx})
+      end
+    end
+
+    if not compile_flags then
+      compile_flags = default_compile_flags
+    end
+
+    return rspamd_trie.create(flat, compile_flags)
+  end
+
+  pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
+end
+
+-- Call immediately on require
+compile_tries()
+
+--- Runs the PDF heuristics over a document.
+-- Matches `input` against `pdf_trie`, groups the reported positions by the
+-- pattern group they belong to (looked up through `pdf_indexes`) and calls
+-- the function registered under the same key in `processors` once per
+-- group. Groups without a registered processor are skipped.
+-- @param input data to scan; passed to `pdf_trie:match` and handed to the
+--   processors unchanged
+-- @param _ unused
+-- @param task passed to debug logging and to the processors
+-- @return table filled in by the processors; nothing when the trie
+--   reports no matches
+local function process_pdf(input, _, task)
+  local matches = pdf_trie:match(input)
+
+  if not matches then
+    return
+  end
+
+  local pdf_output = {}
+  local grouped_processors = {}
+
+  for npat,matched_positions in pairs(matches) do
+    -- npat indexes the flat pattern list, pdf_indexes maps it back
+    -- to the group key and the pattern number inside that group
+    local index = pdf_indexes[npat]
+    local proc_key,loc_npat = index[1], index[4]
+
+    local proc = grouped_processors[proc_key]
+    if not proc then
+      proc = {
+        processor_func = processors[proc_key],
+        offsets = {},
+      }
+      grouped_processors[proc_key] = proc
+    end
+
+    -- Fill offsets
+    for _,pos in ipairs(matched_positions) do
+      proc.offsets[#proc.offsets + 1] = {pos, loc_npat}
+    end
+  end
+
+  for name,processor in pairs(grouped_processors) do
+    if processor.processor_func then
+      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
+          name, #processor.offsets)
+      -- Sort by offset
+      table.sort(processor.offsets, function(e1, e2) return e1[1] < e2[1] end)
+      processor.processor_func(input, task, processor.offsets, pdf_output)
+    else
+      -- A pattern group without a handler must not abort the whole run
+      lua_util.debugm(N, task, "pdf: no processor for group %s, skip %s matches",
+          name, #processor.offsets)
+    end
+  end
+
+  return pdf_output
+end
+
+--- Processes the PDF trailer.
+-- Looks only at the last entry of `positions` (the caller sorts them by
+-- offset) and sets `output.encrypted` when a line of the span taken at
+-- that offset contains `/Encrypt `.
+-- @param input scanned data; must provide `span`
+--   (presumably an rspamd text object - TODO confirm)
+-- @param task used for debug logging only
+-- @param positions array of `{offset, local pattern index}` pairs
+-- @param output table that receives the `encrypted` flag
+processors.trailer = function(input, task, positions, output)
+  local final_match = positions[#positions]
+  local trailer_span = input:span(final_match[1])
+
+  for ln in trailer_span:lines(true) do
+    local encrypt_at = ln:find('/Encrypt ')
+
+    if encrypt_at then
+      lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
+          ln)
+      output.encrypted = true
+    end
+  end
+end
+
+exports.process = process_pdf
+
+return exports
\ No newline at end of file


More information about the Commits mailing list