commit 5add81d: [Feature] Lua_mime: Add ability to do multipattern replacement

Vsevolod Stakhov vsevolod at
Tue Oct 27 15:21:08 UTC 2020

Author: Vsevolod Stakhov
Date: 2020-10-27 15:15:39 +0000
[Feature] Lua_mime: Add ability to do multipattern replacement

 lualib/lua_mime.lua | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)

diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua
index 1f0fb38f2..5088f768d 100644
--- a/lualib/lua_mime.lua
+++ b/lualib/lua_mime.lua
@@ -249,6 +249,256 @@ exports.add_text_footer = function(task, html_footer, text_footer)
   return state
+-- Rewrites one text part: every multipattern match found in the part's UTF-8
+-- content is substituted with the replacement registered for the matching
+-- pattern, and the re-encoded result is appended to `out`.
+-- Parts without any match are appended to `out` untouched (raw headers + raw
+-- content) and no new headers / transfer encoding are announced for them.
+-- @param task rspamd task (used to query the newline type for QP encoding)
+-- @param part mime part to process (must be a text part)
+-- @param mp multipattern object; `mp:match(content, true)` is expected to return
+--   a table `pattern index -> list of {start, end}` or a false value on no match
+-- @param replacements table `pattern index -> replacement`
+-- @param is_multipart truthy if the part lives inside a multipart container, so
+--   its own Content-Type/Content-Transfer-Encoding headers must be written
+-- @param out output array being assembled by the caller
+-- @param state shared state table; `state.newline_s` is read and
+--   `state.new_cte` is set for non-multipart messages that were modified
+local function do_replacement (task, part, mp, replacements,
+                               is_multipart, out, state)
+  local tp = part:get_text()
+  local ct = 'text/plain'
+  local cte = 'quoted-printable'
+  local newline_s = state.newline_s
+  if tp:is_html() then
+    ct = 'text/html'
+  end
+  -- By default the modified body is re-encoded as quoted-printable
+  local encode_func = function(input)
+    return rspamd_util.encode_qp(input, 80, task:get_newlines_type())
+  end
+  if part:get_cte() == '7bit' then
+    -- 7bit parts are kept as is, we only need to ensure that we return a text
+    cte = '7bit'
+    encode_func = function(input)
+      if type(input) == 'userdata' then
+        return input
+      else
+        return rspamd_text.fromstring(input)
+      end
+    end
+  end
+  local content = tp:get_content('raw_utf') or rspamd_text.fromstring('')
+  local match_pos = mp:match(content, true)
+  -- Flatten matches into a single list of { {start, end}, pattern index }
+  local matches_flattened = {}
+  if match_pos then
+    for npat,matches in pairs(match_pos) do
+      for _,m in ipairs(matches) do
+        matches_flattened[#matches_flattened + 1] = {m, npat}
+      end
+    end
+  end
+  if #matches_flattened == 0 then
+    -- Nothing to replace: pass the part through unchanged. This check must
+    -- come before any new header is written, otherwise the part would carry
+    -- both the generated and the original headers (and a wrong CTE).
+    out[#out + 1] = {part:get_raw_headers(), true}
+    out[#out + 1] = {part:get_raw_content(), false}
+    return
+  end
+  -- The part is going to be modified, so announce the new encoding
+  if is_multipart then
+    out[#out + 1] = string.format('Content-Type: %s; charset=utf-8%s'..
+        'Content-Transfer-Encoding: %s',
+        ct, newline_s, cte)
+    out[#out + 1] = ''
+  else
+    state.new_cte = cte
+  end
+  -- Sort by start of match and collapse matches sharing the same start into a
+  -- single entry: the longest one wins (the first of them on equal length).
+  -- A fresh table is built so tables returned by mp:match are not mutated.
+  table.sort(matches_flattened, function(m1, m2) return m1[1][1] < m2[1][1] end)
+  local merged = {}
+  for _,m in ipairs(matches_flattened) do
+    local last = merged[#merged]
+    if last and last.s == m[1][1] then
+      if m[1][2] > last.e then
+        -- larger exclusion and switch replacement
+        last.e = m[1][2]
+        last.npat = m[2]
+      end
+    else
+      merged[#merged + 1] = {s = m[1][1], e = m[1][2], npat = m[2]}
+    end
+  end
+  -- Interleave untouched spans of the content with replacements.
+  -- NOTE(review): match end is treated as inclusive here (next unmatched byte
+  -- is end + 1), as in the original code - confirm against multipattern API.
+  local cur_start = 1
+  local fragments = {}
+  for _,m in ipairs(merged) do
+    -- `>=` so that a match at the very first byte, or directly adjacent to
+    -- the previous match, is replaced as well; matches starting inside an
+    -- already replaced region are still skipped
+    if m.s >= cur_start then
+      if m.s > cur_start then
+        fragments[#fragments + 1] = content:span(cur_start, m.s - cur_start)
+      end
+      fragments[#fragments + 1] = replacements[m.npat]
+      cur_start = m.e + 1 -- first byte after the match
+    end
+  end
+  -- last part (`<=` keeps a single trailing byte)
+  if cur_start <= #content then
+    fragments[#fragments + 1] = content:span(cur_start)
+  end
+  -- Final stuff
+  out[#out + 1] = {encode_func(rspamd_text.fromtable(fragments)), true}
+  out[#out + 1] = ''
+end
+-- @function lua_mime.multipattern_text_replace(task, mp, replacements)
+-- Replaces text in all eligible text parts of a message according to
+-- multipattern matches. Attachments, non-text parts and children of
+-- multipart/signed are copied unchanged.
+-- Returns `false` if `mp` is not given or the task has no text parts.
+-- Otherwise returns a table with the following fields:
+-- * out: new content (body only); an array whose elements are either plain
+--   strings or `{data, flag}` pairs (flag semantics are defined by the
+--   consumer of `out` - presumably "append newline", TODO confirm)
+-- * need_rewrite_ct: true if the top-level content type must be rewritten
+-- * new_ct: new content type (type => string, subtype => string)
+-- * new_cte: new content-transfer encoding (string); set only for parts that
+--   are not inside a multipart container
+-- * newline_s: newline sequence used when generating headers
+exports.multipattern_text_replace = function(task, mp, replacements)
+  local newline_s = newline(task)
+  local state = {
+    newline_s = newline_s
+  }
+  local out = {}
+  local text_parts = task:get_text_parts()
+  if not mp or not (text_parts and #text_parts > 0) then
+    return false
+  end
+  -- We need to take extra care about content-type and cte
+  local ct = task:get_header('Content-Type')
+  if ct then
+    ct = rspamd_util.parse_content_type(ct, task:get_mempool())
+  end
+  if ct then
+    if ct.type and ct.type == 'text' then
+      state.need_rewrite_ct = true
+      state.new_ct = ct
+    end
+  else
+    -- No explicit CT, need to guess; text_parts is known to be a non-empty
+    -- table at this point (see the early return above)
+    if #text_parts == 1 then
+      state.need_rewrite_ct = true
+      state.new_ct = {
+        type = 'text',
+        subtype = 'plain'
+      }
+    elseif #text_parts > 1 then
+      -- XXX: in fact, it cannot be
+      state.new_ct = {
+        type = 'multipart',
+        subtype = 'mixed'
+      }
+    end
+  end
+  -- Stack of currently open multipart boundaries and the innermost one
+  local boundaries = {}
+  local cur_boundary
+  for _,part in ipairs(task:get_parts()) do
+    local boundary = part:get_boundary()
+    if part:is_multipart() then
+      -- A nested container starts as a regular part of the enclosing one
+      if cur_boundary then
+        out[#out + 1] = string.format('--%s',
+            boundaries[#boundaries])
+      end
+      boundaries[#boundaries + 1] = boundary or '--XXX'
+      cur_boundary = boundary
+      local rh = part:get_raw_headers()
+      if #rh > 0 then
+        out[#out + 1] = {rh, true}
+      end
+    elseif part:is_message() then
+      if boundary then
+        if cur_boundary and boundary ~= cur_boundary then
+          -- Need to close boundary
+          out[#out + 1] = string.format('--%s--%s',
+              boundaries[#boundaries], newline_s)
+          table.remove(boundaries)
+          cur_boundary = nil
+        end
+        out[#out + 1] = string.format('--%s',
+            boundary)
+      end
+      out[#out + 1] = {part:get_raw_headers(), true}
+    else
+      -- Leaf part: decide whether its content may be modified
+      local skip_replacement = part:is_attachment()
+      local parent = part:get_parent()
+      if parent then
+        local t,st = parent:get_type()
+        if t == 'multipart' and st == 'signed' then
+          -- Do not modify signed parts
+          skip_replacement = true
+        end
+      end
+      if not part:is_text() then
+        skip_replacement = true
+      end
+      if boundary then
+        if cur_boundary and boundary ~= cur_boundary then
+          -- Need to close boundary
+          out[#out + 1] = string.format('--%s--%s',
+              boundaries[#boundaries], newline_s)
+          table.remove(boundaries)
+          cur_boundary = boundary
+        end
+        out[#out + 1] = string.format('--%s',
+            boundary)
+      end
+      if not skip_replacement then
+        do_replacement(task, part, mp, replacements,
+            parent and parent:is_multipart(), out, state)
+      else
+        -- Append as is
+        out[#out + 1] = {part:get_raw_headers(), true}
+        out[#out + 1] = {part:get_raw_content(), false}
+      end
+    end
+  end
+  -- Close remaining boundaries, innermost first
+  local b = table.remove(boundaries)
+  while b do
+    out[#out + 1] = string.format('--%s--', b)
+    if #boundaries > 0 then
+      out[#out + 1] = ''
+    end
+    b = table.remove(boundaries)
+  end
+  state.out = out
+  return state
+end
 -- All mime extensions with corresponding content types
 exports.full_extensions_map = {
   {"323", "text/h323"},

