commit b5aea5a: [Project] Lua_content: Start pdf grammar implementation for texts
Vsevolod Stakhov
vsevolod at highsecure.ru
Mon Jan 6 14:56:11 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-01-06 14:42:25 +0000
URL: https://github.com/rspamd/rspamd/commit/b5aea5adb819d74ab97b764dc08f697ee672c6e2
[Project] Lua_content: Start pdf grammar implementation for texts
---
lualib/lua_content/pdf.lua | 100 +++++++++++++++++++++++++++++++++++++++++----
1 file changed, 93 insertions(+), 7 deletions(-)
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 91f317dbe..2bc9e5ce5 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -21,6 +21,7 @@ limitations under the License.
local rspamd_trie = require "rspamd_trie"
local rspamd_util = require "rspamd_util"
+local rspamd_text = require "rspamd_text"
local bit = require "bit"
local N = "lua_content"
local lua_util = require "lua_util"
@@ -53,7 +54,7 @@ local pdf_patterns = {
},
start_object = {
patterns = {
- [[\n\s*\d+ \d+ obj\r?\n]]
+ [=[[\r\n]\s*\d+ \d+ obj[\r\n]]=]
}
},
end_object = {
@@ -63,7 +64,7 @@ local pdf_patterns = {
},
start_stream = {
patterns = {
- [[>\s*stream\r?\n]],
+ [=[>\s*stream[\r\n]]=],
}
},
end_stream = {
@@ -76,12 +77,12 @@ local pdf_patterns = {
local pdf_text_patterns = {
start = {
patterns = {
- [[\s+BT\s+]]
+ [[\sBT\s]]
}
},
stop = {
patterns = {
- [[\s+ET\b]]
+ [[\sET\b]]
}
}
}
@@ -223,9 +224,77 @@ local function gen_outer_grammar()
}
end
+-- Graphic state in PDF
+local function gen_graphics_unary()
+ local P = lpeg.P
+ local S = lpeg.S
+ -- Returns an LPeg pattern for operand-less graphics operators (used by the UNARY rule)
+ return P("q") + P("Q") + P("h") +
+ P("W*") + P("W") + S("SsFfBb") * P("*")^0 + P("n")
+ -- LPeg choice is ordered: "W*" must be tried before "W", or its "*" is left unconsumed
+end
+local function gen_graphics_binary()
+ local P = lpeg.P
+ -- One-operand graphics operators; "gs" precedes "g" since LPeg choice is ordered
+ return P("gs") + P("g") + P("G") + P("W") + P("J") +
+ P("j") + P("M") + P("ri") + P("i") +
+ P("CS") + P("cs")
+end
+local function gen_graphics_ternary()
+ -- Returns an LPeg choice over the literal operators "RG", "rg" and "d"
+ local lit = lpeg.P
+ return lit("RG") + lit("rg") + lit("d")
+end
+local function gen_graphics_nary()
+ local P = lpeg.P
+ -- N-ary graphics operators; ordered choice, so "SCN"/"scn" go before prefixes "SC"/"sc"
+ return P("SCN") + P("scn") + P("SC") + P("sc") + P("k") + P("K")
+end
+
+-- Generates an LPeg grammar to parse text blocks (between BT and ET); returns the compiled pattern
+local function gen_text_grammar()
+ local V = lpeg.V
+ local P = lpeg.P
+ local C = lpeg.C
+ local gen = generic_grammar_elts() -- shared elements used below: ws, comment, number, id, str, hexstr, boolean, ref
+ -- Operator sets, grouped by the grammar rule (UNARY, BINARY, TERNARY, NARY, TEXT) that consumes them
+ local empty = ""
+ local unary_ops = C("T*") / "\n" + -- "T*" is captured and replaced by a newline
+ C(gen_graphics_unary()) / empty -- other operand-less operators are replaced by ""
+ local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
+ gen_graphics_binary()
+ local ternary_ops = P("TD") + P("Td") + P("Tf") + gen_graphics_ternary()
+ local nary_op = P("Tm") + gen_graphics_nary()
+ local text_binary_op = P("Tj") + P("TJ") + P("'")
+ local text_quote_op = P('"')
+ -- EXPR is the start rule: optional whitespace, then the captures of all commands collected into one table
+ return lpeg.P{
+ "EXPR";
+ EXPR = gen.ws^0 * lpeg.Ct(V("COMMAND")^0),
+ COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") + gen.comment) * gen.ws^0,
+ UNARY = unary_ops,
+ BINARY = V("ARG") / empty * gen.ws^1 * binary_ops, -- one operand (capture replaced by ""), then the operator
+ TERNARY = V("ARG") / empty * gen.ws^1 * V("ARG") / empty * gen.ws^1 * ternary_ops, -- NOTE(review): "/" and "*" share precedence (left-assoc), so the 2nd "/ empty" wraps everything before it
+ NARY = (gen.number / 0 * gen.ws^1)^1 * (gen.id / empty * gen.ws^0)^-1 * nary_op, -- one or more numbers, at most one id, then the operator
+ ARG = V("ARRAY") + V("DICT") + V("ATOM"),
+ ATOM = (gen.comment + gen.boolean + gen.ref +
+ gen.number + V("STRING") + gen.id),
+ DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>", -- folds key/value groups into a table via rawset
+ KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ARG") * gen.ws^0),
+ ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ARG")^0) * gen.ws^0 * "]",
+ STRING = lpeg.P{gen.str + gen.hexstr},
+ TEXT = (V("TEXT_ARG") * gen.ws^1 * text_binary_op) + -- <text> followed by Tj, TJ or '
+ (V("ARG") / 0 * gen.ws^1 * V("ARG") / 0 * gen.ws^1 * V("TEXT_ARG") * gen.ws^1 * text_quote_op), -- <arg> <arg> <text> "; same "/" vs "*" grouping as in TERNARY
+ TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
+ TEXT_ARRAY = "[" * -- array of strings, each optionally preceded by numbers whose values are dropped ("/ 0")
+ lpeg.Ct(((gen.ws^0 * (gen.ws^0 * (gen.number / 0)^0 * gen.ws^0 * (gen.str + gen.hexstr)))^1)) * gen.ws^0 * "]",
+ }
+end
+
-- Call immediately on require
compile_tries()
pdf_outer_grammar = gen_outer_grammar()
+pdf_text_grammar = gen_text_grammar()
local function extract_text_data(specific)
return nil -- NYI
@@ -363,7 +432,9 @@ local function postprocess_pdf_objects(task, input, pdf)
last = pdf.end_streams[end_pos]
end
-- Strip the first \n
- if input:at(first) == 10 then
+ while first < last do
+ local chr = input:at(first)
+ if chr ~= 13 and chr ~= 10 then break end
first = first + 1
end
local len = last - first
@@ -519,8 +590,23 @@ local function search_text(task, pdf)
end
bl.data = obj.uncompressed:span(bl.start, bl.len)
- lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
- obj.major, obj.minor, bl.data)
+ --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
+ -- obj.major, obj.minor, bl.data)
+
+ if bl.len < 10 * 1024 then
+ local ret,obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
+ bl.data)
+
+ if ret then
+ obj.text = rspamd_text.fromtable(obj_or_err)
+ lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
+ obj.major, obj.minor, obj.text)
+ else
+ lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
+ obj.major, obj.minor, obj_or_err)
+ end
+
+ end
end
end
end
More information about the Commits
mailing list