defmodule PdfEx.Content.Tokens do
  @moduledoc false

  alias PdfEx.COS.Lexer

  # Deepest array nesting that is represented structurally. A `[` met while
  # already at this depth does not open a new level: its elements are folded
  # into the current array instead (see `collect_array/5`).
  @max_array_depth 128

  # Every standard PDF content-stream operator (ISO 32000 Table A.1), declared
  # as literal atoms so they are interned at module load. The lexer interns
  # operators via String.to_existing_atom, so without this registry an operator
  # would only become an atom if some other module happened to load it first —
  # the same lazy-load heisenbug as COS.Parser @known_names. A contributor who
  # matches `op.operator == :BDC` is safe because :BDC is registered here; new
  # operators must be added to this list (see `known_operators/0`).
  @content_operators ~w(
    w J j M d ri i gs q Q cm
    m l c v y h re
    S s f F f* B B* b b* n W W*
    BT ET
    Tc Tw Tz TL Tf Tr Ts
    Td TD Tm T*
    Tj TJ ' "
    d0 d1
    CS cs SC SCN sc scn G g RG rg K k
    sh
    BI ID EI
    Do
    MP DP BMC BDC EMC
    BX EX
  )a

  @doc "Every content-stream operator the engine registers as an atom. Add new operators here."
  @spec known_operators() :: [atom()]
  def known_operators, do: @content_operators

  # One parsed operation:
  #   * `:operator` - the keyword payload exactly as the lexer produced it
  #   * `:operands` - tokens seen since the previous operator, in source order;
  #     an array operand is a (possibly nested) list of tokens
  #   * `:region`   - `{start, stop}` byte offsets: `start` is where the previous
  #     operator ended (0 for the first one), `stop` is the offset the lexer
  #     returned after this operator's keyword
  @type op :: %{
          operator: atom() | binary(),
          operands: [term()],
          region: {non_neg_integer(), non_neg_integer()}
        }

  @doc """
  Splits a content stream into its operations, in source order.

  Parsing is best-effort: a byte the lexer rejects is skipped, and operands
  that are not followed by an operator before the end of input are dropped.
  """
  @spec parse_ops(binary()) :: [op()]
  def parse_ops(bin) when is_binary(bin) do
    walk(bin, 0, 0, [], [])
  end

  # Top-level scan. `operands` (reversed) holds the tokens seen since the last
  # operator, `region_start` is the offset where that run began, and `acc`
  # (reversed) holds the operations emitted so far.
  defp walk(bin, offset, _region_start, _operands, acc) when offset >= byte_size(bin),
    do: Enum.reverse(acc)

  defp walk(bin, offset, region_start, operands, acc) do
    case Lexer.tokenize_next(bin, offset) do
      :eof ->
        Enum.reverse(acc)

      {:ok, :array_open, next} ->
        # The whole array becomes a single operand.
        {array, after_array} = collect_array(bin, next, [], 0, 0)
        walk(bin, after_array, region_start, [array | operands], acc)

      {:ok, {:keyword, op}, next} ->
        entry = %{operator: op, operands: Enum.reverse(operands), region: {region_start, next}}
        # The next operation's region starts where this operator ended.
        walk(bin, next, next, [], [entry | acc])

      {:ok, token, next} ->
        walk(bin, next, region_start, [token | operands], acc)

      {:error, _} ->
        # Resynchronise by stepping over a single byte; pending operands are kept.
        walk(bin, offset + 1, region_start, operands, acc)
    end
  end

  # Collects array elements starting at `offset` (just past a `[`) and returns
  # `{elements, offset_after}`.
  #
  # `depth` is the nesting level of the array being built. `flattened` counts
  # the `[` tokens that were dropped because the array is already at
  # `@max_array_depth`; the matching `]` of each one must be dropped as well,
  # otherwise it would close the current array one level too early and the
  # final `]` would leak out to `walk/5` as a stray operand.
  #
  # End of input and a lexer error both end the array with what was collected
  # so far; after an error the caller resumes one byte past `offset`.
  defp collect_array(bin, offset, acc, depth, flattened) do
    case Lexer.tokenize_next(bin, offset) do
      {:ok, :array_close, next} when flattened > 0 ->
        # Closes a bracket that was flattened away, not the current array.
        collect_array(bin, next, acc, depth, flattened - 1)

      {:ok, :array_close, next} ->
        {Enum.reverse(acc), next}

      {:ok, :array_open, next} when depth < @max_array_depth ->
        {nested, after_nested} = collect_array(bin, next, [], depth + 1, 0)
        collect_array(bin, after_nested, [nested | acc], depth, flattened)

      {:ok, :array_open, next} ->
        # Depth limit reached: fold the nested elements into this array and
        # remember that one upcoming `]` belongs to the dropped `[`.
        collect_array(bin, next, acc, depth, flattened + 1)

      {:ok, token, next} ->
        collect_array(bin, next, [token | acc], depth, flattened)

      :eof ->
        {Enum.reverse(acc), offset}

      {:error, _} ->
        {Enum.reverse(acc), offset + 1}
    end
  end
end