# lib/pdf_ex/content/tokens.ex

defmodule PdfEx.Content.Tokens do
  @moduledoc false

  alias PdfEx.COS.Lexer

  # Deepest array nesting that is materialised as nested lists. Brackets nested
  # beyond this limit are not recursed into: their elements are folded into the
  # deepest list, which keeps the non-tail recursion in `collect_array/5`
  # bounded no matter how many `[` the input contains.
  @max_array_depth 128

  # Every standard PDF content-stream operator (ISO 32000 Table A.1), declared
  # as literal atoms so they are interned at module load. The lexer interns
  # operators via String.to_existing_atom, so without this registry an operator
  # would only become an atom if some other module happened to load it first —
  # the same lazy-load heisenbug as COS.Parser @known_names. A contributor who
  # matches `op.operator == :BDC` is safe because :BDC is registered here; new
  # operators must be added to this list (see `known_operators/0`).
  @content_operators ~w(
    w J j M d ri i gs q Q cm
    m l c v y h re
    S s f F f* B B* b b* n W W*
    BT ET
    Tc Tw Tz TL Tf Tr Ts
    Td TD Tm T*
    Tj TJ ' "
    d0 d1
    CS cs SC SCN sc scn G g RG rg K k
    sh
    BI ID EI
    Do
    MP DP BMC BDC EMC
    BX EX
  )a

  @doc """
  Returns every content-stream operator this module registers as an atom.

  Add new operators to `@content_operators`.
  """
  @spec known_operators() :: [atom()]
  def known_operators, do: @content_operators

  @type op :: %{
          operator: atom() | binary(),
          operands: [term()],
          region: {non_neg_integer(), non_neg_integer()}
        }

  @doc """
  Splits a content-stream binary into a list of operator entries, in stream order.

  Tokens are read with `PdfEx.COS.Lexer.tokenize_next/2` starting at offset 0.
  Each `{:keyword, op}` token ends one entry:

    * `:operator` - the `op` value carried by the keyword token.
    * `:operands` - the tokens seen since the previous operator, in stream
      order. An array is represented as a (possibly nested) list of its
      element tokens.
    * `:region` - `{start, stop}` byte offsets, where `start` is the offset
      just past the previous operator (0 for the first entry) and `stop` is
      the offset just past this operator.

  Where the lexer reports an error, one byte is skipped and scanning resumes.
  Operands that are not followed by an operator before the end of input are
  dropped.
  """
  @spec parse_ops(binary()) :: [op()]
  def parse_ops(bin) when is_binary(bin) do
    walk(bin, 0, 0, [], [])
  end

  # Main scan loop. `operands` holds the pending operand tokens (newest first)
  # and `acc` the finished entries (newest first); both are reversed on the way
  # out.
  #
  # NOTE(review): nothing here special-cases the bytes between the ID and EI
  # operators, so inline-image data is tokenized like any other content —
  # confirm that the lexer or the callers account for this.
  defp walk(bin, offset, _region_start, _operands, acc) when offset >= byte_size(bin),
    do: Enum.reverse(acc)

  defp walk(bin, offset, region_start, operands, acc) do
    case Lexer.tokenize_next(bin, offset) do
      :eof ->
        Enum.reverse(acc)

      {:ok, :array_open, next} ->
        {array, after_array} = collect_array(bin, next, [], 0, 0)
        walk(bin, after_array, region_start, [array | operands], acc)

      {:ok, {:keyword, op}, next} ->
        entry = %{operator: op, operands: Enum.reverse(operands), region: {region_start, next}}
        # The next entry's region starts right after this operator.
        walk(bin, next, next, [], [entry | acc])

      {:ok, token, next} ->
        walk(bin, next, region_start, [token | operands], acc)

      {:error, _} ->
        # Resynchronise: skip a single byte and try again, keeping what has
        # been collected so far.
        walk(bin, offset + 1, region_start, operands, acc)
    end
  end

  # Collects the elements of an array whose opening bracket has already been
  # consumed. Returns `{elements, offset_after_array}`.
  #
  # `depth` is the nesting level of the array being collected (0 for an array
  # met directly by `walk/5`). `overflow` counts opening brackets that were
  # dropped because `depth` had already reached `@max_array_depth`; the same
  # number of closing brackets is swallowed again so that a dropped `[` can
  # never make its `]` close an enclosing array early.
  defp collect_array(bin, offset, acc, depth, overflow) do
    case Lexer.tokenize_next(bin, offset) do
      {:ok, :array_close, next} when overflow > 0 ->
        # Closes a bracket that was flattened away below; stay in this array.
        collect_array(bin, next, acc, depth, overflow - 1)

      {:ok, :array_close, next} ->
        {Enum.reverse(acc), next}

      {:ok, :array_open, next} when depth < @max_array_depth ->
        {nested, after_nested} = collect_array(bin, next, [], depth + 1, 0)
        collect_array(bin, after_nested, [nested | acc], depth, overflow)

      {:ok, :array_open, next} ->
        # Depth cap reached: do not recurse. The over-deep array's elements are
        # appended to the current list and its closing bracket is swallowed by
        # the first clause.
        collect_array(bin, next, acc, depth, overflow + 1)

      {:ok, token, next} ->
        collect_array(bin, next, [token | acc], depth, overflow)

      :eof ->
        # Unterminated array: hand back what was collected.
        {Enum.reverse(acc), offset}

      {:error, _} ->
        # A lexer error abandons this array with the elements collected so far;
        # the caller resumes one byte later.
        {Enum.reverse(acc), offset + 1}
    end
  end
end