# lib/pdf/reader/cid/cmap_parser.ex

defmodule Pdf.Reader.CID.CMapParser do
  @moduledoc """
  Minimal PostScript subset parser for Adobe predefined CMap files.

  Handles only the operators required for CID lookup:
  `begin/endcodespacerange`, `begin/endcidchar`, `begin/endcidrange`,
  `begin/endnotdefchar`, `begin/endnotdefrange`, `usecmap`.

  All other PostScript content (comments, /CMapName, /CIDSystemInfo,
  /WMode, dict/array literals, dup/def/pop, etc.) is silently skipped.

  Returns a parsed struct compatible with `Pdf.Reader.CID.PredefinedCMap`
  for caching and lookup.

  ## Spec references

  - PDF 1.7 (ISO 32000-1) § 9.7.5 — Predefined CMaps:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf
  - PDF 1.7 § 9.7.6 — Codespace ranges:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf
  - Adobe Tech Note #5099 — CMap and CIDFont Files Specification:
    https://adobe-type-tools.github.io/font-tech-notes/pdfs/5099.CMapResources.pdf
  - Adobe Tech Note #5014 — CID-Keyed Font Technology Overview:
    https://adobe-type-tools.github.io/font-tech-notes/pdfs/5014.CIDFont_Spec.pdf
  """

  # Shape of the map produced by parse/1; per-key meanings are documented
  # on parse/1's @doc.
  @type cmap :: %{
          cidchar: %{non_neg_integer() => non_neg_integer()},
          cidrange: [{non_neg_integer(), non_neg_integer(), non_neg_integer()}],
          notdef_chars: %{non_neg_integer() => non_neg_integer()},
          notdef_ranges: [{non_neg_integer(), non_neg_integer(), non_neg_integer()}],
          codespaces: %{(1..4) => [{non_neg_integer(), non_neg_integer()}]},
          parent: String.t() | nil
        }

  # Accumulator seed for the dispatcher — every extracted field starts empty,
  # so a CMap containing none of the relevant operators parses to exactly
  # this value (with parent: nil).
  @empty_cmap %{
    cidchar: %{},
    cidrange: [],
    notdef_chars: %{},
    notdef_ranges: [],
    codespaces: %{},
    parent: nil
  }

  @doc """
  Parse the text of a PostScript CMap and extract its CID mapping data.

  Returns `{:ok, cmap_fields}` for binary input. Tokens irrelevant to CID
  lookup are ignored rather than rejected; this function never raises —
  `{:error, reason}` is returned when the input is not a binary or when an
  unexpected internal error is rescued.

  ## Return map keys

  - `:cidchar` — `%{code_integer => cid_integer}`
  - `:cidrange` — `[{lo, hi, base_cid}]`
  - `:notdef_chars` — `%{code_integer => cid_integer}`
  - `:notdef_ranges` — `[{lo, hi, base_cid}]`
  - `:codespaces` — `%{byte_length => [{lo, hi}]}`, grouped by byte width
  - `:parent` — `String.t() | nil` — name from `usecmap` directive
  """
  @spec parse(text :: binary()) :: {:ok, cmap()} | {:error, term()}
  def parse(text) when is_binary(text) do
    {:ok, text |> tokenize() |> dispatch(@empty_cmap)}
  rescue
    error -> {:error, error}
  end

  def parse(_other), do: {:error, :not_binary}

  # ---------------------------------------------------------------------------
  # Tokenizer — produces a flat list of tagged tokens
  # ---------------------------------------------------------------------------

  # Known keyword atoms mapped from their string forms.
  # These are the only operators the dispatcher acts on; `usecmap` is the
  # composition directive, the begin*/end* pairs delimit data sections.
  @keywords %{
    "begincodespacerange" => :begincodespacerange,
    "endcodespacerange" => :endcodespacerange,
    "begincidchar" => :begincidchar,
    "endcidchar" => :endcidchar,
    "begincidrange" => :begincidrange,
    "endcidrange" => :endcidrange,
    "beginnotdefchar" => :beginnotdefchar,
    "endnotdefchar" => :endnotdefchar,
    "beginnotdefrange" => :beginnotdefrange,
    "endnotdefrange" => :endnotdefrange,
    "usecmap" => :usecmap
  }

  # Tokens we can immediately discard (PostScript boilerplate)
  # NOTE: bare identifiers that are NOT keywords and NOT in this list will be
  # emitted as {:name, word} so they can participate in `/NAME usecmap` and
  # `NAME usecmap` patterns. Only truly structural boilerplate is discarded.
  @skip_words ~w(
    dup def pop begin end dict currentdict findresource defineresource
    begincmap endcmap
  )

  # Tokenize the CMap text into a flat, in-order list of tagged tokens:
  # {:hex, bytes} | {:int, n} | {:name, string} | {:keyword, atom}.
  @spec tokenize(binary()) :: list()
  defp tokenize(text) do
    text
    |> tokenize_loop([])
    |> Enum.reverse()
  end

  defp tokenize_loop(<<>>, acc), do: acc

  # Skip comments: % to end of line
  defp tokenize_loop(<<?%, rest::binary>>, acc) do
    tokenize_loop(skip_to_newline(rest), acc)
  end

  # Skip whitespace
  defp tokenize_loop(<<c, rest::binary>>, acc) when c in [?\s, ?\t, ?\n, ?\r] do
    tokenize_loop(rest, acc)
  end

  # Dict literal <<...>> — must match BEFORE the single-< hex clause below.
  # The delimiters are dropped and the dict body is tokenized normally; the
  # resulting name/int tokens are ignored by the dispatcher. (The closing
  # `>>` is consumed by the stray-delimiter clause further down.)
  defp tokenize_loop(<<"<<", rest::binary>>, acc) do
    tokenize_loop(rest, acc)
  end

  # Hex string <HHHH>
  defp tokenize_loop(<<?<, rest::binary>>, acc) do
    case read_hex_string(rest) do
      {hex_bytes, rest2} ->
        tokenize_loop(rest2, [{:hex, hex_bytes} | acc])

      :error ->
        # Malformed hex: drop the '<' and rescan the remainder char by char.
        tokenize_loop(rest, acc)
    end
  end

  # Parenthesised string — skip entirely (track nesting depth)
  defp tokenize_loop(<<?(, rest::binary>>, acc) do
    tokenize_loop(skip_paren_string(rest, 1), acc)
  end

  # Array literal [...] — skip
  defp tokenize_loop(<<?[, rest::binary>>, acc) do
    tokenize_loop(skip_bracket(rest, ?[, ?]), acc)
  end

  # PS procedure body { ... } — skip
  defp tokenize_loop(<<?{, rest::binary>>, acc) do
    tokenize_loop(skip_bracket(rest, ?{, ?}), acc)
  end

  # Name token: /IDENT
  defp tokenize_loop(<<?/, rest::binary>>, acc) do
    {name, rest2} = read_identifier(rest)
    tokenize_loop(rest2, [{:name, name} | acc])
  end

  # Stray closing delimiters: the tail of a `>>` dict close, or an unmatched
  # ')', ']', '}'. These are @word_stop characters, so without this clause
  # the catch-all below would call read_word/1, receive an EMPTY word without
  # consuming any input, and recurse on the same binary forever. Drop the
  # character and continue.
  defp tokenize_loop(<<c, rest::binary>>, acc) when c in [?>, ?), ?], ?}] do
    tokenize_loop(rest, acc)
  end

  # Integer or identifier
  defp tokenize_loop(<<c, _::binary>> = input, acc) when c in ?0..?9 or c == ?- do
    {word, rest} = read_word(input)

    token =
      case Integer.parse(word) do
        {n, ""} -> {:int, n}
        _ -> classify_word(word)
      end

    case token do
      :skip -> tokenize_loop(rest, acc)
      t -> tokenize_loop(rest, [t | acc])
    end
  end

  # Bare identifier. Every special character now has its own clause above,
  # so read_word/1 is guaranteed to consume at least one character here.
  defp tokenize_loop(<<_c, _::binary>> = input, acc) do
    {word, rest} = read_word(input)

    case classify_word(word) do
      :skip -> tokenize_loop(rest, acc)
      t -> tokenize_loop(rest, [t | acc])
    end
  end

  # Consume characters through the first line terminator (\n or \r) and
  # return everything after it; an unterminated line yields <<>>.
  defp skip_to_newline(<<c, rest::binary>>) when c in [?\n, ?\r], do: rest
  defp skip_to_newline(<<_skip, rest::binary>>), do: skip_to_newline(rest)
  defp skip_to_newline(<<>>), do: <<>>

  # Collect hex digits up to the closing '>', then decode them to a binary.
  # Embedded whitespace is ignored; an odd digit count is padded with a
  # trailing "0" before decoding. Returns {decoded_bytes, remainder} or
  # :error when the digits are invalid or the string is unterminated.
  defp read_hex_string(bin), do: read_hex_string(bin, <<>>)

  defp read_hex_string(<<?>, rest::binary>>, digits) do
    trimmed = String.trim(digits)
    padded = if rem(byte_size(trimmed), 2) == 1, do: trimmed <> "0", else: trimmed

    case Base.decode16(padded, case: :mixed) do
      {:ok, bytes} -> {bytes, rest}
      :error -> :error
    end
  end

  defp read_hex_string(<<ws, rest::binary>>, digits) when ws in [?\s, ?\t, ?\n, ?\r],
    do: read_hex_string(rest, digits)

  defp read_hex_string(<<c, rest::binary>>, digits),
    do: read_hex_string(rest, <<digits::binary, c>>)

  # Ran off the end without a terminator.
  defp read_hex_string(<<>>, _digits), do: :error

  # Skip a PostScript literal string, tracking paren nesting depth.
  #
  # Also consumes backslash escape pairs so that an escaped paren — e.g.
  # `(a\)b)` — neither terminates nor nests the string early, per the
  # literal-string syntax in PLRM § 3.2.2 / PDF 32000-1 § 7.3.4.2.
  defp skip_paren_string(<<>>, _depth), do: <<>>
  defp skip_paren_string(<<?\\, _escaped, rest::binary>>, depth), do: skip_paren_string(rest, depth)
  defp skip_paren_string(<<?), rest::binary>>, 1), do: rest
  defp skip_paren_string(<<?), rest::binary>>, depth), do: skip_paren_string(rest, depth - 1)
  defp skip_paren_string(<<?(, rest::binary>>, depth), do: skip_paren_string(rest, depth + 1)
  defp skip_paren_string(<<_c, rest::binary>>, depth), do: skip_paren_string(rest, depth)

  # Skip past the close character that matches an already-consumed open
  # character, honouring nesting. Returns the remainder after the matching
  # close, or <<>> if the input ends first.
  defp skip_bracket(<<>>, _open, _close), do: <<>>

  defp skip_bracket(<<ch, rest::binary>>, open, close) do
    cond do
      ch == close ->
        rest

      ch == open ->
        # Nested pair: skip the inner pair first, then resume the outer scan.
        rest |> skip_bracket(open, close) |> skip_bracket(open, close)

      true ->
        skip_bracket(rest, open, close)
    end
  end

  # Characters that terminate a bare word: whitespace plus the PostScript
  # self-delimiting special characters.
  @word_stop [?\s, ?\t, ?\n, ?\r, ?<, ?>, ?/, ?(, ?), ?[, ?], ?{, ?}, ?%]

  # Read a run of non-delimiter characters; returns {word, remainder}.
  # The delimiter itself is left in the remainder for the tokenizer.
  defp read_word(bin), do: read_word(bin, <<>>)

  defp read_word(<<stop, _::binary>> = remainder, word) when stop in @word_stop,
    do: {word, remainder}

  defp read_word(<<char, more::binary>>, word), do: read_word(more, <<word::binary, char>>)

  defp read_word(<<>>, word), do: {word, <<>>}

  # An identifier after `/` obeys the same delimiter rules as a bare word.
  defp read_identifier(bin), do: read_word(bin)

  # Classify a bare word as :skip (structural boilerplate or empty),
  # {:keyword, atom} (an operator we act on), or {:name, word}.
  defp classify_word(word) when word in @skip_words, do: :skip
  defp classify_word(""), do: :skip

  defp classify_word(word) do
    case Map.fetch(@keywords, word) do
      {:ok, keyword} ->
        {:keyword, keyword}

      :error ->
        # Unknown identifiers become {:name, word} — required so the bare
        # `NAME usecmap` form (no leading slash) can still be matched.
        {:name, word}
    end
  end

  # ---------------------------------------------------------------------------
  # Dispatcher — walks token list and calls handlers when keywords are found
  # ---------------------------------------------------------------------------

  defp dispatch([], cmap), do: cmap

  # `/NAME usecmap` or `NAME usecmap` — both forms tokenize to a {:name, _}
  # immediately followed by the keyword; record the parent CMap name.
  defp dispatch([{:name, parent}, {:keyword, :usecmap} | rest], cmap) do
    dispatch(rest, %{cmap | parent: parent})
  end

  # `usecmap` with no preceding name token — malformed; ignore it.
  defp dispatch([{:keyword, :usecmap} | rest], cmap), do: dispatch(rest, cmap)

  defp dispatch([{:keyword, :begincodespacerange} | rest], cmap) do
    {body, rest} = collect_until(rest, :endcodespacerange)
    dispatch(rest, %{cmap | codespaces: handle_codespacerange(body, cmap.codespaces)})
  end

  defp dispatch([{:keyword, :begincidchar} | rest], cmap) do
    {body, rest} = collect_until(rest, :endcidchar)
    dispatch(rest, %{cmap | cidchar: handle_cidchar(body, cmap.cidchar)})
  end

  defp dispatch([{:keyword, :begincidrange} | rest], cmap) do
    {body, rest} = collect_until(rest, :endcidrange)
    dispatch(rest, %{cmap | cidrange: handle_cidrange(body, cmap.cidrange)})
  end

  # notdef chars share the cidchar entry shape (hex code, int CID).
  defp dispatch([{:keyword, :beginnotdefchar} | rest], cmap) do
    {body, rest} = collect_until(rest, :endnotdefchar)
    dispatch(rest, %{cmap | notdef_chars: handle_cidchar(body, cmap.notdef_chars)})
  end

  # notdef ranges share the cidrange entry shape (hex lo, hex hi, int CID).
  defp dispatch([{:keyword, :beginnotdefrange} | rest], cmap) do
    {body, rest} = collect_until(rest, :endnotdefrange)
    dispatch(rest, %{cmap | notdef_ranges: handle_cidrange(body, cmap.notdef_ranges)})
  end

  # Anything else between operator sections — stray ints (the N entry count),
  # lone names, leftover hex tokens — is irrelevant; drop it.
  defp dispatch([_token | rest], cmap), do: dispatch(rest, cmap)

  # ---------------------------------------------------------------------------
  # Collect tokens between begin* and end* keywords
  # ---------------------------------------------------------------------------

  # Split the token stream at the first {:keyword, end_keyword}, returning
  # {tokens_before, tokens_after_the_end_keyword}. If the end keyword is
  # missing (malformed input), everything is consumed and the rest is [].
  defp collect_until(tokens, end_keyword) do
    case Enum.split_while(tokens, &(&1 != {:keyword, end_keyword})) do
      {body, [_end_token | rest]} -> {body, rest}
      {body, []} -> {body, []}
    end
  end

  # ---------------------------------------------------------------------------
  # Operator handlers
  # ---------------------------------------------------------------------------

  # codespacerange: pairs of hex lo/hi binaries, grouped into the codespaces
  # map keyed by the byte width of the LOW bound.
  defp handle_codespacerange(tokens, codespaces) do
    tokens
    |> collect_hex_pairs()
    |> Enum.reduce(codespaces, fn {lo_bin, hi_bin}, acc ->
      width = byte_size(lo_bin)
      range = {bytes_to_integer(lo_bin), bytes_to_integer(hi_bin)}
      Map.update(acc, width, [range], &(&1 ++ [range]))
    end)
  end

  # cidchar: pairs of (hex code, integer CID); later entries for the same
  # code overwrite earlier ones, as with Map.put/3.
  defp handle_cidchar(tokens, cidchar) do
    for {code_bytes, cid} <- collect_hex_int_pairs(tokens), into: cidchar do
      {bytes_to_integer(code_bytes), cid}
    end
  end

  # cidrange: triples of (hex lo, hex hi, integer base CID), appended to the
  # accumulated range list in source order.
  #
  # Builds the new entries once and appends with a single `++` — the previous
  # version did `list ++ [entry]` inside a reduce, which is O(n²) in the
  # number of ranges (Adobe CJK CMaps contain thousands).
  defp handle_cidrange(tokens, cidrange) do
    new_ranges =
      for {lo_bytes, hi_bytes, base_cid} <- collect_hex_hex_int_triples(tokens) do
        {bytes_to_integer(lo_bytes), bytes_to_integer(hi_bytes), base_cid}
      end

    cidrange ++ new_ranges
  end

  # ---------------------------------------------------------------------------
  # Token collection helpers
  # ---------------------------------------------------------------------------

  # Collect adjacent {:hex, _}, {:hex, _} token pairs in order. Any other
  # token (stray entry counts, an unpaired hex) is dropped, resynchronising
  # the scan on the following token.
  defp collect_hex_pairs(tokens), do: collect_hex_pairs(tokens, [])

  defp collect_hex_pairs([{:hex, lo}, {:hex, hi} | rest], pairs),
    do: collect_hex_pairs(rest, [{lo, hi} | pairs])

  defp collect_hex_pairs([_other | rest], pairs), do: collect_hex_pairs(rest, pairs)

  defp collect_hex_pairs([], pairs), do: Enum.reverse(pairs)

  # Collect adjacent {:hex, code}, {:int, cid} pairs in order; anything that
  # does not form such a pair is dropped.
  defp collect_hex_int_pairs(tokens), do: collect_hex_int_pairs(tokens, [])

  defp collect_hex_int_pairs([{:hex, code}, {:int, cid} | rest], pairs),
    do: collect_hex_int_pairs(rest, [{code, cid} | pairs])

  defp collect_hex_int_pairs([_other | rest], pairs), do: collect_hex_int_pairs(rest, pairs)

  defp collect_hex_int_pairs([], pairs), do: Enum.reverse(pairs)

  # Collect adjacent {:hex, lo}, {:hex, hi}, {:int, base} triples in order;
  # non-conforming tokens are dropped.
  defp collect_hex_hex_int_triples(tokens), do: collect_hex_hex_int_triples(tokens, [])

  defp collect_hex_hex_int_triples([{:hex, lo}, {:hex, hi}, {:int, base} | rest], triples),
    do: collect_hex_hex_int_triples(rest, [{lo, hi, base} | triples])

  defp collect_hex_hex_int_triples([_other | rest], triples),
    do: collect_hex_hex_int_triples(rest, triples)

  defp collect_hex_hex_int_triples([], triples), do: Enum.reverse(triples)

  # ---------------------------------------------------------------------------
  # Byte helpers
  # ---------------------------------------------------------------------------

  # Interpret a raw code binary as a big-endian unsigned integer
  # (<<0x20, 0xAC>> -> 0x20AC); the empty binary maps to 0.
  defp bytes_to_integer(bytes) do
    for <<byte <- bytes>>, reduce: 0 do
      value -> value * 256 + byte
    end
  end
end