lib/pdf/reader/cid/decoder.ex

Select File:
lib/pdf/reader/cid/decoder.ex

defmodule Pdf.Reader.CID.Decoder do
  @moduledoc """
  CID font decoder for Type0/Identity-H and Identity-V composite fonts.

  Returns a `decoder_fn()` closure with the same contract as the simple-font
  decoder: `(binary() -> {String.t(), [{non_neg_integer(), binary()}]})`.

  ## Resolution cascade (per CID)

  1. **ToUnicode CMap** — if the font has a `/ToUnicode` stream, its `bf_char`/
     `bf_range` entries are checked first (most specific).
  2. **Adobe registry table** — `/CIDSystemInfo /Ordering` maps to one of the four
     bundled collection modules (`AdobeJapan1`, `AdobeCNS1`, `AdobeKorea1`,
     `AdobeGB1`). O(1) pattern-match dispatch.
  3. **U+FFFD fallback** — unresolved CIDs yield `U+FFFD` plus a sentinel tuple
     `{idx, "cid:0xHHHH"}` appended to the unresolved list.

  ## `__test_cmap__` shortcut

  For unit tests, a pre-parsed `%Pdf.Reader.CMap{}` can be injected by storing
  it in the font dict under the key `"__test_cmap__"`. This bypasses stream
  resolution. (Mirrors the same shortcut in `Pdf.Reader.Font`.)

  ## Width / advance computation

  This module handles **character decoding only** (bytes → Unicode text). Glyph
  advance widths (`/W` and `/DW` entries on the DescendantFonts[0] dict) are read
  separately by `Pdf.Reader.Font.Widths` (§ 9.7.4.3). The two concerns are
  intentionally kept in separate modules: decoding and advance computation are
  independent of each other.

  ## Spec references

  - PDF 1.7 (ISO 32000-1) § 9.7 — Composite Fonts:
    https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf
  - PDF 1.7 § 9.7.4 — CIDFonts
  - PDF 1.7 § 9.7.4.3 — /W and /DW arrays (handled by `Pdf.Reader.Font.Widths`)
  - PDF 1.7 § 9.7.5 — CMaps (Identity-H, Identity-V predefined)
  """

  alias Pdf.Reader.CID.{
    AdobeCNS1,
    AdobeGB1,
    AdobeJapan1,
    AdobeKorea1,
    CIDToGIDMap,
    Codespace,
    PredefinedCMap
  }

  alias Pdf.Reader.{CMap, Document, Filter, ObjectResolver}

  @type decoder_fn :: (binary() -> {String.t(), [{non_neg_integer(), binary()}]})

  @unresolved_char "�"

  @doc """
  Builds the decoder closure for a Type0 font dict on the Identity-H/V path.

  `font_dict` is the top-level Type0 font dictionary (already resolved). The
  function consults its `ToUnicode` entry and the first `DescendantFonts`
  entry, from which `CIDSystemInfo` and `CIDToGIDMap` are read.

  The `CIDToGIDMap` value, when present, is parsed but the parsed map is
  discarded; only the document returned by that parse is kept. A parse error
  leaves the document unchanged.

  Returns `{:ok, decoder_fn, updated_doc}`.
  """
  @spec build(map(), Document.t()) :: {:ok, decoder_fn(), Document.t()} | {:error, term()}
  def build(font_dict, doc) do
    {to_unicode, doc_with_cmap} = resolve_cmap(font_dict, doc)
    {cid_font, doc_with_font} = resolve_descendant(font_dict, doc_with_cmap)
    registry = parse_registry(cid_font)

    final_doc =
      case Map.get(cid_font, "CIDToGIDMap") do
        nil -> doc_with_font
        raw_map -> doc_after_cid_to_gid(raw_map, doc_with_font)
      end

    {:ok, build_closure(to_unicode, registry), final_doc}
  end

  # Parses the raw CIDToGIDMap value and returns the document to continue with:
  # the one produced by a successful parse, or the input document on error.
  defp doc_after_cid_to_gid(raw_map, doc) do
    case CIDToGIDMap.parse(raw_map, doc) do
      {:ok, _gid_map, parsed_doc} -> parsed_doc
      {:error, _reason} -> doc
    end
  end

  @doc """
  Builds the decoder closure for a Type0 font dict whose `/Encoding` names a
  bundled predefined CMap (e.g. `UniJIS-UTF16-H`).

  Each code token is resolved in this order (PDF 1.7 § 9.7.5, D9):

  1. **ToUnicode CMap** — consulted first when the font has one.
  2. **Predefined CMap** — maps the code to a CID.
  3. **Adobe registry table** — maps the CID to Unicode via
     AdobeJapan1/CNS1/Korea1/GB1.
  4. **U+FFFD fallback** — unresolved codes yield `U+FFFD` plus a sentinel.

  When `/Encoding` is not a `{:name, name}` tuple, `nil` is handed to
  `PredefinedCMap.load_by_name/2`. Any result of that call other than
  `{:ok, predefined, doc}` is returned to the caller unchanged.

  Returns `{:ok, decoder_fn, updated_doc}` on success.
  """
  @spec build_predefined(map(), Document.t()) ::
          {:ok, decoder_fn(), Document.t()} | {:error, term()}
  def build_predefined(font_dict, doc) do
    {to_unicode, doc_with_cmap} = resolve_cmap(font_dict, doc)

    cmap_name =
      with {:name, name} <- Map.get(font_dict, "Encoding") do
        name
      else
        _ -> nil
      end

    case PredefinedCMap.load_by_name(cmap_name, doc_with_cmap) do
      {:ok, predefined, doc_with_predefined} ->
        {cid_font, final_doc} = resolve_descendant(font_dict, doc_with_predefined)
        registry = parse_registry(cid_font)
        {:ok, build_predefined_closure(predefined, registry, to_unicode), final_doc}

      failure ->
        failure
    end
  end

  # ---------------------------------------------------------------------------
  # Predefined CMap closure builder
  # ---------------------------------------------------------------------------

  # Returns the decoder closure for the predefined-CMap path: the input bytes
  # are tokenized against the CMap's codespaces, then each code is decoded in
  # order starting at index 0 with empty accumulators.
  defp build_predefined_closure(predefined, registry_atom, to_unicode) do
    fn bytes ->
      bytes
      |> Codespace.tokenize(predefined.codespaces)
      |> decode_predefined_codes(predefined, registry_atom, to_unicode, [], [], 0)
    end
  end

  # Decodes a list of code tokens into `{text, unresolved}`.
  #
  # `text_acc` and `unresolved_acc` are built in reverse and flipped once the
  # list is exhausted. `idx` is the zero-based position of the current code and
  # is what unresolved sentinels are tagged with.
  defp decode_predefined_codes([], _pred, _reg, _tou, text_acc, unresolved_acc, _idx) do
    text = text_acc |> Enum.reverse() |> IO.iodata_to_binary()
    {text, Enum.reverse(unresolved_acc)}
  end

  defp decode_predefined_codes([code | rest], pred, reg, tou, text_acc, unresolved_acc, idx) do
    {chunk, next_unresolved} =
      case decode_predefined_code(code, pred, reg, tou) do
        {:ok, text} -> {text, unresolved_acc}
        {:unresolved, label} -> {@unresolved_char, [{idx, label} | unresolved_acc]}
      end

    decode_predefined_codes(rest, pred, reg, tou, [chunk | text_acc], next_unresolved, idx + 1)
  end

  # Resolves one code. The ToUnicode CMap takes precedence (R-PCM17, D9);
  # otherwise the predefined CMap / registry route is tried.
  # Returns `{:ok, text}` or `{:unresolved, sentinel_label}`.
  defp decode_predefined_code(code, pred, reg, tou) do
    case to_unicode_lookup(tou, code) do
      {:ok, string} -> {:ok, string}
      :error -> predefined_code_to_text(code, pred, reg)
    end
  end

  # Predefined CMap lookup (code → CID) followed by registry lookup
  # (CID → codepoint). A CID the registry cannot map is labelled "cid:0x…";
  # a code the predefined CMap cannot map is labelled "code:0x…".
  defp predefined_code_to_text(code, pred, reg) do
    case PredefinedCMap.lookup(pred, code) do
      {:ok, cid} ->
        case registry_lookup(cid, reg) do
          {:ok, codepoint} -> {:ok, <<codepoint::utf8>>}
          :error -> {:unresolved, "cid:0x" <> hex4(cid)}
        end

      :error ->
        {:unresolved, "code:0x" <> hex4(code)}
    end
  end

  # Upper-case hexadecimal rendering, left-padded with zeros to four digits.
  defp hex4(value) do
    value |> Integer.to_string(16) |> String.pad_leading(4, "0")
  end

  # Looks `code` up in the optional ToUnicode CMap.
  #
  # Yields `{:ok, string}` when the CMap holds a binary for the code, and
  # `:error` when there is no CMap at all or the lookup comes back `nil`.
  defp to_unicode_lookup(nil, _code), do: :error

  defp to_unicode_lookup(%CMap{} = to_unicode, code) do
    case CMap.lookup(to_unicode, code) do
      text when is_binary(text) -> {:ok, text}
      nil -> :error
    end
  end

  # ---------------------------------------------------------------------------
  # Closure builder (Identity-H/V path)
  # ---------------------------------------------------------------------------

  # Returns the decoder closure for the Identity-H/V path.
  #
  # The closure reads `bytes` as consecutive big-endian unsigned 16-bit CIDs
  # (bytes left over after the last full 16-bit unit are not decoded) and
  # resolves each one through `resolve_cid/3`. Resolved text is appended to an
  # iodata accumulator; an unresolved CID contributes U+FFFD plus a sentinel
  # `{idx, "cid:0xHHHH"}`, where `idx` is the zero-based CID position.
  #
  # Result: `{text, unresolved}` with `unresolved` in input order.
  defp build_closure(cmap, registry_atom) do
    fn bytes ->
      {text_chunks, unresolved, _idx} =
        for <<cid::big-unsigned-16 <- bytes>>, reduce: {[], [], 0} do
          {text_acc, unresolved_acc, idx} ->
            case resolve_cid(cid, cmap, registry_atom) do
              {:ok, text} ->
                {[text_acc, text], unresolved_acc, idx + 1}

              :error ->
                sentinel =
                  {idx, "cid:0x" <> String.pad_leading(Integer.to_string(cid, 16), 4, "0")}

                {[text_acc, @unresolved_char], [sentinel | unresolved_acc], idx + 1}
            end
        end

      {IO.iodata_to_binary(text_chunks), Enum.reverse(unresolved)}
    end
  end

  # ---------------------------------------------------------------------------
  # CID resolution cascade
  # ---------------------------------------------------------------------------

  # Resolves one CID to its Unicode text.
  #
  # Returns `{:ok, binary}` or `:error`.
  #
  #   * A ToUnicode hit is returned as the complete string the CMap holds. A
  #     single code may map to several codepoints (e.g. a ligature), so the
  #     string must not be cut down to its first codepoint. This matches what
  #     the predefined-CMap path emits for ToUnicode hits.
  #   * An empty ToUnicode string counts as unresolved.
  #   * With no CMap, or no CMap entry, the registry table is consulted and
  #     its codepoint is encoded as UTF-8.
  defp resolve_cid(cid, cmap, registry_atom) do
    cmap_result =
      case cmap do
        nil -> nil
        _ -> CMap.lookup(cmap, cid)
      end

    case cmap_result do
      nil ->
        # A non-matching registry result (`:error`) is passed through as-is.
        with {:ok, codepoint} <- registry_lookup(cid, registry_atom) do
          {:ok, <<codepoint::utf8>>}
        end

      "" ->
        :error

      string when is_binary(string) ->
        {:ok, string}
    end
  end

  # Dispatches a CID to the collection module selected by the registry atom.
  # Any other registry value (including `nil`) yields `:error`.
  defp registry_lookup(cid, registry_atom) do
    case registry_atom do
      :japan1 -> AdobeJapan1.lookup(cid)
      :cns1 -> AdobeCNS1.lookup(cid)
      :korea1 -> AdobeKorea1.lookup(cid)
      :gb1 -> AdobeGB1.lookup(cid)
      _ -> :error
    end
  end

  # ---------------------------------------------------------------------------
  # Registry atom parsing
  # ---------------------------------------------------------------------------

  # Derives the registry atom from the descendant dict's
  # `CIDSystemInfo` → `Ordering` entry. Yields `nil` when `CIDSystemInfo` is
  # missing or is not a map carrying an "Ordering" key.
  defp parse_registry(%{"CIDSystemInfo" => %{"Ordering" => ordering}}) do
    ordering_to_atom(ordering)
  end

  defp parse_registry(descendant) when is_map(descendant), do: nil

  # Translates an /Ordering value into a registry atom.
  #
  # The value may arrive wrapped as `{:string, value}` or as a bare binary;
  # the wrapper is peeled off and the inner value is translated. Orderings
  # other than the four bundled collections map to `nil`.
  defp ordering_to_atom(ordering) do
    case ordering do
      {:string, inner} -> ordering_to_atom(inner)
      "Japan1" -> :japan1
      "CNS1" -> :cns1
      "Korea1" -> :korea1
      "GB1" -> :gb1
      _ -> nil
    end
  end

  # ---------------------------------------------------------------------------
  # DescendantFonts resolution
  # ---------------------------------------------------------------------------

  # Resolves the first entry of `/DescendantFonts` to the CIDFont dict.
  #
  # Returns `{descendant_dict, doc}`. The dict is `%{}` when the entry is
  # missing, empty, or cannot be resolved to a map.
  #
  # The array is accepted both inline (a list) and as an indirect reference to
  # an array. PDF permits the value of any dictionary entry to be an indirect
  # object, so the reference is resolved first rather than being treated as a
  # missing entry.
  defp resolve_descendant(font_dict, doc) do
    case Map.get(font_dict, "DescendantFonts") do
      [first | _] ->
        resolve_font_value(first, doc)

      {:ref, _, _} = ref ->
        resolve_descendant_array(ref, doc)

      _ ->
        {%{}, doc}
    end
  end

  # Resolves an indirect `/DescendantFonts` array and then its first element.
  # Anything other than a non-empty list falls back to `{%{}, doc}`.
  defp resolve_descendant_array(ref, doc) do
    case ObjectResolver.resolve(doc, ref) do
      {:ok, [first | _], doc2} -> resolve_font_value(first, doc2)
      _ -> {%{}, doc}
    end
  end

  # Turns a `/DescendantFonts` element into `{dict, doc}`.
  #
  #   * an indirect reference is resolved; the resolved value is used only if
  #     it is a map, together with the document returned by the resolver
  #   * an inline map is returned as-is
  #   * anything else (or a failed resolution) yields `%{}` and the input doc
  defp resolve_font_value(value, doc) do
    case value do
      {:ref, _, _} ->
        with {:ok, dict, resolved_doc} when is_map(dict) <- ObjectResolver.resolve(doc, value) do
          {dict, resolved_doc}
        else
          _ -> {%{}, doc}
        end

      dict when is_map(dict) ->
        {dict, doc}

      _ ->
        {%{}, doc}
    end
  end

  # ---------------------------------------------------------------------------
  # CMap resolution (mirrors Font.resolve_cmap pattern)
  # ---------------------------------------------------------------------------

  # Resolves the font's ToUnicode CMap. Returns `{cmap_or_nil, doc}`.
  #
  # Test shortcut: a pre-parsed CMap stored under the "__test_cmap__" key is
  # returned directly, bypassing stream resolution.
  defp resolve_cmap(%{"__test_cmap__" => %CMap{} = cmap}, doc), do: {cmap, doc}

  defp resolve_cmap(font_dict, doc) do
    case Map.get(font_dict, "ToUnicode") do
      {:ref, _, _} = ref ->
        resolve_cmap_ref(ref, doc)

      binary when is_binary(binary) ->
        {CMap.parse(binary), doc}

      # Covers both an absent entry (`nil`) and any value shape that is not
      # understood here. Decoding then proceeds without a ToUnicode CMap
      # instead of raising, matching the best-effort behaviour of the
      # reference-resolution path.
      _ ->
        {nil, doc}
    end
  end

  # Resolves an indirect ToUnicode reference to a parsed CMap.
  #
  # The reference must resolve to a `{:stream, dict, raw_bytes}` tuple whose
  # bytes decode successfully with the stream's "Filter" / "DecodeParms"
  # entries. Otherwise `{nil, doc}` is returned with the original document.
  defp resolve_cmap_ref(ref, doc) do
    case ObjectResolver.resolve(doc, ref) do
      {:ok, {:stream, dict, raw_bytes}, resolved_doc} ->
        filter = Map.get(dict, "Filter")
        parms = Map.get(dict, "DecodeParms")

        case decode_stream(raw_bytes, filter, parms) do
          {:ok, decoded} -> {CMap.parse(decoded), resolved_doc}
          _ -> {nil, doc}
        end

      _ ->
        {nil, doc}
    end
  end

  # Decodes raw stream bytes. With no filter the bytes are returned untouched
  # as `{:ok, raw_bytes}`; otherwise the filter chain is applied, substituting
  # an empty map when no decode parameters were given.
  defp decode_stream(raw_bytes, filter, parms) do
    if is_nil(filter) do
      {:ok, raw_bytes}
    else
      Filter.apply_chain(raw_bytes, filter, parms || %{})
    end
  end
end