# lib/pdf/reader/encoding.ex

defmodule Pdf.Reader.Encoding do
  @moduledoc """
  Encoding cascade facade for resolving PDF character codes to Unicode codepoints.

  Spec reference: PDF 1.7 § 9.6.5 (Type1 encoding), § 9.10.3 (ToUnicode CMap).

  ## Cascade priority (highest first)

  1. **ToUnicode CMap** — if a `%Pdf.Reader.CMap{}` is provided and the code is mapped,
     return that codepoint immediately. When the CMap maps the code to a string of
     several codepoints, only the first codepoint is returned.
  2. **`/Differences` + AGL** — if a `/Differences` map is provided and has a glyph name
     for this byte, resolve the glyph name through the Adobe Glyph List. A byte that
     `/Differences` names is never looked up in the base encoding: if the AGL does not
     know the glyph name, the result is `{:unresolved, glyph_name}`.
  3. **Base encoding** — one of `:win_ansi`, `:mac_roman`, or `:standard` (Standard Encoding).
  4. **Unresolved fallback** — emit `{:unresolved, "byte:0xNN"}` (upper-case hex, at least
     two digits) when none of the steps above produced a result.

  ## `resolve_byte/3`

      resolve_byte(byte, cmap_or_nil, opts) :: {:ok, codepoint :: integer()} | {:unresolved, binary()}

  Options:
  - `:differences` — `%{integer() => glyph_name :: binary()}` or `nil`
  - `:base_encoding` — `:win_ansi | :mac_roman | :standard | nil`

  The caller substitutes `U+FFFD` for each `{:unresolved, _}` result and accumulates
  unresolved entries for the `TextRun.unresolved` field (option B shape).
  """

  alias Pdf.Reader.{AGL, CMap}
  alias Pdf.Reader.Encoding.{MacRoman, StandardEncoding, WinAnsi}

  @doc """
  Resolves a single byte to a Unicode codepoint using the encoding cascade.

  Returns `{:ok, codepoint}` on success or `{:unresolved, marker}` when no
  mapping can be found. The marker is either a glyph name (when `/Differences`
  names the byte but the Adobe Glyph List has no entry for that name) or
  `"byte:0xNN"` for a byte that no cascade step covers.

  ## Options

  - `:differences` — map of byte code to glyph name, or `nil` (default)
  - `:base_encoding` — `:win_ansi`, `:mac_roman`, `:standard`, or `nil` (default)

  ## Examples

      iex> Pdf.Reader.Encoding.resolve_byte(0x41, nil)
      {:unresolved, "byte:0x41"}

  """
  @spec resolve_byte(0..255, CMap.t() | nil, keyword()) ::
          {:ok, non_neg_integer()} | {:unresolved, binary()}
  def resolve_byte(byte, cmap, opts \\ []) do
    differences = Keyword.get(opts, :differences)
    base_encoding = Keyword.get(opts, :base_encoding)

    # Each step returns `nil` to mean "not handled here, try the next step".
    # The first non-nil result ({:ok, _} or {:unresolved, _}) short-circuits
    # the `with` and is returned unchanged; later steps are not evaluated.
    with nil <- try_cmap(byte, cmap),
         nil <- try_differences(byte, differences),
         nil <- try_base_encoding(byte, base_encoding) do
      # Reaching this point implies /Differences has no entry for `byte`
      # (otherwise step 2 would have answered), so the only possible marker
      # is the raw-byte form.
      {:unresolved, byte_marker(byte)}
    end
  end

  # Builds the "byte:0xNN" marker. Integer.to_string/2 already emits
  # upper-case hex digits, so only zero-padding to two digits is needed.
  defp byte_marker(byte) do
    hex = byte |> Integer.to_string(16) |> String.pad_leading(2, "0")
    "byte:0x" <> hex
  end

  # ---------------------------------------------------------------------------
  # Step 1: ToUnicode CMap
  # ---------------------------------------------------------------------------

  # Looks `byte` up in the ToUnicode CMap. Returns {:ok, codepoint} for the
  # first codepoint of the mapped string, or nil when there is no CMap, no
  # mapping, or the mapped string yields no codepoint.
  defp try_cmap(_byte, nil), do: nil

  defp try_cmap(byte, cmap) do
    case CMap.lookup(cmap, byte) do
      nil ->
        nil

      # The CMap returns a UTF-8 string; only its first codepoint is used.
      <<codepoint::utf8, _rest::binary>> ->
        {:ok, codepoint}

      # Empty string, or a binary that does not start with a valid UTF-8
      # sequence: treat as "not mapped" and let the cascade continue instead
      # of crashing on a malformed CMap entry.
      other when is_binary(other) ->
        nil
    end
  end

  # ---------------------------------------------------------------------------
  # Step 2: /Differences + AGL
  # ---------------------------------------------------------------------------

  # Returns {:ok, codepoint} | {:unresolved, glyph_name} when /Differences
  # names `byte`, or nil when it does not (or no /Differences map was given).
  defp try_differences(_byte, nil), do: nil

  defp try_differences(byte, differences) do
    case Map.fetch(differences, byte) do
      {:ok, glyph_name} ->
        # /Differences explicitly named this byte → AGL or unresolved.
        # We do NOT fall through to base encoding when /Differences covers this byte.
        case AGL.glyph_to_unicode(glyph_name) do
          {:ok, codepoint} -> {:ok, codepoint}
          # AGL miss → still "handled" by /Differences; signal unresolved directly.
          :error -> {:unresolved, glyph_name}
        end

      :error ->
        # Byte not mentioned in /Differences → fall through to base encoding.
        nil
    end
  end

  # ---------------------------------------------------------------------------
  # Step 3: Base encoding
  # ---------------------------------------------------------------------------

  # Decodes `byte` through the selected base encoding table. Returns
  # {:ok, codepoint}, or nil when no base encoding was given or the table
  # reports the byte as undefined.
  defp try_base_encoding(_byte, nil), do: nil

  defp try_base_encoding(byte, :win_ansi) do
    case WinAnsi.decode(byte) do
      # WinAnsi returns 0x0000 for undefined bytes
      0 -> nil
      cp -> {:ok, cp}
    end
  end

  defp try_base_encoding(byte, :mac_roman) do
    case MacRoman.decode(byte) do
      :undefined -> nil
      cp -> {:ok, cp}
    end
  end

  defp try_base_encoding(byte, :standard) do
    case StandardEncoding.decode(byte) do
      :undefined -> nil
      cp -> {:ok, cp}
    end
  end
end