lib/pdf/reader/object_resolver.ex

defmodule Pdf.Reader.ObjectResolver do
  @moduledoc """
  Lazy indirect-object resolver with Map-based cache.

  Per the design (`sdd/pdf-reader-core/design` § 5 "Lazy Resolution Contract"):

  - **Cache**: a plain `Map` on `%Pdf.Reader.Document{}.cache`. No GenServer.
    Key: `{obj_num, gen_num}`. Value: the resolved Elixir term.
  - **Signature**: `resolve(doc, {:ref, n, g}) :: {:ok, value, doc} | {:error, reason}`.
    The returned `doc` carries the updated cache. The caller threads `doc` forward
    for cache benefit; dropping the updated doc still yields correct results on
    the next call (re-parse, same value — the binary is immutable).
  - **Idempotent**: calling `resolve/2` twice on the same ref with the same doc
    returns the same value. The cache is a hint, not state.

  ## Resolution paths

  1. **Cache hit**: the value for `{n, g}` is already in `doc.cache` → immediate return.
  2. **In-use (classic)**: look up `{n, g}` in `doc.xref` → `{:in_use, offset, _gen}`.
     Slice `binary_part(doc.binary, offset, ...)`, run `Parser.parse_object/1`.
  3. **Compressed (ObjStm)**: look up `{n, g}` → `{:compressed, objstm_n, index}`.
     Recursively `resolve(doc, {:ref, objstm_n, 0})`, decode filters, then
     `ObjectStream.fetch/3`.
  4. **Free / absent**: `{:error, {:unresolved_ref, {n, g}}}`.

  ## Ref chasing

  `resolve/2` does **NOT** automatically follow nested refs. If a resolved value
  is itself `{:ref, _, _}`, the caller decides whether to chase it. This avoids
  infinite loops on circular references and keeps the interface predictable.
  """

  alias Pdf.Reader.{Document, Filter, ObjectStream, Parser}
  alias Pdf.Reader.Encryption.{StandardHandler, V1V2, V4, V5}

  # ---------------------------------------------------------------------------
  # Public API
  # ---------------------------------------------------------------------------

  @doc """
  Resolve an indirect object reference to its value.

  Returns `{:ok, value, updated_doc}` on success, where `updated_doc` has
  the resolved value cached. Returns `{:error, reason}` on failure.

  The caller should thread the returned `doc` forward to benefit from caching
  on subsequent resolutions.

  A cache entry is recognised by the presence of its `{n, g}` key, so a cached
  value of `nil` is served from the cache like any other value.

  ## Error reasons

  - `{:error, {:unresolved_ref, {n, g}}}` — ref is absent from xref, is a free
    entry, or its recorded offset lies at or past the end of the binary.
  - `{:error, :malformed}` — parse failure.
  - `{:error, {:objstm_unsupported, :not_a_stream}}` — the object stream holding
    a compressed object did not resolve to a stream.
  - `{:error, {:objstm_unsupported, :missing_first}}` — the object stream dict
    has no integer `/First` entry.
  - `{:error, {:unsupported_filter, name}}` — propagated from filter chain.

  Any other `{:error, reason}` produced while decoding or reading an object
  stream is returned unchanged.
  """
  @spec resolve(Document.t(), {:ref, pos_integer(), non_neg_integer()}) ::
          {:ok, term(), Document.t()} | {:error, term()}
  def resolve(%Document{} = doc, {:ref, n, g}) do
    key = {n, g}

    # Map.fetch/2 (not Map.get/2) so that a stored `nil` is told apart from a
    # missing key; otherwise such an entry would be re-parsed on every call.
    case Map.fetch(doc.cache, key) do
      {:ok, value} ->
        # Cache hit — return immediately, doc unchanged.
        {:ok, value, doc}

      :error ->
        # Cache miss — look up in xref and resolve from binary.
        do_resolve(doc, key)
    end
  end

  # ---------------------------------------------------------------------------
  # Internal — xref dispatch
  # ---------------------------------------------------------------------------

  # Looks `key` up in `doc.xref` and hands off to the resolution path that
  # matches the entry kind. Entries that are missing or marked `:free` cannot
  # be resolved and are reported as `{:unresolved_ref, key}`.
  defp do_resolve(doc, {_obj_num, _gen_num} = key) do
    case Map.get(doc.xref, key) do
      {:in_use, byte_offset, _gen} ->
        resolve_in_use(doc, key, byte_offset)

      {:compressed, container_num, slot} ->
        resolve_compressed(doc, key, container_num, slot)

      entry when entry in [nil, :free] ->
        {:error, {:unresolved_ref, key}}
    end
  end

  # ---------------------------------------------------------------------------
  # Internal — in-use resolution path
  # ---------------------------------------------------------------------------

  # Parses the object found at byte `offset` of `doc.binary`, passes the parsed
  # value through the decryption hook, and caches the outcome under `key`.
  #
  # Returns `{:ok, value, updated_doc}`, `{:error, {:unresolved_ref, key}}` when
  # `offset` is not inside the binary, or `{:error, :malformed}` when the parser
  # reports an error.
  defp resolve_in_use(doc, {obj_num, gen_num} = key, offset) do
    pdf_size = byte_size(doc.binary)

    if offset < pdf_size do
      # The parser is given everything from `offset` up to the end of the binary.
      tail = binary_part(doc.binary, offset, pdf_size - offset)

      case Parser.parse_object(tail) do
        {:ok, _ref, parsed, _rest} ->
          # R-ENC9: the decryption hook sits AFTER the parse and BEFORE the cache write.
          # R-ENC11: with doc.encryption == nil the hook hands `parsed` back untouched.
          plain = maybe_decrypt_value(parsed, obj_num, gen_num, doc.encryption)

          # Store the (possibly decrypted) value so later lookups hit the cache.
          {:ok, plain, %{doc | cache: Map.put(doc.cache, key, plain)}}

        {:error, _} ->
          {:error, :malformed}
      end
    else
      {:error, {:unresolved_ref, key}}
    end
  end

  # ---------------------------------------------------------------------------
  # Internal — compressed resolution path (ObjStm)
  # ---------------------------------------------------------------------------

  # Resolves an object that is stored inside an object stream (ObjStm).
  #
  # Per design § 5, ObjStm resolution steps:
  # 0. Refuse a container whose own xref entry is `:compressed` (see the guard below).
  # 1. Recursively resolve the ObjStm object itself → {:stream, dict, raw_bytes}.
  #    The ObjStm stream's raw bytes are decrypted by resolve_in_use/3 (the encryption
  #    hook fires there). After that point the ObjStm body is already plaintext.
  # 2. Apply filter chain to decode the body.
  # 3. Call ObjectStream.fetch/3 with /First, the decoded body, and index.
  # 4. Cache the result under `key` and return it together with the updated doc.
  #
  # Returns `{:ok, value, updated_doc}` on success,
  # `{:error, {:objstm_unsupported, :not_a_stream}}` when the container is not a
  # stream, and passes through any `{:error, _}` produced by one of the steps.
  #
  # INVARIANT (R-ENC10): This function MUST NOT apply any decryption to inner
  # objects. Objects extracted from an ObjStm are already plaintext after the
  # ObjStm stream itself was decrypted in step 1. Decrypting them here would
  # corrupt their values (double-decryption). Never add a decrypt_value call
  # anywhere in this function.
  defp resolve_compressed(doc, key, objstm_n, index) do
    case Map.get(doc.xref, {objstm_n, 0}) do
      {:compressed, _outer_objstm, _outer_index} ->
        # A stream object can never be stored inside an object stream, so a
        # container that is itself listed as compressed cannot yield a stream.
        # Rejecting it before recursing also keeps a crafted xref (a container
        # compressed inside itself, or a cycle of containers) from recursing
        # through resolve/2 without end.
        {:error, {:objstm_unsupported, :not_a_stream}}

      _in_use_free_or_absent ->
        with {:ok, stream_value, doc2} <- resolve(doc, {:ref, objstm_n, 0}),
             {:stream, dict, raw_body} <- ensure_stream(stream_value),
             {:ok, decoded_body} <- decode_stream_body(dict, raw_body),
             {:ok, first} <- extract_first(dict),
             {:ok, value} <- ObjectStream.fetch(first, decoded_body, index) do
          # Cache the compressed object under its own key.
          updated_doc = %{doc2 | cache: Map.put(doc2.cache, key, value)}
          {:ok, value, updated_doc}
        else
          {:error, _} = err -> err
          :not_a_stream -> {:error, {:objstm_unsupported, :not_a_stream}}
        end
    end
  end

  # Passes a `{:stream, dict, body}` triple through unchanged; every other term
  # is answered with the `:not_a_stream` marker.
  defp ensure_stream(candidate) do
    case candidate do
      {:stream, _dict, _body} -> candidate
      _other -> :not_a_stream
    end
  end

  # Runs the ObjStm stream body through the filter chain named by the dict.
  #
  # With no "Filter" entry the body is returned as `{:ok, raw_body}` untouched.
  # Otherwise the "Filter" value is forwarded as-is to Filter.apply_chain/3,
  # together with "DecodeParms" (or an empty map when that entry is missing).
  # Filter names may be {:name, binary()} tuples (from the parser) or plain
  # strings; Filter.apply_chain/3 handles both via its internal resolve_module/1.
  defp decode_stream_body(dict, raw_body) do
    case Map.get(dict, "Filter") do
      nil ->
        {:ok, raw_body}

      filter_spec ->
        decode_parms = Map.get(dict, "DecodeParms") || %{}
        Filter.apply_chain(raw_body, filter_spec, decode_parms)
    end
  end

  # Reads the "First" entry of an ObjStm dict. Only an integer is accepted;
  # a missing or non-integer entry yields the `:missing_first` error.
  defp extract_first(dict) do
    first = Map.get(dict, "First")

    if is_integer(first) do
      {:ok, first}
    else
      {:error, {:objstm_unsupported, :missing_first}}
    end
  end

  # ---------------------------------------------------------------------------
  # Encryption value walker (R-ENC9, R-ENC11, R-ENC12, R-ENC16)
  # ---------------------------------------------------------------------------

  # Entry point of the decryption hook.
  #
  # With a %StandardHandler{} the value is walked by decrypt_value/4; the choice
  # of the version-specific decrypt module is made further down, in
  # do_decrypt_stream/5 and do_decrypt_string/4.
  defp maybe_decrypt_value(parsed, obj_num, gen_num, %StandardHandler{} = std_handler) do
    decrypt_value(parsed, obj_num, gen_num, std_handler)
  end

  # R-ENC11: no encryption configured (nil) — the value is returned as-is and
  # nothing else runs.
  defp maybe_decrypt_value(parsed, _obj_num, _gen_num, nil), do: parsed

  # Recursive walker that decrypts every stream body and string leaf of a parsed
  # value, using the object number / generation of the enclosing indirect object.

  # Streams (R-ENC12, R-ENC16): the raw bytes go through the version-specific
  # stream decryptor, and the stream dict is walked as well because it may hold
  # string values (e.g. /Author) that need decrypting too.
  # R-ENC15: /EncryptMetadata false and /Identity crypt filter short-circuit in V4/V5.
  # If the decryptor does not answer `{:ok, plain}` (e.g. bad PKCS7 padding with a
  # wrong key) the ciphertext is kept, so that a failed decryption does not crash.
  defp decrypt_value({:stream, dict, raw_bytes}, obj_num, gen_num, handler) do
    body =
      raw_bytes
      |> do_decrypt_stream(dict, obj_num, gen_num, handler)
      |> plain_or_original(raw_bytes)

    {:stream, decrypt_dict_values(dict, obj_num, gen_num, handler), body}
  end

  # Literal and hex strings (R-ENC16): decrypted with the per-object key; the tag
  # is preserved and the original bytes are kept when decryption does not succeed.
  defp decrypt_value({tag, bytes}, obj_num, gen_num, handler)
       when tag in [:string, :hex_string] and is_binary(bytes) do
    plain =
      bytes
      |> do_decrypt_string(obj_num, gen_num, handler)
      |> plain_or_original(bytes)

    {tag, plain}
  end

  # Dicts (R-ENC16): every value is walked.
  defp decrypt_value(dict, obj_num, gen_num, handler) when is_map(dict) do
    decrypt_dict_values(dict, obj_num, gen_num, handler)
  end

  # Arrays (R-ENC16): every element is walked.
  defp decrypt_value(list, obj_num, gen_num, handler) when is_list(list) do
    Enum.map(list, fn element -> decrypt_value(element, obj_num, gen_num, handler) end)
  end

  # Anything else (numbers, bools, names, refs, null, atoms) is left untouched.
  defp decrypt_value(other, _obj_num, _gen_num, _handler), do: other

  # Picks the plaintext out of an `{:ok, plain}` result; any other result falls
  # back to the bytes that were passed to the decryptor.
  defp plain_or_original({:ok, plain}, _original), do: plain
  defp plain_or_original(_failure, original), do: original

  # ---------------------------------------------------------------------------
  # Dict recursion — decrypt all string values in a dict map
  # ---------------------------------------------------------------------------

  # Rebuilds `dict` with the same keys, each value passed through decrypt_value/4
  # with the same object number, generation and handler.
  defp decrypt_dict_values(dict, obj_num, gen_num, handler) when is_map(dict) do
    for {name, entry} <- dict, into: %{} do
      {name, decrypt_value(entry, obj_num, gen_num, handler)}
    end
  end

  # ---------------------------------------------------------------------------
  # Version dispatch for stream decryption
  # ---------------------------------------------------------------------------

  # Routes stream decryption to the module matching the handler's `version`:
  # 1 and 2 → V1V2, 4 → V4, 5 → V5. Any other handler leaves the bytes alone and
  # answers `{:ok, bytes}`.
  defp do_decrypt_stream(bytes, stream_dict, obj_num, gen_num, handler) do
    case handler do
      %StandardHandler{version: version} when version in [1, 2] ->
        V1V2.decrypt_stream(bytes, stream_dict, obj_num, gen_num, handler)

      %StandardHandler{version: 4} ->
        V4.decrypt_stream(bytes, stream_dict, obj_num, gen_num, handler)

      %StandardHandler{version: 5} ->
        V5.decrypt_stream(bytes, stream_dict, obj_num, gen_num, handler)

      _unrecognised ->
        {:ok, bytes}
    end
  end

  # ---------------------------------------------------------------------------
  # Version dispatch for string decryption
  # ---------------------------------------------------------------------------

  # Routes string decryption to the module matching the handler's `version`:
  # 1 and 2 → V1V2, 4 → V4, 5 → V5. Any other handler leaves the bytes alone and
  # answers `{:ok, bytes}`.
  defp do_decrypt_string(bytes, obj_num, gen_num, handler) do
    case handler do
      %StandardHandler{version: version} when version in [1, 2] ->
        V1V2.decrypt_string(bytes, obj_num, gen_num, handler)

      %StandardHandler{version: 4} ->
        V4.decrypt_string(bytes, obj_num, gen_num, handler)

      %StandardHandler{version: 5} ->
        V5.decrypt_string(bytes, obj_num, gen_num, handler)

      _unrecognised ->
        {:ok, bytes}
    end
  end
end