lib/pdf/reader/page.ex

Select File:
lib/pdf/reader/page.ex

defmodule Pdf.Reader.Page do
  @moduledoc """
  Page tree walker for `Pdf.Reader`.

  Spec reference: PDF 1.7 § 7.7.3 (Page Tree), § 7.7.3.4 (Inheritance of Page Attributes).

  ## Page tree structure

  The Catalog's `/Pages` entry points to the root of the page tree.
  A node with `/Type /Pages` is an intermediate node containing a `/Kids`
  array of refs to child nodes (either `/Pages` or `/Page`).
  A node with `/Type /Page` is a leaf — one actual page.

  ## API

      list_refs(doc) :: {:ok, [ref], updated_doc} | {:error, reason}

  Walks the tree recursively, collecting leaf `/Page` refs in document order.
  Threads `doc` forward so that resolved objects accumulate in the cache.

  ## Catalog/Pages tree fallback (R-4)

  When `doc.recover_mode` is `true` and the normal tree walk fails (missing
  `/Root`, dangling `/Pages` ref, or other catalog resolution error), the
  recovery branch scans the xref table directly for objects that match ALL of:

  - `/Type /Page` in the object dict
  - Either `/Contents` OR `/Parent` present (disambiguates from Form XObjects
    which also carry `/Type /XObject /Subtype /Form`)

  The recovered list is in xref-insertion order, NOT document order. This
  known limitation is by design — reconstruction from corrupt trees is
  unreliable. A `{:page_tree_recovered, n}` event is appended to the
  `recovery_log` so callers know page order may differ.

  ## Known limitations (R-4)

  - **Page order loss** — catalog-fallback page order follows xref-insertion
    order, not the original document order. `/Parent` chain reconstruction is
    not attempted (unreliable on corrupt trees). The `{:page_tree_recovered, n}`
    event explicitly signals this to callers.

  - **Encrypted AND corrupted PDFs** — when both the xref table and the catalog
    are corrupt, the R-3 linear scan reconstructs the xref but cannot include
    `/Encrypt` in the synthetic trailer. Without `/Encrypt`, decryption cannot
    proceed and the PDF is non-decryptable even with `recover: true`.

  Spec citations:
  - PDF 1.7 § 7.7.2 — Document catalog (Catalog dict, /Pages entry)
  - PDF 1.7 § 7.7.3 — Page tree (/Pages /Kids traversal)
  - PDF 1.7 § 7.7.3.4 — Inheritance of page attributes
  """

  alias Pdf.Reader.{Document, ObjectResolver}

  # `ref_key/1` and `extract_kid_key/1` keep a defensive fallback clause that
  # accepts already-unwrapped `{n, g}` tuples (or any other shape, in
  # `extract_kid_key/1`'s case). Dialyzer's success-typing on the call sites
  # proves the input is always a parser-emitted `{:ref, _, _}` tuple — the
  # fallback is unreachable today but cheap insurance against parser shape
  # widening in the future. Silence the "pattern can never match" warnings.
  # Reference for the wrap convention: PDF 1.7 § 7.3.10 (indirect references).
  @dialyzer {:nowarn_function, ref_key: 1, extract_kid_key: 1}

  @doc """
  Walks the page tree and returns a list of leaf `/Page` object refs in order.

  Returns `{:ok, refs, updated_doc}` where:
  - `refs` is `[{obj_num, gen_num}]` in page order (or xref order in fallback)
  - `updated_doc` has cache populated from the traversal

  Returns `{:error, reason}` if the page tree cannot be traversed and
  `recover_mode` is `false`.

  When `recover_mode` is `true` and traversal fails, falls back to xref scan
  and appends `{:page_tree_recovered, n}` to `recovery_log`.
  """
  @spec list_refs(Document.t()) ::
          {:ok, [Document.ref()], Document.t()} | {:error, term()}
  def list_refs(%Document{} = doc) do
    # If page_refs were already resolved and cached (e.g. by the R-4 probe in
    # do_open/2 when recover_mode: true), return them immediately without
    # re-walking the tree.
    case doc.page_refs do
      refs when is_list(refs) and refs != [] ->
        {:ok, refs, doc}

      _ ->
        case strict_list_refs(doc) do
          {:ok, refs, doc2} ->
            {:ok, refs, doc2}

          {:error, reason} when doc.recover_mode ->
            recover_pages(doc, reason)

          {:error, _} = err ->
            err
        end
    end
  end

  # ---------------------------------------------------------------------------
  # Internal — strict tree walk
  # ---------------------------------------------------------------------------

  defp strict_list_refs(%Document{trailer: trailer} = doc) do
    case Map.get(trailer, "Root") do
      nil ->
        {:error, :no_pages}

      root_ref ->
        with {:ok, catalog, doc2} <- ObjectResolver.resolve(doc, root_ref),
             {:ok, pages_ref} <- fetch_pages_ref_or_self(root_ref, catalog),
             {:ok, refs, doc3} <- walk_kids(doc2, pages_ref) do
          {:ok, refs, doc3}
        end
    end
  end

  # Attempt to fetch the /Pages ref from the catalog dict.
  # If the root dict IS itself a /Pages node (no Catalog wrapper — some legacy PDFs),
  # return the root ref directly so the walker treats it as the pages tree root.
  defp fetch_pages_ref_or_self(root_ref, catalog) when is_map(catalog) do
    case Map.get(catalog, "Pages") do
      nil ->
        # If the root is already a /Pages node, use it directly as the tree root
        case Map.get(catalog, "Type") do
          {:name, "Pages"} -> {:ok, root_ref}
          _ -> {:error, :no_pages}
        end

      ref ->
        {:ok, ref}
    end
  end

  defp fetch_pages_ref_or_self(_root_ref, _), do: {:error, :no_pages}

  # ---------------------------------------------------------------------------
  # Internal — R-4 catalog/pages fallback
  # ---------------------------------------------------------------------------

  # Scan xref entries for /Type /Page objects that also carry /Contents or /Parent.
  # This filters out Form XObjects which may appear to have /Type /Page in some
  # malformed PDFs, or stream dicts with /Type /XObject /Subtype /Form which
  # definitely do NOT have /Parent or /Contents.
  defp recover_pages(doc, _reason) do
    page_refs = scan_xref_for_pages(doc)
    doc1 = Document.log_recovery(doc, {:page_tree_recovered, length(page_refs)})
    {:ok, page_refs, doc1}
  end

  defp scan_xref_for_pages(%Document{xref: xref} = doc) do
    xref
    |> Enum.flat_map(fn
      {{n, g}, {:in_use, _, _}} when is_integer(n) and is_integer(g) and n > 0 ->
        ref = {:ref, n, g}

        case ObjectResolver.resolve(doc, ref) do
          {:ok, dict, _} when is_map(dict) ->
            if is_page_dict?(dict), do: [{n, g}], else: []

          _ ->
            []
        end

      _ ->
        []
    end)
    |> Enum.sort()
  end

  # A dict qualifies as a recoverable page if:
  # 1. It has /Type /Page
  # 2. It has either /Contents or /Parent (to exclude Form XObjects and bare stream dicts)
  defp is_page_dict?(dict) do
    has_page_type =
      case Map.get(dict, "Type") do
        {:name, "Page"} -> true
        _ -> false
      end

    has_page_type and (Map.has_key?(dict, "Contents") or Map.has_key?(dict, "Parent"))
  end

  # ---------------------------------------------------------------------------
  # Internal — strict tree traversal (used from both strict_list_refs and
  # lenient walk_pages_node in recovery mode)
  # ---------------------------------------------------------------------------

  # Resolve a ref and dispatch on /Type
  defp walk_kids(doc, {:ref, _, _} = ref) do
    case ObjectResolver.resolve(doc, ref) do
      {:ok, node, doc2} -> walk_node(doc2, ref, node)
      {:error, _} = err -> err
    end
  end

  # Walk an already-resolved node — just dispatch on type
  defp walk_node(doc, ref, node) when is_map(node) do
    case Map.get(node, "Type") do
      {:name, "Pages"} -> walk_pages_node(doc, node)
      {:name, "Page"} -> {:ok, [ref_key(ref)], doc}
      # Fallback: if a Kids array exists treat as intermediate
      _ when is_map_key(node, "Kids") -> walk_pages_node(doc, node)
      _ -> {:error, {:malformed, :page_tree, %{unexpected_type: Map.get(node, "Type")}}}
    end
  end

  defp walk_node(_doc, _ref, other),
    do: {:error, {:malformed, :page_tree, %{expected_dict: other}}}

  # Walk a /Pages intermediate node: iterate Kids, collect leaf refs.
  #
  # In strict mode (recover_mode false): uses Enum.reduce_while — first error halts.
  # In lenient mode (recover_mode true): uses Enum.reduce — bad kids are logged and
  # skipped, good kids accumulate. Appends {:page_failed, ref, reason} to recovery_log.
  defp walk_pages_node(%Document{recover_mode: true} = doc, %{"Kids" => kids})
       when is_list(kids) do
    {refs, final_doc} =
      Enum.reduce(kids, {[], doc}, fn kid_ref, {acc_refs, acc_doc} ->
        case walk_kids(acc_doc, kid_ref) do
          {:ok, new_refs, updated_doc} ->
            {acc_refs ++ new_refs, updated_doc}

          {:error, reason} ->
            # Log the bad kid and continue — do NOT halt
            kid_key = extract_kid_key(kid_ref)
            updated_doc = Document.log_recovery(acc_doc, {:page_failed, kid_key, reason})
            {acc_refs, updated_doc}
        end
      end)

    {:ok, refs, final_doc}
  end

  defp walk_pages_node(doc, %{"Kids" => kids}) when is_list(kids) do
    Enum.reduce_while(kids, {:ok, [], doc}, fn kid_ref, {:ok, acc_refs, acc_doc} ->
      case walk_kids(acc_doc, kid_ref) do
        {:ok, new_refs, updated_doc} -> {:cont, {:ok, acc_refs ++ new_refs, updated_doc}}
        {:error, _} = err -> {:halt, err}
      end
    end)
  end

  defp walk_pages_node(_doc, node),
    do: {:error, {:malformed, :page_tree, %{missing_kids: node}}}

  # ---------------------------------------------------------------------------
  # Internal helpers
  # ---------------------------------------------------------------------------

  # Extract the {n, g} key from a ref tuple
  defp ref_key({:ref, n, g}), do: {n, g}
  defp ref_key({n, g}), do: {n, g}

  # Extract a display key from a kid ref for the recovery log
  defp extract_kid_key({:ref, n, g}), do: {n, g}
  defp extract_kid_key(other), do: other
end