# lib/pdf/reader/xref/classic.ex

defmodule Pdf.Reader.XRef.Classic do
  @moduledoc """
  Parses a classic PDF cross-reference table (keyword `xref`).

  Per PDF spec § 7.5.4:
  - Starts with the `xref` keyword on its own line.
  - Followed by one or more subsections. Each subsection has a header line
    `<first_obj_num> <count>` and then exactly `count` 20-byte entries.
  - Each entry format: `<10-digit-offset> <5-digit-gen> <n|f><EOL>`
    where EOL is `\\r\\n`, ` \\r`, or ` \\n` (3 variants = 20 bytes total).
  - After all subsections, a `trailer` keyword + dictionary.

  Malformed input is reported as `{:error, reason}` rather than raising, so a
  caller can decide how to recover from a damaged table.
  """

  alias Pdf.Reader.Parser

  @type xref_entry ::
          {:in_use, non_neg_integer(), non_neg_integer()}
          | :free

  # Object number 0 (head of the free list) is a legitimate key, hence
  # `non_neg_integer()` rather than `pos_integer()`.
  @type entries :: %{{non_neg_integer(), non_neg_integer()} => xref_entry()}

  @type parse_error ::
          :offset_out_of_range
          | :not_an_xref
          | :invalid_xref_subsection
          | :invalid_xref_entry

  # ---------------------------------------------------------------------------
  # Public API
  # ---------------------------------------------------------------------------

  @doc """
  Parses a classic xref table starting at `offset` within `binary`.

  Returns `{:ok, entries_map}` where keys are `{obj_num, gen_num}` and values
  are `{:in_use, offset, gen}` or `:free`. When the same key appears more than
  once, the last occurrence wins.

  Returns `{:error, reason}` if the binary at that offset is not a valid
  classic xref section:

  - `:offset_out_of_range` — `offset` is negative or not inside `binary`.
  - `:not_an_xref` — the data at `offset` does not start with `xref`.
  - `:invalid_xref_subsection` — a subsection header is missing, malformed or
    has a negative object number / count (this is also returned when the input
    ends before a `trailer` keyword is seen).
  - `:invalid_xref_entry` — an entry is truncated or its offset / generation
    fields are not all decimal digits.
  """
  @spec parse(binary(), integer()) :: {:ok, entries()} | {:error, parse_error()}
  def parse(binary, offset) when is_binary(binary) and is_integer(offset) do
    total = byte_size(binary)

    # A negative offset must be rejected here: `binary_part/3` would raise.
    if offset < 0 or offset >= total do
      {:error, :offset_out_of_range}
    else
      binary
      |> binary_part(offset, total - offset)
      |> do_parse()
    end
  end

  # Public but unused here — keep for downstream use
  #
  # Looks for the first occurrence of the `trailer` keyword at or after
  # `offset` and hands everything following it (leading whitespace trimmed) to
  # `Parser.parse_value/1`. The match below asserts that the parser returns a
  # `{value, rest}` tuple; any other shape raises a `MatchError`.
  @doc false
  @spec parse_trailer_dict(binary(), integer()) ::
          {:ok, term()} | {:error, :malformed | :no_trailer}
  def parse_trailer_dict(binary, offset) do
    total = byte_size(binary)

    # Same guard as `parse/2`: never let a negative offset reach `binary_part/3`.
    if offset < 0 or offset >= total do
      {:error, :malformed}
    else
      slice = binary_part(binary, offset, total - offset)

      case :binary.match(slice, "trailer") do
        {pos, len} ->
          after_kw = binary_part(slice, pos + len, byte_size(slice) - pos - len)
          {dict, _rest} = Parser.parse_value(String.trim_leading(after_kw))
          {:ok, dict}

        :nomatch ->
          {:error, :no_trailer}
      end
    end
  end

  # ---------------------------------------------------------------------------
  # Internal parsing
  # ---------------------------------------------------------------------------

  # `parse_subsections/2` skips leading whitespace itself, so the EOL after the
  # keyword needs no separate handling here.
  defp do_parse(<<"xref", rest::binary>>), do: parse_subsections(rest, %{})
  defp do_parse(_), do: {:error, :not_an_xref}

  # Consumes subsections until the "trailer" keyword is reached.
  defp parse_subsections(rest, entries) do
    case skip_whitespace(rest) do
      # End of xref — next is "trailer"
      <<"trailer", _::binary>> ->
        {:ok, entries}

      body ->
        # Entries are written straight into the running map, so later
        # subsections override earlier ones for the same key.
        with {:ok, first, count, after_header} <- parse_subsection_header(body),
             {:ok, updated, after_entries} <- parse_entries(after_header, first, count, entries) do
          parse_subsections(after_entries, updated)
        end
    end
  end

  # Parses the "first count<EOL>" subsection header. Both numbers must be
  # non-negative; a negative count would otherwise never reach the `0` base
  # case of `parse_entries/4`.
  defp parse_subsection_header(bin) do
    with {first, after_first} when first >= 0 <- Integer.parse(bin),
         {count, after_count} when count >= 0 <- Integer.parse(skip_spaces(after_first)) do
      # Tolerate trailing blanks before the EOL so the fixed-width entries that
      # follow stay aligned.
      {:ok, first, count, after_count |> skip_spaces() |> skip_eol()}
    else
      _ -> {:error, :invalid_xref_subsection}
    end
  end

  # Parses `remaining` 20-byte entries, numbering objects from `obj_num`.
  # Returns `{:ok, acc, rest}` or `{:error, :invalid_xref_entry}`.
  defp parse_entries(rest, _obj_num, 0, acc), do: {:ok, acc, rest}

  defp parse_entries(
         <<offset_str::binary-size(10), _sp1, gen_str::binary-size(5), _sp2, kind,
           _eol::binary-size(2), rest::binary>>,
         obj_num,
         remaining,
         acc
       ) do
    case decode_entry(offset_str, gen_str, kind) do
      {:ok, gen, entry} ->
        # Key: {obj_num, gen_num}
        parse_entries(rest, obj_num + 1, remaining - 1, Map.put(acc, {obj_num, gen}, entry))

      :error ->
        {:error, :invalid_xref_entry}
    end
  end

  # Fewer than 20 bytes left although more entries were announced.
  defp parse_entries(_truncated, _obj_num, _remaining, _acc), do: {:error, :invalid_xref_entry}

  # Turns the raw fields of one entry into `{:ok, gen, xref_entry}`.
  # Only `f` marks a free entry; any other kind byte is treated as in-use,
  # which keeps the parser lenient towards sloppy writers.
  defp decode_entry(offset_str, gen_str, kind) do
    with {:ok, offset} <- parse_digits(offset_str),
         {:ok, gen} <- parse_digits(gen_str) do
      entry = if kind == ?f, do: :free, else: {:in_use, offset, gen}
      {:ok, gen, entry}
    end
  end

  # Converts a non-empty, digits-only binary to an integer without raising.
  defp parse_digits(str) do
    if str != "" and digits_only?(str) do
      {:ok, String.to_integer(str)}
    else
      :error
    end
  end

  defp digits_only?(<<c, rest::binary>>) when c in ?0..?9, do: digits_only?(rest)
  defp digits_only?(<<>>), do: true
  defp digits_only?(_), do: false

  # ---------------------------------------------------------------------------
  # Helpers
  # ---------------------------------------------------------------------------

  # Drops at most one end-of-line marker (CRLF, CR or LF).
  defp skip_eol(<<?\r, ?\n, rest::binary>>), do: rest
  defp skip_eol(<<?\r, rest::binary>>), do: rest
  defp skip_eol(<<?\n, rest::binary>>), do: rest
  defp skip_eol(rest), do: rest

  # Drops every leading space, tab, CR, LF, form feed and NUL byte.
  defp skip_whitespace(<<c, rest::binary>>) when c in [?\s, ?\t, ?\r, ?\n, ?\f, 0] do
    skip_whitespace(rest)
  end

  defp skip_whitespace(rest), do: rest

  # Drops leading spaces and tabs only (stays on the current line).
  defp skip_spaces(<<c, rest::binary>>) when c in [?\s, ?\t], do: skip_spaces(rest)
  defp skip_spaces(rest), do: rest
end