lib/pdf/reader/trailer.ex

defmodule Pdf.Reader.Trailer do
  @moduledoc """
  Locates the `startxref` byte offset in a PDF binary and parses the
  trailer dictionary at a given xref section offset.

  ## PDF spec references
  - § 7.5.5 — File Trailer
  - § 7.5.4 — Cross-Reference Table
  """

  alias Pdf.Reader.Parser

  # ---------------------------------------------------------------------------
  # Struct
  # ---------------------------------------------------------------------------

  @type t :: %__MODULE__{
          root: Pdf.Reader.Document.ref() | nil,
          info: Pdf.Reader.Document.ref() | nil,
          size: pos_integer() | nil,
          encrypt: term(),
          id: [{:hex_string, binary()} | {:string, binary()} | binary()] | nil,
          prev: non_neg_integer() | nil,
          dict: map()
        }

  defstruct root: nil,
            info: nil,
            size: nil,
            encrypt: nil,
            id: nil,
            prev: nil,
            dict: %{}

  # ---------------------------------------------------------------------------
  # Public API
  # ---------------------------------------------------------------------------

  @doc """
  Scans `binary` in reverse for `%%EOF`, then reads the `startxref` offset
  on the line immediately before it.

  Returns `{:ok, offset}` or `{:error, :malformed}`.

  Per PDF spec § 7.5.5: the file ends with `%%EOF`. The line above that is
  the byte offset to the xref section. The line above that is the keyword
  `startxref`.
  """
  @spec locate_startxref(binary()) :: {:ok, non_neg_integer()} | {:error, :malformed}
  def locate_startxref(binary) when is_binary(binary) do
    # Split into lines, iterate from the end to find %%EOF
    lines = binary |> String.split(~r/\r\n|\r|\n/) |> Enum.reverse()
    find_eof_and_offset(lines)
  end

  @doc """
  Parses the xref section and trailer dictionary at `offset` within `binary`.

  Seeks forward from `offset` to the `trailer` keyword, then parses the
  dictionary that follows. Populates a `%Pdf.Reader.Trailer{}` struct.

  Returns `{:ok, %Pdf.Reader.Trailer{}}` or `{:error, :malformed}`.
  """
  @spec parse(binary(), non_neg_integer()) ::
          {:ok, t()} | {:error, :malformed}
  def parse(binary, offset) when is_binary(binary) and is_integer(offset) do
    total = byte_size(binary)

    if offset >= total do
      {:error, :malformed}
    else
      # Slice from offset to end
      slice = binary_part(binary, offset, total - offset)

      # Find "trailer" keyword in the slice, then parse the dict that follows
      case find_trailer_dict(slice) do
        {:ok, dict_binary} ->
          {dict, _rest} = Parser.parse_value(dict_binary)
          {:ok, build_trailer(dict)}

        :error ->
          {:error, :malformed}
      end
    end
  end

  # ---------------------------------------------------------------------------
  # locate_startxref helpers
  # ---------------------------------------------------------------------------

  # Walk reversed lines: skip blank lines, find %%EOF, then read offset + startxref
  defp find_eof_and_offset(lines) do
    lines
    |> Enum.drop_while(&blank_or_whitespace?/1)
    |> do_find_eof()
  end

  defp do_find_eof([line | rest]) do
    if String.trim(line) == "%%EOF" do
      extract_offset(rest)
    else
      # %%EOF not the first non-blank — might be multiple %%EOF blocks (linearized)
      # or junk at end. Keep looking.
      do_find_eof(Enum.drop_while(rest, &(String.trim(&1) != "%%EOF")))
    end
  end

  defp do_find_eof([]), do: {:error, :malformed}

  defp extract_offset(lines) do
    # Skip blank lines to find the offset number
    lines = Enum.drop_while(lines, &blank_or_whitespace?/1)

    case lines do
      [offset_line | rest] ->
        case Integer.parse(String.trim(offset_line)) do
          {offset, ""} ->
            # Verify the line before is `startxref`
            rest = Enum.drop_while(rest, &blank_or_whitespace?/1)

            case rest do
              [kw | _] ->
                if String.trim(kw) == "startxref" do
                  {:ok, offset}
                else
                  # Be lenient — accept valid integer even without keyword verification
                  {:ok, offset}
                end

              [] ->
                {:ok, offset}
            end

          _ ->
            {:error, :malformed}
        end

      [] ->
        {:error, :malformed}
    end
  end

  defp blank_or_whitespace?(str), do: String.trim(str) == ""

  # ---------------------------------------------------------------------------
  # parse helpers
  # ---------------------------------------------------------------------------

  # Find the `trailer` keyword in the slice and return everything after it
  defp find_trailer_dict(slice) do
    case :binary.match(slice, "trailer") do
      {pos, len} ->
        after_keyword = binary_part(slice, pos + len, byte_size(slice) - pos - len)
        # Skip whitespace to reach <<
        trimmed = String.trim_leading(after_keyword)
        {:ok, trimmed}

      :nomatch ->
        :error
    end
  end

  # ---------------------------------------------------------------------------
  # Build Trailer struct from parsed dict
  # ---------------------------------------------------------------------------

  defp build_trailer(dict) when is_map(dict) do
    %__MODULE__{
      root: Map.get(dict, "Root"),
      info: Map.get(dict, "Info"),
      size: extract_integer(dict, "Size"),
      encrypt: Map.get(dict, "Encrypt"),
      id: extract_id(dict),
      prev: extract_integer(dict, "Prev"),
      dict: dict
    }
  end

  defp build_trailer(_), do: %__MODULE__{}

  defp extract_integer(dict, key) do
    case Map.get(dict, key) do
      v when is_integer(v) -> v
      _ -> nil
    end
  end

  defp extract_id(dict) do
    case Map.get(dict, "ID") do
      list when is_list(list) -> list
      _ -> nil
    end
  end
end