# lib/pdf_ex/xref/stream_parser.ex

defmodule PdfEx.XRef.StreamParser do
  @moduledoc false

  alias PdfEx.COS.{Parser, Stream}
  alias PdfEx.Filter

  @doc """
  Parses the cross-reference stream object found at byte `offset` of `bin`.

  The indirect object is read with `Parser.parse_indirect_object/2`. It must be
  a stream whose dictionary carries `Type: :XRef`; its payload is run through
  `Filter.decode/1` and then unpacked into entries.

  Returns `{:ok, entries, dict}`, where `entries` maps `{object_id, generation}`
  keys to `:free`, `{:resolved, field2}` or `{:compressed, field2, field3}`, and
  `dict` is the stream dictionary.

  Returns `{:error, :not_xref_stream}` when the object is not an XRef stream and
  `{:error, :malformed_xref_stream_widths}` when the `:W` entry is unusable. Any
  other `{:error, reason}` produced by a step is passed through unchanged.
  """
  @spec parse(binary(), non_neg_integer()) ::
          {:ok, map(), map()} | {:error, Parser.error_reason()}
  def parse(bin, offset) do
    with {:ok, {_number, _generation, xref = %Stream{dictionary: trailer}}, _rest} <-
           Parser.parse_indirect_object(bin, offset),
         :ok <- verify_type(trailer),
         {:ok, payload} <- Filter.decode(xref),
         {:ok, table} <- decode_entries(payload, trailer) do
      {:ok, table, trailer}
    else
      # Whatever step failed already produced a self-describing error tuple.
      {:error, _reason} = failure ->
        failure

      # An indirect object was parsed, but its value is not a stream.
      {:ok, {_, _, _other_value}, _} ->
        {:error, :not_xref_stream}
    end
  end

  # Accepts only dictionaries whose `:Type` entry is `:XRef`; anything else
  # (including non-map values) is reported as `{:error, :not_xref_stream}`.
  defp verify_type(dict) do
    case dict do
      %{Type: :XRef} -> :ok
      _anything_else -> {:error, :not_xref_stream}
    end
  end

  # Unpacks the decoded stream payload `data` into a map of cross-reference
  # entries, driven by the `:W`, `:Size` and `:Index` keys of `dict`.
  #
  # Returns `{:ok, entries}`, or `{:error, :malformed_xref_stream_widths}` when
  # `:W` is not a three-element list, contains a negative width, or sums to zero.
  defp decode_entries(data, dict) do
    declared_size = Map.get(dict, :Size, 0)

    # /Index is optional; a malformed scalar (non-list) form falls back to the
    # default full-range subsection rather than crashing chunk_pairs/1.
    subsections =
      case Map.get(dict, :Index, [0, declared_size]) do
        index when is_list(index) -> index
        _scalar -> [0, declared_size]
      end

    # A missing /W falls back to the [1, 4, 2] layout.
    case Map.get(dict, :W, [1, 4, 2]) do
      [_, _, _] = raw_widths ->
        raw_widths
        |> Enum.map(&to_integer/1)
        |> List.to_tuple()
        |> decode_subsections(data, subsections)

      _not_three_fields ->
        {:error, :malformed_xref_stream_widths}
    end
  end

  # Rejects unusable field widths before any entry is read: a zero total would
  # make the per-entry stride zero.
  defp decode_subsections({w1, w2, w3}, _data, _subsections)
       when w1 < 0 or w2 < 0 or w3 < 0 or w1 + w2 + w3 == 0 do
    {:error, :malformed_xref_stream_widths}
  end

  # Walks the `{first_id, count}` subsections in order, threading the entry map
  # and the running byte cursor into the payload through the fold.
  defp decode_subsections({w1, w2, w3} = widths, data, subsections) do
    stride = w1 + w2 + w3

    {table, _consumed} =
      subsections
      |> chunk_pairs()
      |> Enum.reduce({%{}, 0}, fn
        {first, count}, {table, cursor} when is_integer(first) and is_integer(count) ->
          updated = parse_section(data, cursor, first, count, widths, table)
          # The cursor advances by the declared count, never backwards.
          {updated, cursor + max(count, 0) * stride}

        # Pairs with a non-integer member are skipped without moving the cursor.
        _non_integer_pair, state ->
          state
      end)

    {:ok, table}
  end

  # Reads up to `count` consecutive entries starting at `data_offset` in `data`
  # and adds them to `acc`, numbering objects upwards from `first_id`.
  # A non-positive `count` leaves `acc` untouched.
  defp parse_section(_data, _data_offset, _first_id, count, _widths, acc) when count <= 0 do
    acc
  end

  defp parse_section(data, data_offset, first_id, count, {w1, w2, w3} = widths, acc) do
    stride = w1 + w2 + w3

    # Only whole entries that actually fit in the remaining payload are read,
    # however many the subsection claims to hold.
    remaining = max(byte_size(data) - data_offset, 0)

    case min(count, div(remaining, stride)) do
      usable when usable > 0 ->
        # parse_entry/4 yields `{key, value}` pairs, collected straight into the map.
        for i <- 0..(usable - 1), into: acc do
          parse_entry(data, data_offset + i * stride, first_id + i, widths)
        end

      _nothing_fits ->
        acc
    end
  end

  # Decodes the single entry stored at `byte_pos` and returns
  # `{{obj_id, generation}, entry}`.
  #
  # The three fields are laid out back to back with widths `w1`, `w2`, `w3`.
  # When the type field is absent (`w1 == 0`) the entry type is taken to be 1.
  defp parse_entry(data, byte_pos, obj_id, {w1, w2, w3}) do
    second_pos = byte_pos + w1
    third_pos = second_pos + w2

    kind =
      case w1 do
        0 -> 1
        _ -> read_field(data, byte_pos, w1)
      end

    second = read_field(data, second_pos, w2)
    third = read_field(data, third_pos, w3)

    build_entry(kind, obj_id, second, third)
  end

  # Type 0: free entry, keyed by the generation held in the third field.
  defp build_entry(0, obj_id, _second, third), do: {{obj_id, third}, :free}

  # Type 1: the second field is kept as the resolved value, the third is the generation.
  defp build_entry(1, obj_id, second, third), do: {{obj_id, third}, {:resolved, second}}

  # Type 2: compressed entry carrying both fields; generation is always 0.
  defp build_entry(2, obj_id, second, third), do: {{obj_id, 0}, {:compressed, second, third}}

  # Any other type value is treated as a free entry with generation 0.
  defp build_entry(_unknown, obj_id, _second, _third), do: {{obj_id, 0}, :free}

  # Reads `width` bytes of `data` starting at `offset` as an unsigned
  # big-endian integer.
  #
  # Returns 0 for a zero-width field and for any request that cannot be
  # satisfied from `data`: a range running past the end, a negative `offset`
  # or `width`, or non-integer arguments.
  defp read_field(_data, _offset, 0), do: 0

  # The sign checks matter: binary_part/3 accepts a negative length and would
  # read the bytes *before* `offset`, and a negative start raises ArgumentError.
  # Both now fall through to the out-of-range clause instead.
  defp read_field(data, offset, width)
       when is_integer(offset) and is_integer(width) and offset >= 0 and width > 0 and
              offset + width <= byte_size(data) do
    data
    |> binary_part(offset, width)
    |> :binary.decode_unsigned(:big)
  end

  defp read_field(_data, _offset, _width), do: 0

  # Groups a flat list into consecutive `{first, count}` tuples. A trailing
  # element without a partner is dropped.
  defp chunk_pairs(list) when is_list(list) do
    list
    |> Enum.chunk_every(2, 2, :discard)
    |> Enum.map(fn [first, count] -> {first, count} end)
  end

  # Passes integers through unchanged and substitutes 1 for any other value.
  # NOTE(review): the fallback of 1 for a non-integer width looks like a
  # leniency choice; confirm it is preferable to rejecting the stream.
  defp to_integer(value) do
    if is_integer(value), do: value, else: 1
  end
end