defmodule PdfEx.XRef.StreamParser do
  @moduledoc false

  # Decodes a cross-reference *stream* object into a map of xref entries.
  #
  # The decoded stream body is treated as a sequence of fixed-size records.
  # Each record holds three big-endian unsigned fields whose byte widths come
  # from the dictionary's `:W` entry. Records are grouped into subsections
  # described by `:Index` (pairs of `first_object_number, count`), which
  # defaults to a single subsection `[0, Size]`.
  #
  # Decoding is deliberately lenient about truncated data and malformed
  # `:Index` values (they yield fewer entries rather than an error), but it is
  # strict about `:W`: a wrong width would silently misalign every record.

  alias PdfEx.COS.{Parser, Stream}
  alias PdfEx.Filter

  # Field widths assumed when the dictionary carries no `:W` entry.
  @default_widths [1, 4, 2]

  @doc """
  Parses the indirect object found at `offset` in `bin` as a cross-reference stream.

  Returns `{:ok, entries, dict}` where `dict` is the stream dictionary and
  `entries` maps `{object_number, generation}` to one of:

    * `:free`
    * `{:resolved, field2}`
    * `{:compressed, field2, field3}`

  Returns `{:error, :not_xref_stream}` when the object is not a stream or its
  dictionary does not have `Type: :XRef`, and
  `{:error, :malformed_xref_stream_widths}` when `:W` is not a list of exactly
  three non-negative integers with a non-zero sum. Any `{:error, reason}`
  produced by the object parser or the filter decoder is passed through.
  """
  @spec parse(binary(), non_neg_integer()) ::
          {:ok, map(), map()} | {:error, Parser.error_reason()}
  def parse(bin, offset) do
    with {:ok, {_obj_num, _gen_num, %Stream{dictionary: dict} = stream}, _next} <-
           Parser.parse_indirect_object(bin, offset),
         :ok <- verify_type(dict),
         {:ok, decompressed} <- Filter.decode(stream),
         {:ok, entries} <- decode_entries(decompressed, dict) do
      {:ok, entries, dict}
    else
      # The object parsed fine but is not a stream at all.
      {:ok, {_, _, _non_stream}, _} -> {:error, :not_xref_stream}
      {:error, _} = err -> err
    end
  end

  # Accepts only dictionaries explicitly tagged `Type: :XRef`.
  defp verify_type(%{Type: :XRef}), do: :ok
  defp verify_type(_), do: {:error, :not_xref_stream}

  # Walks every `:Index` subsection in order, accumulating entries into one map.
  # The running byte offset advances by the *declared* count of each subsection,
  # even when the data turned out to be shorter than declared.
  defp decode_entries(data, dict) do
    with {:ok, {w1, w2, w3} = widths} <- fetch_widths(dict) do
      entry_size = w1 + w2 + w3

      {entries, _consumed} =
        dict
        |> index_pairs()
        |> Enum.reduce({%{}, 0}, fn {first, count}, {acc, data_offset} ->
          acc = parse_section(data, data_offset, first, count, widths, acc)
          {acc, data_offset + max(count, 0) * entry_size}
        end)

      {:ok, entries}
    end
  end

  # Reads and validates `:W`. Non-integer elements are rejected instead of
  # being coerced to a guessed width: decoding with a wrong entry size would
  # return plausible-looking but meaningless entries. A zero total width is
  # rejected too, since it would make the per-record division undefined.
  defp fetch_widths(dict) do
    case Map.get(dict, :W, @default_widths) do
      [w1, w2, w3]
      when is_integer(w1) and is_integer(w2) and is_integer(w3) and
             w1 >= 0 and w2 >= 0 and w3 >= 0 and w1 + w2 + w3 > 0 ->
        {:ok, {w1, w2, w3}}

      _ ->
        {:error, :malformed_xref_stream_widths}
    end
  end

  # Returns the `{first, count}` subsection pairs from `:Index`.
  #
  # `:Index` is optional; a malformed scalar (non-list) form falls back to the
  # default full-range subsection. An odd trailing element and pairs holding
  # non-integers are dropped.
  defp index_pairs(dict) do
    size = Map.get(dict, :Size, 0)

    raw_index =
      case Map.get(dict, :Index, [0, size]) do
        list when is_list(list) -> list
        _ -> [0, size]
      end

    for [first, count] <- Enum.chunk_every(raw_index, 2, 2, :discard),
        is_integer(first) and is_integer(count),
        do: {first, count}
  end

  # Decodes up to `count` consecutive records starting at `data_offset`,
  # numbering them `first_id, first_id + 1, ...`. The count is clamped to the
  # number of whole records actually present so truncated data cannot cause
  # out-of-range reads. Later entries overwrite earlier ones with the same key.
  defp parse_section(_data, _data_offset, _first_id, count, _widths, acc) when count <= 0,
    do: acc

  defp parse_section(data, data_offset, first_id, count, {w1, w2, w3} = widths, acc) do
    entry_size = w1 + w2 + w3
    available = max(byte_size(data) - data_offset, 0)

    case min(count, div(available, entry_size)) do
      0 ->
        acc

      usable ->
        Enum.reduce(0..(usable - 1), acc, fn i, entries ->
          {key, entry} = parse_entry(data, data_offset + i * entry_size, first_id + i, widths)
          Map.put(entries, key, entry)
        end)
    end
  end

  # Decodes the single record at `byte_pos` into `{{obj_id, generation}, entry}`.
  defp parse_entry(data, byte_pos, obj_id, {w1, w2, w3}) do
    # A zero-width type field means every record is treated as type 1.
    type = if w1 == 0, do: 1, else: read_field(data, byte_pos, w1)
    field2 = read_field(data, byte_pos + w1, w2)
    field3 = read_field(data, byte_pos + w1 + w2, w3)

    case type do
      0 -> {{obj_id, field3}, :free}
      1 -> {{obj_id, field3}, {:resolved, field2}}
      2 -> {{obj_id, 0}, {:compressed, field2, field3}}
      # Unknown record types are recorded as free rather than failing the parse.
      _ -> {{obj_id, 0}, :free}
    end
  end

  # Reads `width` bytes at `offset` as a big-endian unsigned integer.
  # A zero width, or a range extending past the end of `data`, yields 0.
  defp read_field(_data, _offset, 0), do: 0

  defp read_field(data, offset, width) when offset + width <= byte_size(data) do
    data
    |> binary_part(offset, width)
    |> :binary.decode_unsigned()
  end

  defp read_field(_data, _offset, _width), do: 0
end