# lib/pdf_ex/serializer.ex

defmodule PdfEx.Serializer do
  @moduledoc """
  Serializes a `PdfEx.Document` back to PDF bytes, in one of two modes.

    * `:incremental` (default) — **lossless**: edits are appended as a PDF
      incremental update (matching the source's classic-table or xref-stream
      style), so the original bytes are a byte-for-byte prefix of the output.
      A document with no pending edits serializes back to its source verbatim.
    * `:full` — rewrites a single clean revision (live objects renumbered,
      references rewritten, `/ObjStm` and `/XRef` containers dropped, one xref,
      no `/Prev`). Explicitly **not** byte-lossless; use it to compact away
      accumulated revisions.
  """

  alias PdfEx.{Document, Resolver, Telemetry}
  alias PdfEx.COS.{HexString, Reference, Stream}
  alias PdfEx.COS.Serializer, as: COS
  alias PdfEx.XRef.{Builder, StreamBuilder}

  # Header version written by `:full` mode when the source has no parsable `%PDF-x.y` header.
  @fallback_version "1.7"

  # Key of the object-0 entry, which `:full` mode always writes as free.
  @object_zero {0, 65_535}

  @doc """
  Serializes `doc` to PDF bytes.

  ## Options

    * `:mode` — `:incremental` (default) or `:full`; see the module doc. Any
      value other than `:full` takes the incremental path.

  The work runs inside `:telemetry.span/3` under the `Telemetry.serialize()`
  event prefix; the start metadata carries the number of dirty objects and the
  requested mode.
  """
  @spec serialize(Document.t(), [{:mode, :incremental | :full}]) :: binary()
  def serialize(%Document{} = doc, opts \\ []) do
    mode = Keyword.get(opts, :mode, :incremental)

    :telemetry.span(
      Telemetry.serialize(),
      %{dirty: map_size(doc.dirty_objects), mode: mode},
      fn -> {serialize_mode(mode, doc), %{}} end
    )
  end

  defp serialize_mode(:full, doc), do: full_serialize(doc)
  defp serialize_mode(_incremental, doc), do: incremental_serialize(doc)

  ## Incremental mode

  # No pending edits: return the source untouched so the round trip is byte-identical.
  defp incremental_serialize(%Document{dirty_objects: dirty} = doc) when map_size(dirty) == 0,
    do: doc.source

  # Appends the dirty objects, then a new xref section in the document's own
  # style, after the (EOL-terminated) source bytes.
  defp incremental_serialize(%Document{} = doc) do
    source = ensure_trailing_eol(doc.source)
    start = byte_size(source)

    {objects_bin, offsets} = append_dirty_objects(doc.dirty_objects, start)
    xref_offset = start + byte_size(objects_bin)
    next_id = next_object_number(doc)

    case doc.xref_style do
      :stream ->
        # The xref stream is itself written as indirect object `next_id`: it
        # gets its own entry, and the /Size it carries must be one past its
        # own object number.
        entries = Map.put(offsets, {next_id, 0}, xref_offset)
        trailer = incremental_trailer(doc, next_id + 1)
        stream = StreamBuilder.build_xref_stream(entries, next_id, trailer)

        source <>
          objects_bin <>
          indirect_object(next_id, 0, stream) <> "startxref\n#{xref_offset}\n%%EOF\n"

      _classic ->
        table = Builder.build_classic_table(offsets)
        trailer = incremental_trailer(doc, next_id)

        source <>
          objects_bin <>
          table <>
          "trailer\n#{COS.serialize_val(trailer)}\nstartxref\n#{xref_offset}\n%%EOF\n"
    end
  end

  # The appended section must start on a fresh line; only add an EOL when the
  # source does not already end with one.
  defp ensure_trailing_eol(source) do
    if String.ends_with?(source, ["\n", "\r"]), do: source, else: source <> "\n"
  end

  # Serializes every `{:update, value}` entry in `{id, gen}` order and records
  # its absolute byte offset (`start` + position in the appended bytes).
  # `:free` entries emit no bytes and are recorded as `:free`.
  # Returns `{appended_bytes, %{{id, gen} => offset | :free}}`.
  defp append_dirty_objects(dirty, start) do
    dirty
    |> Enum.sort_by(&elem(&1, 0))
    |> Enum.reduce({<<>>, %{}}, fn {{id, gen} = key, entry}, {bin, offsets} ->
      case entry do
        :free ->
          {bin, Map.put(offsets, key, :free)}

        {:update, value} ->
          offset = start + byte_size(bin)
          {bin <> indirect_object(id, gen, value), Map.put(offsets, key, offset)}
      end
    end)
  end

  # First object number guaranteed to be unused: the larger of the source
  # trailer's /Size and one past the highest id known from the xref map or the
  # pending edits. Consulting the xref map matters when /Size is missing or
  # understated — otherwise the new trailer would advertise a /Size below
  # existing objects, and in xref-stream mode the stream would be assigned an
  # object number that is already taken.
  defp next_object_number(%Document{} = doc) do
    declared =
      case doc.trailer[:Size] do
        size when is_integer(size) and size >= 0 -> size
        _ -> 0
      end

    # Only reached with at least one dirty object, so the id list is never empty.
    highest_known_id =
      [doc.xref_map, doc.dirty_objects]
      |> Enum.flat_map(&Map.keys/1)
      |> Enum.map(fn {id, _gen} -> id end)
      |> Enum.max()

    max(declared, highest_known_id + 1)
  end

  # TODO(roadmap): once encrypted-document editing is supported (open/1
  # currently rejects /Encrypt), the appended trailer must carry /Encrypt
  # forward — and any other producer-required keys (e.g. /XRefStm for hybrid
  # files) — or the final revision becomes unreadable. Today's Map.take is
  # safe only because such docs never reach serialization.
  defp incremental_trailer(%Document{trailer: source_trailer} = doc, size) do
    source_trailer
    |> Map.take([:Root, :Info])
    |> put_id(source_trailer[:ID])
    |> Map.put(:Size, size)
    |> maybe_put_prev(doc.revision_xref_offset)
  end

  ## Full mode

  # Rewrites the document as one revision: header, live objects renumbered
  # 1..n (generation 0), a single classic xref table, and a fresh trailer.
  defp full_serialize(%Document{} = doc) do
    header = "%PDF-#{pdf_version(doc.source)}\n%\xFF\xFF\xFF\xFF\n"
    header_size = byte_size(header)

    live = collect_live(doc)
    remap = build_remap(live)
    initial = {<<>>, %{@object_zero => :free}}

    {body, offsets} =
      live
      |> Enum.with_index(1)
      |> Enum.reduce(initial, fn {{_old_ref, value}, new_id}, {bin, acc} ->
        offset = header_size + byte_size(bin)
        obj = indirect_object(new_id, 0, remap_value(value, remap))
        {bin <> obj, Map.put(acc, {new_id, 0}, offset)}
      end)

    xref_offset = header_size + byte_size(body)
    table = Builder.build_classic_table(offsets)
    trailer = full_trailer(doc, remap, length(live))

    header <>
      body <>
      table <> "trailer\n#{COS.serialize_val(trailer)}\nstartxref\n#{xref_offset}\n%%EOF\n"
  end

  # Every object known to the xref map or the pending edits that resolves
  # successfully and is not an /ObjStm or /XRef container, as `{old_key, value}`
  # pairs in `{id, gen}` order.
  defp collect_live(doc) do
    (Map.keys(doc.xref_map) ++ Map.keys(doc.dirty_objects))
    |> Enum.uniq()
    |> Enum.sort()
    |> Enum.flat_map(fn ref ->
      case Resolver.resolve(doc, ref) do
        {:ok, value} -> if container?(value), do: [], else: [{ref, value}]
        _ -> []
      end
    end)
  end

  # Object streams and xref streams are not re-emitted by `:full` mode.
  defp container?(%Stream{dictionary: dict}), do: dict[:Type] in [:ObjStm, :XRef]
  defp container?(_), do: false

  # Maps each old `{id, gen}` key to its new reference; numbering follows the
  # position in `live`, starting at 1, always generation 0.
  defp build_remap(live) do
    live
    |> Enum.with_index(1)
    |> Map.new(fn {{old_ref, _value}, new_id} -> {old_ref, %Reference{id: new_id, gen: 0}} end)
  end

  # `count` is the number of emitted objects; /Size additionally covers object 0.
  defp full_trailer(doc, remap, count) do
    %{Size: count + 1}
    |> maybe_put_ref(:Root, doc.trailer[:Root], remap)
    |> maybe_put_ref(:Info, doc.trailer[:Info], remap)
    |> put_id(doc.trailer[:ID])
  end

  # Copies a trailer entry only when it is an indirect reference, rewritten
  # into the new numbering.
  defp maybe_put_ref(map, key, %Reference{} = ref, remap),
    do: Map.put(map, key, remap_value(ref, remap))

  defp maybe_put_ref(map, _key, _other, _remap), do: map

  # A reference whose target was dropped from the live set (unresolvable object,
  # or a container we don't re-emit) becomes `null` rather than a dangling ref
  # into the old, now-nonexistent numbering space.
  defp remap_value(%Reference{id: id, gen: gen}, remap), do: Map.get(remap, {id, gen})

  # Only the dictionary can hold references; struct-update syntax keeps every
  # other field of the stream exactly as it was.
  defp remap_value(%Stream{dictionary: dict} = stream, remap),
    do: %Stream{stream | dictionary: remap_value(dict, remap)}

  # Any other struct is passed through untouched.
  defp remap_value(%_{} = struct, _remap), do: struct

  defp remap_value(map, remap) when is_map(map),
    do: Map.new(map, fn {key, value} -> {key, remap_value(value, remap)} end)

  defp remap_value(list, remap) when is_list(list), do: Enum.map(list, &remap_value(&1, remap))

  defp remap_value(other, _remap), do: other

  # Reads the run of digits and dots that follows `%PDF-` at the very start of
  # the source; falls back to the default when the header is absent or empty.
  defp pdf_version(<<"%PDF-", rest::binary>>) do
    version =
      rest
      |> :binary.bin_to_list()
      |> Enum.take_while(&(&1 in ?0..?9 or &1 == ?.))
      |> List.to_string()

    if version == "", do: @fallback_version, else: version
  end

  defp pdf_version(_), do: @fallback_version

  ## Shared helpers

  # Frames a serialized value as an indirect object: `id gen obj … endobj`.
  defp indirect_object(id, gen, value),
    do: "#{id} #{gen} obj\n#{COS.serialize_val(value)}\nendobj\n"

  # Carries /ID over as two hex strings when the source trailer holds a pair of
  # binaries; any other shape leaves the trailer without /ID.
  defp put_id(trailer, [a, b]) when is_binary(a) and is_binary(b) do
    Map.put(trailer, :ID, [
      %HexString{value: Base.encode16(a)},
      %HexString{value: Base.encode16(b)}
    ])
  end

  defp put_id(trailer, _), do: trailer

  # /Prev must be a byte offset; never emit `/Prev null` (unparseable chain).
  defp maybe_put_prev(trailer, prev) when is_integer(prev), do: Map.put(trailer, :Prev, prev)
  defp maybe_put_prev(trailer, _), do: trailer
end