defmodule PdfEx.Serializer do
@moduledoc """
Serializes a `PdfEx.Document` back to PDF bytes, in one of two modes.
* `:incremental` (default) — **lossless**: edits are appended as a PDF
incremental update (matching the source's classic-table or xref-stream
style), so the original bytes are a byte-for-byte prefix of the output.
A document with no pending edits serializes back to its source verbatim.
* `:full` — rewrites a single clean revision (live objects renumbered,
references rewritten, `/ObjStm` and `/XRef` containers dropped, one xref,
no `/Prev`). Explicitly **not** byte-lossless; use it to compact away
accumulated revisions.
"""
alias PdfEx.{Document, Resolver, Telemetry}
alias PdfEx.COS.{HexString, Reference, Stream}
alias PdfEx.COS.Serializer, as: COS
alias PdfEx.XRef.{Builder, StreamBuilder}
@doc """
Serializes `doc` to PDF bytes.

`opts` may carry `:mode`: `:incremental` (the default) or `:full`; the module
doc describes both. The work runs inside a `:telemetry.span/3` whose start
metadata records the number of dirty objects and the selected mode.
"""
@spec serialize(Document.t(), [{:mode, :incremental | :full}]) :: binary()
def serialize(%Document{} = doc, opts \\ []) do
  mode = Keyword.get(opts, :mode, :incremental)
  event = Telemetry.serialize()
  start_metadata = %{dirty: map_size(doc.dirty_objects), mode: mode}

  :telemetry.span(event, start_metadata, fn ->
    bytes = serialize_mode(mode, doc, opts)
    # The span returns `bytes`; no extra stop metadata is attached.
    {bytes, %{}}
  end)
end
# Dispatches on the requested mode: `:full` rewrites a single clean revision,
# every other value takes the incremental path.
defp serialize_mode(mode, doc, opts) do
  case mode do
    :full -> full_serialize(doc)
    _incremental -> do_serialize(doc, opts)
  end
end
# Incremental mode, nothing to write: the source is returned untouched, so a
# document without pending edits round-trips byte-for-byte.
defp do_serialize(%Document{dirty_objects: d} = doc, _opts) when map_size(d) == 0,
  do: doc.source

# Incremental mode with pending edits. Appends, after the original bytes:
#   1. every updated dirty object, in ascending object-id order,
#   2. a cross-reference section in the style named by `doc.xref_style`
#      (`:stream` -> xref stream object, anything else -> classic table),
#   3. `startxref` / `%%EOF`.
# Returns the complete output binary.
defp do_serialize(%Document{} = doc, _opts) do
  # Appended objects must begin on a fresh line; add an EOL only when the
  # source does not already end with one.
  source =
    if String.ends_with?(doc.source, ["\n", "\r"]),
      do: doc.source,
      else: doc.source <> "\n"

  start = byte_size(source)

  # `offsets` maps each dirty {id, gen} either to its absolute byte offset in
  # the output or to `:free`; freed objects contribute no bytes to the body.
  {objects_bin, offsets} =
    doc.dirty_objects
    |> Enum.sort_by(fn {{id, _g}, _v} -> id end)
    |> Enum.reduce({<<>>, %{}}, fn {{id, gen} = key, entry}, {bin, omap} ->
      case entry do
        :free ->
          {bin, Map.put(omap, key, :free)}

        {:update, value} ->
          offset = start + byte_size(bin)
          obj = "#{id} #{gen} obj\n#{COS.serialize_val(value)}\nendobj\n"
          {bin <> obj, Map.put(omap, key, offset)}
      end
    end)

  # A missing or malformed /Size in the source trailer is treated as 0, so the
  # dirty ids alone determine the new size.
  original_size =
    case doc.trailer[:Size] do
      s when is_integer(s) and s >= 0 -> s
      _ -> 0
    end

  # Safe to call Enum.max/1: the empty-dirty case is handled by the clause above.
  highest_dirty_id =
    doc.dirty_objects |> Map.keys() |> Enum.map(fn {id, _g} -> id end) |> Enum.max()

  new_size = max(original_size, highest_dirty_id + 1)
  prev = doc.revision_xref_offset

  # TODO(roadmap): once encrypted-document editing is supported (open/1
  # currently rejects /Encrypt), the appended trailer must carry /Encrypt
  # forward — and any other producer-required keys (e.g. /XRefStm for hybrid
  # files) — or the final revision becomes unreadable. Today's Map.take is
  # safe only because such docs never reach serialization.
  trailer =
    doc.trailer
    |> Map.take([:Root, :Info])
    |> put_id(doc.trailer[:ID])
    |> Map.put(:Size, new_size)
    |> maybe_put_prev(prev)

  case doc.xref_style do
    :stream ->
      # The xref stream is an indirect object too; it takes the first id past
      # every object known so far.
      xref_obj_id = new_size
      xref_offset = start + byte_size(objects_bin)
      full = Map.put(offsets, {xref_obj_id, 0}, xref_offset)

      # /Size must be one greater than the highest object number in use, and
      # that number is now the xref stream's own id. Passing `new_size` here
      # would leave the stream's entry outside the declared range.
      # NOTE(review): assumes StreamBuilder uses the trailer's :Size as given.
      stream_trailer = Map.put(trailer, :Size, xref_obj_id + 1)

      stream = StreamBuilder.build_xref_stream(full, xref_obj_id, stream_trailer)
      obj = "#{xref_obj_id} 0 obj\n#{COS.serialize_val(stream)}\nendobj\n"
      source <> objects_bin <> obj <> "startxref\n#{xref_offset}\n%%EOF\n"

    _classic ->
      xref_offset = start + byte_size(objects_bin)
      table = Builder.build_classic_table(offsets)
      trailer_bin = "trailer\n#{COS.serialize_val(trailer)}\nstartxref\n#{xref_offset}\n%%EOF\n"
      source <> objects_bin <> table <> trailer_bin
  end
end
# Full mode: writes a fresh single-revision file consisting of a header, every
# live object renumbered 1..n (generation 0) with its references rewritten,
# one classic xref table, and a trailer.
defp full_serialize(%Document{} = doc) do
  # Version line taken from the source, followed by a comment line of four
  # 0xFF bytes.
  header = "%PDF-#{pdf_version(doc.source)}\n%\xFF\xFF\xFF\xFF\n"
  body_start = byte_size(header)

  live = collect_live(doc)
  remap = build_remap(live)

  # Walk the live objects in order, appending each serialized object and
  # recording the absolute offset at which it begins.
  {body, offsets} =
    live
    |> Enum.with_index(1)
    |> Enum.reduce({<<>>, %{}}, fn {{_old_ref, value}, new_id}, {acc, seen} ->
      serialized = COS.serialize_val(remap_value(value, remap))
      chunk = "#{new_id} 0 obj\n#{serialized}\nendobj\n"
      {acc <> chunk, Map.put(seen, {new_id, 0}, body_start + byte_size(acc))}
    end)

  xref_offset = body_start + byte_size(body)

  # Object 0 / generation 65535 is entered as the free entry.
  table =
    offsets
    |> Map.put({0, 65535}, :free)
    |> Builder.build_classic_table()

  # New ids run 1..length(live), so the object count equals the highest id.
  trailer_dict = full_trailer(doc, remap, length(live))
  tail = "trailer\n#{COS.serialize_val(trailer_dict)}\nstartxref\n#{xref_offset}\n%%EOF\n"

  header <> body <> table <> tail
end
# Returns `[{{id, gen}, value}]` for every reference known to the xref map or
# the dirty set that resolves successfully and is not an /ObjStm or /XRef
# container, ordered by ascending object id.
defp collect_live(doc) do
  candidate_refs =
    (Map.keys(doc.xref_map) ++ Map.keys(doc.dirty_objects))
    |> Enum.uniq()
    |> Enum.sort_by(fn {id, _gen} -> id end)

  # The pattern generator silently skips anything that does not resolve to
  # `{:ok, value}`.
  for ref <- candidate_refs,
      {:ok, value} <- [Resolver.resolve(doc, ref)],
      not container?(value) do
    {ref, value}
  end
end
# True for streams whose dictionary /Type is :ObjStm or :XRef, the container
# objects that full mode does not re-emit. Everything else is false.
defp container?(%Stream{dictionary: dict}) do
  case dict[:Type] do
    :ObjStm -> true
    :XRef -> true
    _ -> false
  end
end

defp container?(_), do: false
# Maps each live object's old `{id, gen}` to its new reference: ids are handed
# out 1..n in list order, always with generation 0.
defp build_remap(live) do
  numbered = Enum.with_index(live, 1)

  Enum.into(numbered, %{}, fn {{old_ref, _value}, new_id} ->
    {old_ref, %Reference{id: new_id, gen: 0}}
  end)
end
# Builds the trailer for a full rewrite: /Size is `count + 1` (the extra slot
# is object 0), /Root and /Info are carried over with remapped references when
# the source trailer holds references for them, and /ID is re-attached by
# `put_id/2`.
defp full_trailer(doc, remap, count) do
  base = %{Size: count + 1}

  with_refs =
    Enum.reduce([:Root, :Info], base, fn key, acc ->
      maybe_put_ref(acc, key, doc.trailer[key], remap)
    end)

  put_id(with_refs, doc.trailer[:ID])
end
# Stores the remapped reference under `key` when the given value is a
# `Reference`; any other value (including nil) leaves `map` unchanged.
defp maybe_put_ref(map, key, value, remap) do
  case value do
    %Reference{} = ref -> Map.put(map, key, remap_value(ref, remap))
    _other -> map
  end
end
# Recursively rewrites every `Reference` inside a COS value to its new id.
#
# A reference whose target was dropped from the live set (unresolvable object,
# or a container we don't re-emit) becomes `null` rather than a dangling ref
# into the old, now-nonexistent numbering space.
defp remap_value(%Reference{id: id, gen: gen}, remap), do: Map.get(remap, {id, gen})

# Only the stream dictionary can hold references. Update that one field in
# place so every other field of the struct is carried over unchanged, instead
# of rebuilding the struct and resetting the remaining fields to defaults.
defp remap_value(%Stream{dictionary: dict} = stream, remap),
  do: %{stream | dictionary: remap_value(dict, remap)}

# Any other struct is passed through as-is; this clause must precede the
# plain-map clause because structs are maps too.
defp remap_value(%_{} = struct, _remap), do: struct

defp remap_value(m, remap) when is_map(m),
  do: Map.new(m, fn {k, v} -> {k, remap_value(v, remap)} end)

defp remap_value(l, remap) when is_list(l), do: Enum.map(l, &remap_value(&1, remap))
defp remap_value(other, _remap), do: other
# Extracts the version string that follows `%PDF-` at the very start of the
# source: the longest leading run of ASCII digits and dots. Falls back to
# "1.7" when the header is missing or carries no version characters.
#
# Only the leading bytes are inspected; the rest of the (potentially very
# large) source is never converted or copied.
defp pdf_version(<<"%PDF-", rest::binary>>) do
  case version_prefix_length(rest, 0) do
    0 -> "1.7"
    len -> binary_part(rest, 0, len)
  end
end

defp pdf_version(_), do: "1.7"

# Counts how many leading bytes of the binary are digits or dots.
defp version_prefix_length(<<c, tail::binary>>, len) when c in ?0..?9 or c == ?.,
  do: version_prefix_length(tail, len + 1)

defp version_prefix_length(_rest, len), do: len
# Adds /ID to `trailer` when the given id is a pair of binaries, storing each
# half as a `HexString` holding its Base16 encoding. Any other id shape leaves
# the trailer untouched.
defp put_id(trailer, [first, second]) when is_binary(first) and is_binary(second) do
  hex_ids = Enum.map([first, second], &%HexString{value: Base.encode16(&1)})
  Map.put(trailer, :ID, hex_ids)
end

defp put_id(trailer, _no_id), do: trailer
# /Prev has to be a byte offset. When no integer offset is available the key is
# omitted entirely, since `/Prev null` would make the xref chain unparseable.
defp maybe_put_prev(trailer, prev) do
  if is_integer(prev) do
    Map.put(trailer, :Prev, prev)
  else
    trailer
  end
end
end