defmodule PdfEx.Convert do
@moduledoc """
HTML projections of a document, and the reverse (HTML edit → PDF text ops).
Two modes, both built off the glyph index:
* `:visual` — byte-faithful absolute layout, one `<span data-uid=...>` per
glyph (`y_html = mediabox_height - y_pdf - font_size`).
* `:semantic` — y-band row clustering classified into `<h1>`/`<h2>`/`<li>`/
`<p>` blocks carrying `data-uids` ranges (heuristic; best-effort for
rotated text).
Reverse mapping turns an edited semantic block back into per-run
`PdfEx.Op.UpdateText`s (`semantic_ops/3`) or applies them
(`apply_semantic_mutation/3`). `apply_visual_mutation/3` repositions one
glyph's run; its delta applies to a `Tm` matrix's translation components only
(documented limitation for scaled/rotated matrices).
"""
alias PdfEx.{ContentEdit, Document, Error, Glyphs, Op, PageTree, Resolver}
alias PdfEx.Content.{Number, Tokens}
alias PdfEx.COS.Stream
@positioning_ops [:Td, :TD, :Tm]
@show_ops [:Tj, :TJ, :"'", :"\""]
@doc """
Projects `doc` to an HTML string.

## Options

  * `:mode` - `:visual` (the default) selects the absolutely positioned,
    one-span-per-glyph layout; `:semantic` selects the classified blocks that
    carry `data-uids`.

Any other `:mode` value returns `{:error, Error.new(:unsupported_mode)}`.
"""
@spec to_html(Document.t(), [{:mode, :visual | :semantic}]) ::
        {:ok, binary()} | {:error, Error.t()}
def to_html(%Document{} = doc, opts \\ []) do
  mode = Keyword.get(opts, :mode, :visual)

  cond do
    mode == :visual -> visual_html(doc)
    mode == :semantic -> semantic_html(doc)
    true -> {:error, Error.new(:unsupported_mode)}
  end
end
@doc """
Repositions the span that contains glyph `uid` so the glyph ends up at the
requested `x`/`y`.

The move is expressed as a delta from the glyph's indexed position and is
applied by patching the operands of the nearest preceding Td/TD/Tm operator
in the content stream; the stream is not regenerated, and only that
/Contents object is marked dirty. When the requested position equals the
current one, nothing is patched and `dirty_objects` is left untouched.
"""
@spec apply_visual_mutation(Document.t(), binary(), %{x: number(), y: number()}) ::
        {:ok, Document.t()} | {:error, Error.t()}
def apply_visual_mutation(%Document{} = doc, uid, %{x: new_x, y: new_y}) do
  with {:ok, indexed} <- Glyphs.index(doc),
       {:ok, contents_ref, op_index, glyph} <- lookup(indexed, uid) do
    # Dividing by 1.0 coerces integer targets to floats before subtracting.
    case {new_x / 1.0 - glyph.x, new_y / 1.0 - glyph.y} do
      {dx, dy} when dx == 0.0 and dy == 0.0 ->
        {:ok, indexed}

      {dx, dy} ->
        patch_position(indexed, contents_ref, op_index, dx, dy)
    end
  end
end
# Resolves `uid` to `{:ok, contents_ref, op_index, glyph}`.
#
# The uid must be registered in `doc.uid_index` as a `{:glyph, ...}` entry and
# a glyph with that `id` must exist in one of the per-page lists of
# `doc.glyphs`. If either lookup misses, the result is an `:unknown_uid` error;
# a `nil` glyph is never handed back, because the caller reads `glyph.x` /
# `glyph.y` and would crash on it.
#
# Pages are scanned one at a time and the scan stops at the first match, so
# the glyph lists are not all concatenated up front.
defp lookup(doc, uid) do
  with {:glyph, contents_ref, op_index, _glyph_idx} <- doc.uid_index[uid],
       glyph when glyph != nil <-
         Enum.find_value(doc.glyphs, fn {_page_id, page_glyphs} ->
           Enum.find(page_glyphs, &(&1.id == uid))
         end) do
    {:ok, contents_ref, op_index, glyph}
  else
    _ -> {:error, Error.new(:unknown_uid)}
  end
end
# Shifts, by (dx, dy), the positioning operator that governs show operator
# number `op_index` in the stream behind `contents_ref`.
#
# Only the byte region of that one operator is rewritten; everything before
# and after it is copied through unchanged. The stream's `:Length` entry is
# updated to the new byte size and the object is recorded in `dirty_objects`
# as an `{:update, stream}` before being committed. Any failing step returns
# its error as-is.
defp patch_position(doc, contents_ref, op_index, dx, dy) do
  with {:ok, %Stream{dictionary: dict, raw_bytes: bytes}} <- resolve_stream(doc, contents_ref),
       :ok <- ensure_uncompressed(dict),
       {:ok, pos_op} <- find_positioning_op(Tokens.parse_ops(bytes), op_index) do
    {start, stop} = pos_op.region
    # A leading space keeps the patched operator separated from the
    # token that precedes the region.
    rendered = " " <> render_patched(pos_op, dx, dy)
    prefix = binary_part(bytes, 0, start)
    suffix = binary_part(bytes, stop, byte_size(bytes) - stop)
    patched = prefix <> rendered <> suffix

    patched_stream = %Stream{
      dictionary: Map.put(dict, :Length, byte_size(patched)),
      raw_bytes: patched
    }

    dirty = Map.put(doc.dirty_objects, contents_ref, {:update, patched_stream})
    {:ok, Document.commit_dirty(doc, dirty)}
  end
end
# Resolves `ref` and insists that the result is a stream object; anything
# else (including a resolver error) becomes a `:contents_not_a_stream` error.
defp resolve_stream(doc, ref) do
  with {:ok, %Stream{} = stream} <- Resolver.resolve(doc, ref) do
    {:ok, stream}
  else
    _ -> {:error, Error.new(:contents_not_a_stream)}
  end
end
# Returns `:ok` when the stream dictionary has neither a truthy `:Filter` nor
# a truthy `:F` entry; otherwise returns an `:unsupported_filter` error,
# because the byte-level patch works on the raw stream bytes only.
defp ensure_uncompressed(dict) do
  case dict[:Filter] || dict[:F] do
    absent when absent in [nil, false] ->
      :ok

    _present ->
      {:error, Error.new(:unsupported_filter, "uncompressed streams only")}
  end
end
# Finds the Td/TD/Tm operator that positions the `show_op_index`-th text-show
# operator (counting only operators listed in `@show_ops`, zero-based).
#
# Returns:
#   * `{:ok, pos_op}` - the nearest positioning operator that precedes the
#     show operator inside the same text object, with well-formed operands;
#   * `{:error, :unknown_uid}` - there is no show operator at that index;
#   * `{:error, :unpatchable}` - no positioning operator precedes the show
#     operator in its text object, or its operands are malformed.
#
# The backward search stops at the enclosing `BT`: the text matrix is reset
# when a text object begins, so a Td/TD/Tm in an earlier BT..ET block does
# not position this span, and patching it would move unrelated text instead.
defp find_positioning_op(ops, show_op_index) do
  show_positions =
    ops
    |> Enum.with_index()
    |> Enum.filter(fn {op, _i} -> op.operator in @show_ops end)

  case Enum.at(show_positions, show_op_index) do
    nil ->
      {:error, Error.new(:unknown_uid)}

    {_show_op, ops_idx} ->
      ops
      |> Enum.take(ops_idx)
      |> Enum.reverse()
      # Walk backwards only as far as the start of the current text object.
      |> Enum.take_while(&(&1.operator != :BT))
      |> Enum.find(&(&1.operator in @positioning_ops))
      |> case do
        nil ->
          {:error, Error.new(:unpatchable, "no Td/TD/Tm precedes this span")}

        pos_op ->
          if valid_positioning_operands?(pos_op),
            do: {:ok, pos_op},
            else: {:error, Error.new(:unpatchable, "malformed positioning operands")}
      end
  end
end
# True when a positioning operator carries exactly the numeric operands that
# patching expects: six `{:number, _}` operands for Tm, two for Td/TD.
# Anything else (other operators, other shapes) is rejected.
defp valid_positioning_operands?(%{operator: :Tm, operands: operands}) do
  numeric? = fn
    {:number, _} -> true
    _ -> false
  end

  length(operands) == 6 and Enum.all?(operands, numeric?)
end

defp valid_positioning_operands?(%{operator: kind, operands: operands})
     when kind in [:Td, :TD],
     do: match?([{:number, _}, {:number, _}], operands)

defp valid_positioning_operands?(_other), do: false
# Renders a positioning operator with (dx, dy) added to its translation.
#
# Td/TD: both operands are shifted. Tm: only the last two matrix entries
# (the translation components) are shifted; the first four are emitted
# unchanged. Numbers are rendered through `Number.format/1` and joined with
# single spaces, followed by the operator name.
defp render_patched(%{operator: op, operands: operands}, dx, dy) when op in [:Td, :TD] do
  [{:number, tx}, {:number, ty}] = operands
  Enum.join([Number.format(tx + dx), Number.format(ty + dy), to_string(op)], " ")
end

defp render_patched(%{operator: :Tm, operands: operands}, dx, dy) do
  [{:number, a}, {:number, b}, {:number, c}, {:number, d}, {:number, e}, {:number, f}] =
    operands

  formatted = for value <- [a, b, c, d, e + dx, f + dy], do: Number.format(value)
  Enum.join(formatted, " ") <> " Tm"
end
# Builds the `:visual` projection: one relatively positioned `pdf-page` div
# per page, sized from the page's MediaBox, containing one absolutely
# positioned `<span data-uid=...>` per glyph.
#
# A glyph's `top` is `page height - glyph.y - glyph.font_size`, flipping the
# PDF y coordinate into the HTML top-down axis. Errors from indexing or the
# page walk are returned as `Error` structs; bare atom reasons are wrapped
# with `Error.new/1`.
defp visual_html(doc) do
  with {:ok, indexed} <- Glyphs.index(doc),
       {:ok, pages} <- PageTree.walk_pages(indexed) do
    sections =
      for page <- pages do
        page_height = mediabox_height(page.mediabox)

        spans =
          for glyph <- indexed.glyphs[page.id] || [] do
            top = page_height - glyph.y - glyph.font_size

            ~s(<span data-uid="#{glyph.id}" style="position:absolute;) <>
              ~s(left:#{fmt(glyph.x)}px;top:#{fmt(top)}px;) <>
              ~s(font-size:#{fmt(glyph.font_size)}px">) <>
              escape(glyph.char) <> "</span>"
          end

        page_width = mediabox_width(page.mediabox)

        ~s(<div class="pdf-page" style="position:relative;) <>
          ~s(width:#{fmt(page_width)}px;height:#{fmt(page_height)}px">\n) <>
          Enum.join(spans, "\n") <> "\n</div>"
      end

    {:ok, "<!DOCTYPE html>\n<html><body>\n" <> Enum.join(sections, "\n") <> "\n</body></html>"}
  else
    {:error, reason} when is_atom(reason) -> {:error, Error.new(reason)}
    {:error, %Error{} = error} -> {:error, error}
  end
end
# Builds the `:semantic` projection: one `pdf-page` div per page whose
# content is the classified row blocks produced by `page_blocks/1` for that
# page's glyphs (a page with no indexed glyphs yields an empty div body).
# Errors are normalised the same way as in the visual projection: `Error`
# structs pass through and bare atom reasons are wrapped with `Error.new/1`.
defp semantic_html(doc) do
  with {:ok, indexed} <- Glyphs.index(doc),
       {:ok, pages} <- PageTree.walk_pages(indexed) do
    sections =
      for page <- pages do
        blocks = page_blocks(indexed.glyphs[page.id] || [])
        ~s(<div class="pdf-page">\n) <> blocks <> "\n</div>"
      end

    {:ok, "<!DOCTYPE html>\n<html><body>\n" <> Enum.join(sections, "\n") <> "\n</body></html>"}
  else
    {:error, reason} when is_atom(reason) -> {:error, Error.new(reason)}
    {:error, %Error{} = error} -> {:error, error}
  end
end
# Renders one page's glyphs as newline-separated semantic blocks, one per
# clustered row. The page's median font size is computed once and passed to
# every row so headings can be detected relative to it. An empty page
# renders as the empty string.
defp page_blocks([]), do: ""

defp page_blocks(glyphs) do
  typical_size = median_font_size(glyphs)
  rows = cluster_rows(glyphs)
  Enum.map_join(rows, "\n", fn row -> row_block(row, typical_size) end)
end
# Groups glyphs into rows by y proximity.
#
# Glyphs are visited from highest y to lowest. A glyph joins the open row
# when its y is within 40% of the larger font size (its own or that of the
# glyph most recently added to the row); otherwise the open row is closed
# and the glyph starts a new one. Rows come back top-first, each row in
# visiting order.
defp cluster_rows(glyphs) do
  {closed, open} =
    glyphs
    |> Enum.sort_by(& &1.y, :desc)
    |> Enum.reduce({[], []}, fn glyph, {done, current} ->
      case current do
        [] ->
          {done, [glyph]}

        [latest | _] ->
          tolerance = max(latest.font_size, glyph.font_size) * 0.4

          if abs(latest.y - glyph.y) <= tolerance do
            {done, [glyph | current]}
          else
            {[Enum.reverse(current) | done], [glyph]}
          end
      end
    end)

  # Rows and row members were accumulated by prepending; restore order.
  case open do
    [] -> Enum.reverse(closed)
    _ -> Enum.reverse([Enum.reverse(open) | closed])
  end
end
# Renders one clustered row as a single HTML block element.
#
# The row's glyphs are ordered left to right; the block text is each run's
# characters concatenated, with runs separated by one space. `data-uids`
# lists every glyph id in left-to-right order. The tag is chosen by
# `classify/3` from the text and the row's largest font size (falling back
# to `median` if the row has no glyphs).
defp row_block(row, median) do
  left_to_right = Enum.sort_by(row, fn glyph -> glyph.x end)

  text =
    left_to_right
    |> group_runs()
    |> Enum.map(fn run -> run |> Enum.map(& &1.char) |> Enum.join() end)
    |> Enum.join(" ")

  uid_list = left_to_right |> Enum.map(& &1.id) |> Enum.join(" ")
  largest = Enum.max(Enum.map(left_to_right, & &1.font_size), fn -> median end)
  tag = classify(text, largest, median)

  "<#{tag} data-uids=\"#{uid_list}\">#{escape(text)}</#{tag}>"
end
# Splits a row's glyphs into runs, one per `show_op_index`. Each run is
# sorted by x, and the runs themselves are ordered by their leftmost glyph.
defp group_runs(ordered) do
  ordered
  |> Enum.group_by(fn glyph -> glyph.show_op_index end)
  |> Enum.map(fn {_show_op, members} -> Enum.sort_by(members, fn glyph -> glyph.x end) end)
  # After the per-run sort the head of each run is its leftmost glyph.
  |> Enum.sort_by(fn [leftmost | _] -> leftmost.x end)
end
# Picks the block tag for a row: "h1" when the row's largest font size is at
# least 1.6x the page median, "h2" at 1.3x or more, otherwise "li" when the
# text looks like a list item and "p" for everything else. Heading detection
# is skipped when the median is not positive.
defp classify(text, max_fs, median) do
  case {max_fs, median} do
    {size, typical} when typical > 0 and size >= typical * 1.6 -> "h1"
    {size, typical} when typical > 0 and size >= typical * 1.3 -> "h2"
    _ -> if list_item?(text), do: "li", else: "p"
  end
end
# True when the text, ignoring leading whitespace, starts with a list marker
# followed by whitespace: a bullet (U+2022), a middle dot (U+00B7), `-`, `*`,
# or digits followed by `.` or `)`.
defp list_item?(text) do
  text
  |> String.trim_leading()
  |> String.match?(~r/^(\x{2022}|\x{00B7}|[-*]|\d+[.)])\s/u)
end
# Median of the glyphs' font sizes; for an even count the upper of the two
# middle values is taken. An empty list falls back to 12.0.
defp median_font_size([]), do: 12.0

defp median_font_size(glyphs) do
  sizes = for glyph <- glyphs, do: glyph.font_size
  middle = sizes |> length() |> div(2)
  sizes |> Enum.sort() |> Enum.at(middle)
end
@doc """
Plans, without applying, the `Op.UpdateText`s that change the block
identified by `uids` into `new_text`.

`uids` may be a list of uids or a single space-separated string. The block's
current text is its runs' texts joined with single spaces; `new_text` is
Myers-diffed against it and distributed back over the runs. Only runs whose
text actually changes get an op, addressed by the run's first uid. An
unchanged block yields `{:ok, []}`.

Use `apply_semantic_mutation/3` to apply the plan.
"""
@spec semantic_ops(Document.t(), [binary()] | binary(), String.t()) ::
        {:ok, [Op.UpdateText.t()]} | {:error, Error.t()}
def semantic_ops(%Document{} = doc, uids, new_text) when is_binary(new_text) do
  with {:ok, indexed} <- Glyphs.index(doc),
       {:ok, runs} <- resolve_runs(indexed, normalize_uids(uids)) do
    case Enum.map_join(runs, " ", & &1.text) do
      ^new_text ->
        {:ok, []}

      _changed ->
        replacements = split_new_text(runs, new_text)

        ops =
          for {run, replacement} <- Enum.zip(runs, replacements),
              replacement != run.text do
            %Op.UpdateText{uid: run.first_uid, text: replacement}
          end

        {:ok, ops}
    end
  end
end
@doc """
Computes the `semantic_ops/3` plan for `uids` / `new_text` and applies it to
`doc`.

The ops are applied last run first. Editing a run renumbers only the glyph
UIDs that come after it, so working backwards keeps the `uid` of every op
still to be applied valid (spec D4). Application stops at the first failing
op and returns its error.
"""
@spec apply_semantic_mutation(Document.t(), [binary()] | binary(), String.t()) ::
        {:ok, Document.t()} | {:error, Error.t()}
def apply_semantic_mutation(%Document{} = doc, uids, new_text) when is_binary(new_text) do
  case semantic_ops(doc, uids, new_text) do
    {:ok, ops} ->
      Enum.reduce_while(Enum.reverse(ops), {:ok, doc}, fn op, {:ok, current} ->
        case ContentEdit.replace_text(current, op.uid, op.text) do
          {:ok, _updated} = ok -> {:cont, ok}
          {:error, _} = failure -> {:halt, failure}
        end
      end)

    other ->
      other
  end
end
# Normalises the caller's uid argument to a list of uids.
#
# A list is returned unchanged. A binary is treated as a whitespace-separated
# token list, as found in a `data-uids` attribute: it is split on any run of
# whitespace (spaces, tabs, newlines), and leading/trailing whitespace is
# ignored, so an attribute that was re-wrapped or re-indented by an HTML
# editor still yields clean uids.
defp normalize_uids(uids) when is_list(uids), do: uids
defp normalize_uids(uids) when is_binary(uids), do: String.split(uids)
# Maps a block's uids to its text runs.
#
# Uids that are not glyph entries in `indexed.uid_index` are ignored; the
# rest are grouped by show-operator index. Runs are returned in ascending
# show-operator order as `%{first_uid: uid, text: text}`, where `first_uid`
# is the first uid the caller listed for that run. A run whose text cannot
# be read is kept with empty text rather than failing the whole lookup.
# When no uid resolves to a glyph the result is an `:unknown_uid` error.
defp resolve_runs(indexed, uids) do
  pairs =
    for uid <- uids,
        {:glyph, _ref, op_index, _gidx} <- [indexed.uid_index[uid]] do
      {op_index, uid}
    end

  case Enum.group_by(pairs, &elem(&1, 0), &elem(&1, 1)) do
    none when map_size(none) == 0 ->
      {:error, Error.new(:unknown_uid)}

    by_show_op ->
      runs =
        by_show_op
        |> Enum.sort_by(&elem(&1, 0))
        |> Enum.map(fn {_op_index, run_uids} ->
          first = hd(run_uids)

          text =
            case ContentEdit.run_text(indexed, first) do
              {:ok, found} -> found
              {:error, _} -> ""
            end

          %{first_uid: first, text: text}
        end)

      {:ok, runs}
  end
end
# Distributes `new_text` over `runs`, returning one replacement string per
# run (in run order).
#
# Every grapheme of the current joined text is tagged with its owner (a run
# index, or a separator pointing at the next run). The Myers diff between the
# joined text and `new_text` is then replayed against that owner list while
# tracking a "current run" cursor:
#   * kept graphemes go to their owning run and move the cursor there; a
#     kept separator is dropped and moves the cursor to the next run;
#   * deleted graphemes are skipped without moving the cursor;
#   * inserted graphemes are appended to the run under the cursor.
# Runs that receive nothing come back as "".
defp split_new_text(runs, new_text) do
  owners = build_owners(runs)
  joined = Enum.map_join(owners, "", &elem(&1, 0))

  {per_run, _cursor, _unconsumed} =
    joined
    |> String.myers_difference(new_text)
    |> Enum.reduce({%{}, 0, owners}, fn {kind, segment}, state ->
      apply_diff_segment(kind, String.graphemes(segment), state)
    end)

  for idx <- 0..(length(runs) - 1), do: Map.get(per_run, idx, "")
end

# Replays one diff segment against the `{buffers, cursor, remaining_owners}`
# state used by `split_new_text/2`.
defp apply_diff_segment(:eq, graphemes, state) do
  Enum.reduce(graphemes, state, fn _kept, {bufs, _cursor, [{g, owner} | tail]} ->
    case owner do
      {:sep, next_run} -> {bufs, next_run, tail}
      run_idx -> {Map.update(bufs, run_idx, g, &(&1 <> g)), run_idx, tail}
    end
  end)
end

defp apply_diff_segment(:del, graphemes, {bufs, cursor, remaining}) do
  {bufs, cursor, Enum.drop(remaining, length(graphemes))}
end

defp apply_diff_segment(:ins, graphemes, {bufs, cursor, remaining}) do
  grown =
    Enum.reduce(graphemes, bufs, fn g, acc -> Map.update(acc, cursor, g, &(&1 <> g)) end)

  {grown, cursor, remaining}
end
# Flattens the runs into a list of `{grapheme, owner}` pairs. A run's own
# graphemes are owned by its index; between consecutive runs a single
# `{" ", {:sep, next_index}}` entry stands for the joining space.
defp build_owners(runs) do
  final_idx = length(runs) - 1

  runs
  |> Enum.with_index()
  |> Enum.map(fn {run, idx} ->
    owned = for g <- String.graphemes(run.text), do: {g, idx}
    separator = if idx < final_idx, do: [{" ", {:sep, idx + 1}}], else: []
    owned ++ separator
  end)
  |> Enum.concat()
end
# Page height taken from a MediaBox `[x0, y0, x1, y1]`.
#
# MediaBox elements are stored verbatim, so a non-numeric/indirect element
# (e.g. /MediaBox [0 0 612 5 0 R]) must fall back to the default page size
# rather than raise out of the public to_html/2.
#
# A PDF rectangle may name any two diagonally opposite corners, so y1 can be
# below y0; the absolute difference keeps the height non-negative in that
# case instead of emitting a negative CSS height.
defp mediabox_height([_x0, y0, _x1, y1]) when is_number(y0) and is_number(y1),
  do: abs(y1 - y0)

defp mediabox_height(_), do: 792
# Page width taken from a MediaBox `[x0, y0, x1, y1]`. Falls back to 612 when
# the box is missing, malformed, or has a non-numeric x element. The corners
# may be given in either order, so the absolute difference is used to keep
# the width non-negative.
defp mediabox_width([x0, _y0, x1, _y1]) when is_number(x0) and is_number(x1),
  do: abs(x1 - x0)

defp mediabox_width(_), do: 612
# Formats a number for a CSS length: integers print as plain decimal digits,
# floats go through `Number.format/1`.
defp fmt(int) when is_integer(int) do
  "#{int}"
end

defp fmt(float) when is_float(float) do
  Number.format(float)
end
# Escapes text for use as HTML element content by replacing `&`, `<` and `>`
# with their character entities. `&` is replaced first so the ampersands
# introduced by the later replacements are not escaped a second time.
defp escape(text) do
  text
  |> String.replace("&", "&amp;")
  |> String.replace("<", "&lt;")
  |> String.replace(">", "&gt;")
end
end