Skip to main content

lib/pdf_ex/convert.ex

defmodule PdfEx.Convert do
  @moduledoc """
  HTML projections of a document, and the reverse (HTML edit → PDF text ops).

  Two modes, both built off the glyph index:

    * `:visual` — byte-faithful absolute layout, one `<span data-uid=...>` per
      glyph (`y_html = mediabox_height - y_pdf - font_size`).
    * `:semantic` — y-band row clustering classified into `<h1>`/`<h2>`/`<li>`/
      `<p>` blocks carrying `data-uids` ranges (heuristic; best-effort for
      rotated text).

  Reverse mapping turns an edited semantic block back into per-run
  `PdfEx.Op.UpdateText`s (`semantic_ops/3`) or applies them
  (`apply_semantic_mutation/3`). `apply_visual_mutation/3` repositions one
  glyph's run; its delta applies to a `Tm` matrix's translation components only
  (documented limitation for scaled/rotated matrices).
  """

  alias PdfEx.{ContentEdit, Document, Error, Glyphs, Op, PageTree, Resolver}
  alias PdfEx.Content.{Number, Tokens}
  alias PdfEx.COS.Stream

  @positioning_ops [:Td, :TD, :Tm]
  @show_ops [:Tj, :TJ, :"'", :"\""]

  @doc """
  Projects `doc` to an HTML string.

  ## Options

    * `:mode` — `:visual` (the default) produces the absolutely positioned,
      one-`<span data-uid=...>`-per-glyph layout; `:semantic` produces
      classified blocks that carry `data-uids`.

  Any other `:mode` value returns `{:error, error}` where `error` is built
  with `Error.new(:unsupported_mode)`.
  """
  @spec to_html(Document.t(), [{:mode, :visual | :semantic}]) ::
          {:ok, binary()} | {:error, Error.t()}
  def to_html(%Document{} = doc, opts \\ []) do
    mode = Keyword.get(opts, :mode, :visual)

    cond do
      mode == :visual -> visual_html(doc)
      mode == :semantic -> semantic_html(doc)
      true -> {:error, Error.new(:unsupported_mode)}
    end
  end

  @doc """
  Repositions the run containing glyph `uid` so that the glyph ends up at the
  requested `x`/`y`.

  The document is glyph-indexed first. The difference between the requested
  position and the glyph's indexed position is then added to the operands of
  the nearest Td/TD/Tm preceding the glyph's show operator; only the affected
  bytes of the content stream are rewritten, and only that /Contents object is
  recorded as dirty.

  When the requested position equals the current one, nothing is patched and
  the indexed document is returned with its `dirty_objects` untouched.
  """
  @spec apply_visual_mutation(Document.t(), binary(), %{x: number(), y: number()}) ::
          {:ok, Document.t()} | {:error, Error.t()}
  def apply_visual_mutation(%Document{} = doc, uid, %{x: new_x, y: new_y}) do
    with {:ok, indexed} <- Glyphs.index(doc),
         {:ok, contents_ref, op_index, glyph} <- lookup(indexed, uid) do
      # Dividing by 1.0 coerces integer coordinates to floats before subtracting.
      case {new_x / 1.0 - glyph.x, new_y / 1.0 - glyph.y} do
        {dx, dy} when dx == 0.0 and dy == 0.0 ->
          {:ok, indexed}

        {dx, dy} ->
          patch_position(indexed, contents_ref, op_index, dx, dy)
      end
    end
  end

  # Resolves `uid` through `doc.uid_index` and fetches the matching glyph from
  # `doc.glyphs` (a map whose values are lists of glyphs).
  #
  # Returns `{:ok, contents_ref, op_index, glyph}`. A uid that is absent from
  # the index, is not a `:glyph` entry, or is indexed but has no glyph in
  # `doc.glyphs` yields the `:unknown_uid` error, so callers never receive a
  # `nil` glyph (which would raise on `glyph.x`).
  defp lookup(doc, uid) do
    with {:glyph, contents_ref, op_index, _glyph_idx} <- doc.uid_index[uid],
         glyph when not is_nil(glyph) <-
           # Stops at the first page whose glyph list contains the uid instead
           # of flattening every page's glyphs up front.
           Enum.find_value(doc.glyphs, fn {_page_id, glyphs} ->
             Enum.find(glyphs, &(&1.id == uid))
           end) do
      {:ok, contents_ref, op_index, glyph}
    else
      _ -> {:error, Error.new(:unknown_uid)}
    end
  end

  # Shifts the positioning operator that governs show operator `op_index` in
  # the content stream `contents_ref` by `dx`/`dy`.
  #
  # Only the byte region of that operator (`pos_op.region`, a `{start, stop}`
  # pair) is replaced; every other byte of the stream is copied through. The
  # rebuilt stream gets an updated `:Length` and is registered under
  # `contents_ref` in the document's dirty objects as `{:update, stream}`.
  #
  # Errors from resolving the stream, from a filtered stream, or from the
  # positioning-operator search are returned unchanged.
  defp patch_position(doc, contents_ref, op_index, dx, dy) do
    with {:ok, %Stream{dictionary: dict, raw_bytes: bytes}} <- resolve_stream(doc, contents_ref),
         :ok <- ensure_uncompressed(dict),
         {:ok, pos_op} <- find_positioning_op(Tokens.parse_ops(bytes), op_index) do
      {start, stop} = pos_op.region
      rendered = render_patched(pos_op, dx, dy)

      prefix = binary_part(bytes, 0, start)
      suffix = binary_part(bytes, stop, byte_size(bytes) - stop)
      # The leading space keeps the rewritten operands separated from whatever
      # token precedes the region.
      patched = IO.iodata_to_binary([prefix, " ", rendered, suffix])

      updated_stream = %Stream{
        dictionary: Map.put(dict, :Length, byte_size(patched)),
        raw_bytes: patched
      }

      doc.dirty_objects
      |> Map.put(contents_ref, {:update, updated_stream})
      |> then(&{:ok, Document.commit_dirty(doc, &1)})
    end
  end

  # Resolves `ref` and insists that the result is a stream object; any other
  # outcome (a resolver error or a non-stream object) is reported as
  # `:contents_not_a_stream`.
  defp resolve_stream(doc, ref) do
    with {:ok, %Stream{} = stream} <- Resolver.resolve(doc, ref) do
      {:ok, stream}
    else
      _ -> {:error, Error.new(:contents_not_a_stream)}
    end
  end

  # Returns `:ok` when the stream dictionary has neither a `:Filter` nor an
  # `:F` entry; otherwise an `:unsupported_filter` error, because the byte
  # patching in this module works on the stream's raw bytes.
  defp ensure_uncompressed(dict) do
    case dict[:Filter] || dict[:F] do
      absent when absent in [nil, false] ->
        :ok

      _present ->
        {:error, Error.new(:unsupported_filter, "uncompressed streams only")}
    end
  end

  # Finds the positioning operator (Td/TD/Tm) that governs the
  # `show_op_index`-th text-showing operator (Tj/TJ/'/") in `ops`.
  #
  # Returns:
  #   * `{:ok, pos_op}` — the nearest preceding positioning operator, provided
  #     its operands are well-formed numbers;
  #   * `{:error, :unknown_uid}` — there is no show operator at that index;
  #   * `{:error, :unpatchable}` — no positioning operator precedes the show
  #     operator inside its own text object, or its operands are malformed.
  #
  # The backward search stops at the enclosing `BT`: a text object starts with
  # a fresh text matrix, so a Td/TD/Tm from an earlier BT…ET block does not
  # position this span, and patching it would move unrelated text.
  # NOTE(review): assumes `Tokens.parse_ops/1` reports the begin-text operator
  # as the atom `:BT`, like the atom operators in `@show_ops` — confirm.
  defp find_positioning_op(ops, show_op_index) do
    show_positions =
      ops
      |> Enum.with_index()
      |> Enum.filter(fn {op, _i} -> op.operator in @show_ops end)

    case Enum.at(show_positions, show_op_index) do
      nil ->
        {:error, Error.new(:unknown_uid)}

      {_show_op, ops_idx} ->
        ops
        |> Enum.take(ops_idx)
        |> Enum.reverse()
        # Only operators after the enclosing BT can affect this show operator.
        |> Enum.take_while(&(&1.operator != :BT))
        |> Enum.find(&(&1.operator in @positioning_ops))
        |> case do
          nil ->
            {:error, Error.new(:unpatchable, "no Td/TD/Tm precedes this span")}

          pos_op ->
            if valid_positioning_operands?(pos_op),
              do: {:ok, pos_op},
              else: {:error, Error.new(:unpatchable, "malformed positioning operands")}
        end
    end
  end

  # True when a positioning operator carries exactly the numeric operands that
  # `render_patched/3` destructures: two `{:number, _}` operands for Td/TD, six
  # for Tm. Every other operator or operand shape is rejected.
  defp valid_positioning_operands?(%{operator: op, operands: [{:number, _}, {:number, _}]})
       when op == :Td or op == :TD do
    true
  end

  defp valid_positioning_operands?(%{operator: :Tm, operands: operands}) do
    numeric? = fn operand -> match?({:number, _}, operand) end
    length(operands) == 6 and Enum.all?(operands, numeric?)
  end

  defp valid_positioning_operands?(_other), do: false

  # Renders a positioning operator with its translation shifted by `dx`/`dy`.
  #
  # Td/TD: both operands are offset. Tm: only the last two matrix entries (the
  # translation components) are offset; the first four are emitted unchanged.
  # Numbers are formatted with `Number.format/1` and separated by one space.
  defp render_patched(%{operator: op, operands: operands}, dx, dy) when op in [:Td, :TD] do
    [{:number, tx}, {:number, ty}] = operands

    Enum.join([Number.format(tx + dx), Number.format(ty + dy), to_string(op)], " ")
  end

  defp render_patched(%{operator: :Tm, operands: operands}, dx, dy) do
    [{:number, a}, {:number, b}, {:number, c}, {:number, d}, {:number, e}, {:number, f}] =
      operands

    formatted = for value <- [a, b, c, d, e + dx, f + dy], do: Number.format(value)
    Enum.join(formatted, " ") <> " Tm"
  end

  # Builds the `:visual` projection: one relatively positioned
  # `<div class="pdf-page">` per page, sized from the page's mediabox, holding
  # one absolutely positioned `<span data-uid=...>` per indexed glyph.
  #
  # PDF y grows upwards and HTML `top` grows downwards, so each span's `top`
  # is `page height - glyph.y - glyph.font_size`.
  #
  # Errors from indexing or walking the page tree are returned as
  # `{:error, %Error{}}`; a bare atom reason is wrapped with `Error.new/1`.
  defp visual_html(doc) do
    with {:ok, indexed} <- Glyphs.index(doc),
         {:ok, pages} <- PageTree.walk_pages(indexed) do
      page_divs =
        for page <- pages do
          page_height = mediabox_height(page.mediabox)

          spans =
            for glyph <- (indexed.glyphs[page.id] || []) do
              top = page_height - glyph.y - glyph.font_size

              ~s(<span data-uid="#{glyph.id}" style="position:absolute;) <>
                ~s(left:#{fmt(glyph.x)}px;top:#{fmt(top)}px;) <>
                ~s(font-size:#{fmt(glyph.font_size)}px">) <>
                escape(glyph.char) <> "</span>"
            end

          ~s(<div class="pdf-page" style="position:relative;) <>
            ~s(width:#{fmt(mediabox_width(page.mediabox))}px;height:#{fmt(page_height)}px">\n) <>
            Enum.join(spans, "\n") <> "\n</div>"
        end

      body = Enum.join(page_divs, "\n")
      {:ok, "<!DOCTYPE html>\n<html><body>\n" <> body <> "\n</body></html>"}
    else
      {:error, %Error{} = e} -> {:error, e}
      {:error, reason} when is_atom(reason) -> {:error, Error.new(reason)}
    end
  end

  # Builds the `:semantic` projection: one `<div class="pdf-page">` per page
  # whose content is the classified row blocks produced by `page_blocks/1`.
  #
  # Errors from indexing or walking the page tree are returned as
  # `{:error, %Error{}}`; a bare atom reason is wrapped with `Error.new/1`.
  defp semantic_html(doc) do
    with {:ok, indexed} <- Glyphs.index(doc),
         {:ok, pages} <- PageTree.walk_pages(indexed) do
      sections =
        for page <- pages do
          blocks = page_blocks(indexed.glyphs[page.id] || [])
          ~s(<div class="pdf-page">\n) <> blocks <> "\n</div>"
        end

      body = Enum.join(sections, "\n")
      {:ok, "<!DOCTYPE html>\n<html><body>\n" <> body <> "\n</body></html>"}
    else
      {:error, %Error{} = e} -> {:error, e}
      {:error, reason} when is_atom(reason) -> {:error, Error.new(reason)}
    end
  end

  # Renders one page's glyphs as newline-separated semantic blocks, one per
  # clustered row. The page-wide median font size is computed once and handed
  # to every row so headings can be told apart from body text.
  # A page without glyphs renders as the empty string.
  defp page_blocks([]), do: ""

  defp page_blocks(glyphs) do
    median = median_font_size(glyphs)
    rows = cluster_rows(glyphs)

    Enum.map_join(rows, "\n", fn row -> row_block(row, median) end)
  end

  # Groups glyphs into rows by vertical proximity.
  #
  # Glyphs are visited from the highest `y` to the lowest. A glyph joins the
  # row being built when its `y` is within 40% of the larger of the two font
  # sizes from the glyph added just before it; otherwise the row is closed and
  # a new one starts. Rows are returned top to bottom, each in visiting order.
  defp cluster_rows(glyphs) do
    {closed, open} =
      glyphs
      |> Enum.sort_by(fn glyph -> glyph.y end, :desc)
      |> Enum.reduce({[], []}, fn
        glyph, {rows, []} ->
          {rows, [glyph]}

        glyph, {rows, [previous | _] = current} ->
          band = max(previous.font_size, glyph.font_size) * 0.4

          if abs(previous.y - glyph.y) > band do
            {[Enum.reverse(current) | rows], [glyph]}
          else
            {rows, [glyph | current]}
          end
      end)

    # Rows and their glyphs were accumulated by prepending; restore order.
    case open do
      [] -> Enum.reverse(closed)
      _ -> Enum.reverse([Enum.reverse(open) | closed])
    end
  end

  # Renders one clustered row as a single semantic element.
  #
  # The row's glyphs are ordered left to right, grouped into runs, and the
  # runs' characters are joined with a single space between runs. The element
  # carries every glyph id of the row (left to right, space separated) in
  # `data-uids`; its tag comes from `classify/3` using the row's largest font
  # size against the page median.
  defp row_block(row, median) do
    left_to_right = Enum.sort_by(row, fn glyph -> glyph.x end)

    text =
      left_to_right
      |> group_runs()
      |> Enum.map_join(" ", fn run -> Enum.map_join(run, "", fn glyph -> glyph.char end) end)

    uid_list = Enum.map_join(left_to_right, " ", fn glyph -> glyph.id end)
    sizes = for glyph <- left_to_right, do: glyph.font_size
    tag = classify(text, Enum.max(sizes, fn -> median end), median)

    "<" <> tag <> ~s( data-uids=") <> uid_list <> ~s(">) <> escape(text) <> "</" <> tag <> ">"
  end

  # Splits a row's glyphs into runs, one per `show_op_index`. Each run is
  # sorted by `x`, and the runs themselves are ordered by their leftmost glyph.
  defp group_runs(ordered) do
    by_show_op = Enum.group_by(ordered, fn glyph -> glyph.show_op_index end)

    by_show_op
    |> Enum.map(fn {_show_op, members} -> Enum.sort_by(members, fn glyph -> glyph.x end) end)
    # After the per-run sort the head of each run is its minimum-x glyph.
    |> Enum.sort_by(fn [leftmost | _] -> leftmost.x end)
  end

  # Picks the HTML tag for a row. With a positive median, a row whose largest
  # font size is at least 1.6x the median is an "h1" and at least 1.3x an
  # "h2"; everything else is an "li" when the text looks like a list item and
  # a "p" otherwise.
  defp classify(_text, max_fs, median) when median > 0 and max_fs >= median * 1.6 do
    "h1"
  end

  defp classify(_text, max_fs, median) when median > 0 and max_fs >= median * 1.3 do
    "h2"
  end

  defp classify(text, _max_fs, _median) do
    case list_item?(text) do
      true -> "li"
      false -> "p"
    end
  end

  # A list item starts (after leading whitespace) with a bullet (U+2022,
  # U+00B7, "-" or "*") or a number followed by "." or ")", and then whitespace.
  defp list_item?(text) do
    text
    |> String.trim_leading()
    |> String.match?(~r/^(\x{2022}|\x{00B7}|[-*]|\d+[.)])\s/u)
  end

  # Middle font size of the glyphs: the element at index `div(n, 2)` of the
  # ascending sizes (the upper of the two middle values when `n` is even).
  # With no glyphs the result is 12.0.
  defp median_font_size([]), do: 12.0

  defp median_font_size(glyphs) do
    sizes = for glyph <- glyphs, do: glyph.font_size

    sizes
    |> Enum.sort()
    |> Enum.at(div(length(sizes), 2))
  end

  @doc """
  Computes, without applying them, the `Op.UpdateText`s needed to make the
  block identified by `uids` read `new_text`.

  `uids` is either a list of glyph uids or a space-separated string of them.
  The block's current text is its runs' texts joined by single spaces; when
  `new_text` equals it the plan is empty. Otherwise `new_text` is distributed
  over the runs via a Myers diff, and one op is emitted for each run whose
  text changes, addressed by the run's first uid.

  Use `apply_semantic_mutation/3` to apply the plan.
  """
  @spec semantic_ops(Document.t(), [binary()] | binary(), String.t()) ::
          {:ok, [Op.UpdateText.t()]} | {:error, Error.t()}
  def semantic_ops(%Document{} = doc, uids, new_text) when is_binary(new_text) do
    with {:ok, indexed} <- Glyphs.index(doc),
         {:ok, runs} <- resolve_runs(indexed, normalize_uids(uids)) do
      current_text = Enum.map_join(runs, " ", fn run -> run.text end)

      ops =
        if current_text == new_text do
          []
        else
          for {run, replacement} <- Enum.zip(runs, split_new_text(runs, new_text)),
              replacement != run.text,
              do: %Op.UpdateText{uid: run.first_uid, text: replacement}
        end

      {:ok, ops}
    end
  end

  @doc """
  Plans the edit with `semantic_ops/3` and applies it to `doc`.

  The planned ops are applied last-to-first. Editing a run renumbers only the
  glyph UIDs that come after it, so working backwards keeps the uid of every
  not-yet-applied op valid (spec D4). The first failing replacement aborts the
  sequence and its error is returned.
  """
  @spec apply_semantic_mutation(Document.t(), [binary()] | binary(), String.t()) ::
          {:ok, Document.t()} | {:error, Error.t()}
  def apply_semantic_mutation(%Document{} = doc, uids, new_text) when is_binary(new_text) do
    with {:ok, ops} <- semantic_ops(doc, uids, new_text) do
      ops
      |> Enum.reverse()
      |> Enum.reduce_while({:ok, doc}, fn %{uid: uid, text: text}, {:ok, current} ->
        case ContentEdit.replace_text(current, uid, text) do
          {:ok, updated} -> {:cont, {:ok, updated}}
          {:error, _} = failure -> {:halt, failure}
        end
      end)
    end
  end

  # Accepts a block's uids either as a list (returned unchanged) or as the raw
  # text of a `data-uids` attribute.
  #
  # The attribute is split on any run of whitespace, not just single spaces:
  # HTML that has been edited or re-serialized may wrap or re-indent a long
  # attribute, and a uid with an embedded tab or newline would never be found
  # in the uid index.
  defp normalize_uids(uids) when is_list(uids), do: uids
  defp normalize_uids(uids) when is_binary(uids), do: String.split(uids)

  # Maps glyph uids to the runs (show operators) they belong to.
  #
  # Uids that are not `:glyph` entries in `indexed.uid_index` are ignored; if
  # none remain the result is the `:unknown_uid` error. Otherwise returns
  # `{:ok, runs}` with one `%{first_uid: uid, text: text}` per distinct op
  # index, ordered by ascending op index. `first_uid` is the first of the
  # caller's uids that falls in that run, and `text` is that run's text from
  # `ContentEdit.run_text/2` ("" when it cannot be read).
  #
  # NOTE(review): runs are ordered by op index here, while `group_runs/1`
  # orders the rendered text by leftmost x. If a stream draws a row's runs out
  # of left-to-right order the two texts would differ — confirm this is
  # acceptable.
  defp resolve_runs(indexed, uids) do
    pairs =
      for uid <- uids,
          {:glyph, _ref, op_index, _gidx} <- [indexed.uid_index[uid]],
          do: {op_index, uid}

    by_op = Enum.group_by(pairs, &elem(&1, 0), &elem(&1, 1))

    case Enum.sort_by(by_op, &elem(&1, 0)) do
      [] ->
        {:error, Error.new(:unknown_uid)}

      sorted ->
        runs =
          Enum.map(sorted, fn {_op_index, [first | _]} ->
            case ContentEdit.run_text(indexed, first) do
              {:ok, text} -> %{first_uid: first, text: text}
              {:error, _} -> %{first_uid: first, text: ""}
            end
          end)

        {:ok, runs}
    end
  end

  # Distributes `new_text` back onto `runs`, returning one replacement string
  # per run, in run order. The result always has exactly as many entries as
  # `runs` (so `[]` for no runs).
  #
  # The old text is the runs' texts joined by single spaces, with every old
  # grapheme tagged by its owner (see `build_owners/1`). The Myers diff between
  # old and new text is then walked:
  #
  #   * `:eq`  — a kept grapheme goes back to its owning run, which becomes the
  #     current run; a kept separator is not emitted and only advances the
  #     current run to the one after it.
  #   * `:del` — the deleted owners are skipped; the current run is unchanged.
  #   * `:ins` — the inserted text is appended to the current run (run 0 until
  #     the first kept grapheme or separator).
  #
  # Runs that receive nothing map to "".
  defp split_new_text(runs, new_text) do
    owners = build_owners(runs)
    old_text = Enum.map_join(owners, "", fn {grapheme, _owner} -> grapheme end)

    {buffers, _current, _rest} =
      old_text
      |> String.myers_difference(new_text)
      |> Enum.reduce({%{}, 0, owners}, fn
        {:eq, str}, {buffers, current, rest} ->
          {kept, rest} = Enum.split(rest, length(String.graphemes(str)))

          {buffers, current} =
            Enum.reduce(kept, {buffers, current}, fn
              {_space, {:sep, next}}, {acc, _cur} ->
                {acc, next}

              {grapheme, run_idx}, {acc, _cur} ->
                {Map.update(acc, run_idx, grapheme, &(&1 <> grapheme)), run_idx}
            end)

          {buffers, current, rest}

        {:del, str}, {buffers, current, rest} ->
          {buffers, current, Enum.drop(rest, length(String.graphemes(str)))}

        {:ins, str}, {buffers, current, rest} ->
          {Map.update(buffers, current, str, &(&1 <> str)), current, rest}
      end)

    # Iterate the runs themselves rather than `0..(length(runs) - 1)`: for an
    # empty list that range is the descending `0..-1` and yields two entries.
    for {_run, idx} <- Enum.with_index(runs), do: Map.get(buffers, idx, "")
  end

  # Flattens `runs` into `{grapheme, owner}` pairs covering the space-joined
  # old text. A run's own graphemes are owned by the run's index; the single
  # space inserted after every run except the last is owned by
  # `{:sep, next_run_index}`.
  defp build_owners(runs) do
    last = length(runs) - 1

    runs
    |> Enum.with_index()
    |> Enum.flat_map(fn {run, idx} ->
      own = for grapheme <- String.graphemes(run.text), do: {grapheme, idx}
      if idx < last, do: own ++ [{" ", {:sep, idx + 1}}], else: own
    end)
  end

  # Page extent derived from a `[x0, y0, x1, y1]` MediaBox.
  #
  # MediaBox elements are stored verbatim, so a non-numeric/indirect element
  # (e.g. /MediaBox [0 0 612 5 0 R]) must fall back to the default page size
  # (612 x 792) rather than raise out of the public to_html/2.
  #
  # A PDF rectangle may be given by any two diagonally opposite corners, so the
  # second corner is not guaranteed to be the larger one; the extent is the
  # absolute difference so that it is never negative.
  defp mediabox_height([_x0, y0, _x1, y1]) when is_number(y0) and is_number(y1),
    do: abs(y1 - y0)

  defp mediabox_height(_), do: 792

  defp mediabox_width([x0, _y0, x1, _y1]) when is_number(x0) and is_number(x1),
    do: abs(x1 - x0)

  defp mediabox_width(_), do: 612

  # Formats a coordinate or size for a CSS value: integers are printed as-is,
  # floats go through `Number.format/1`.
  defp fmt(n) when is_integer(n) do
    "#{n}"
  end

  defp fmt(n) when is_float(n) do
    Number.format(n)
  end

  # Escapes `&`, `<` and `>` so `text` can be placed in HTML element content.
  # Quotes are left as they are.
  defp escape(text) do
    String.replace(text, ["&", "<", ">"], fn
      "&" -> "&amp;"
      "<" -> "&lt;"
      ">" -> "&gt;"
    end)
  end
end