# lib/pdf_ex.ex

defmodule PdfEx do
  @moduledoc """
  Pure-Elixir PDF parsing and lossless surgery engine.

  No NIFs, no C bindings, no external binaries — one runtime dependency
  (`:telemetry`). This module is the read-oriented facade (`open/1`,
  `page_count/1`, `pages/1`, `extract_text/1,2`); editing, projection, and
  serialization live in dedicated modules:

    * `PdfEx.Editor` — structural page ops (insert / delete / reorder)
    * `PdfEx.ContentEdit` — run-level text replacement and glyph deletion
    * `PdfEx.Convert` — visual & semantic HTML projection + reverse mutation
    * `PdfEx.Serializer` — incremental (lossless) and full re-serialization
    * `PdfEx.Session` — supervised collaborative editing sessions
    * `PdfEx.Font.Surgery` — TrueType glyph-subset surgery

  Every read/edit API is a pure function over an immutable `PdfEx.Document`;
  malformed input never raises, it returns `{:error, PdfEx.Error.t()}`.

  ## Usage

      iex> {:ok, doc} = PdfEx.open(File.read!("document.pdf"))
      iex> {:ok, pages} = PdfEx.pages(doc)
      iex> {:ok, text} = PdfEx.extract_text(doc)
  """

  alias PdfEx.{Document, Error, Filter, PageTree, Resolver}
  alias PdfEx.Content.{Interpreter, Layout}
  # NOTE: `Stream` below is `PdfEx.COS.Stream`; inside this module it shadows
  # Elixir's own `Stream`.
  alias PdfEx.COS.{Lexer, Reference, Stream}
  alias PdfEx.XRef.{Bootstrap, StreamParser, Walker}

  # Separator placed between the texts of consecutive non-empty pages.
  @page_break "\n--- Page Break ---\n"

  @doc """
  Opens PDF `data` into a `PdfEx.Document`.

  The binary must start with `%PDF-`. The cross-reference map and trailer are
  built by `PdfEx.XRef.Walker.build_xref_map/2` from the `startxref` offset, and
  a trailer carrying an `:Encrypt` entry is refused with `:encrypted_pdf`.

  Returns `{:ok, document}` on success. On failure returns
  `{:error, %PdfEx.Error{}}`: an atom reason becomes the error's reason, any
  other term is reported as `:parse_error` with the inspected term as message,
  and a reason that already is a `PdfEx.Error` is returned unchanged.
  """
  @spec open(binary()) :: {:ok, Document.t()} | {:error, Error.t()}
  def open(data) when is_binary(data) do
    with :ok <- verify_header(data),
         {:ok, startxref} <- Bootstrap.find_startxref(data),
         {:ok, xref_map, trailer} <- Walker.build_xref_map(data, startxref),
         :ok <- verify_unencrypted(trailer) do
      {:ok,
       %Document{
         source: data,
         revision_xref_offset: startxref,
         xref_style: detect_xref_style(data, startxref),
         xref_map: xref_map,
         trailer: trailer,
         object_cache: %{}
       }}
    else
      {:error, reason} -> {:error, normalize_error(reason)}
    end
  end

  @doc """
  Like `open/1` but returns the document directly and raises on failure.

  Raises a `RuntimeError` whose message contains the error reason and, when the
  error carries one, its message.
  """
  @spec open!(binary()) :: Document.t()
  def open!(data) do
    case open(data) do
      {:ok, doc} ->
        doc

      {:error, %Error{reason: reason, message: message}} ->
        raise "PdfEx.open! failed: " <> failure_text(reason, message)
    end
  end

  @doc """
  Returns the document's page count from the catalog's `/Pages` `/Count`.

  The trailer's `:Root` and the catalog's `:Pages` must both be references that
  resolve to plain (non-struct) maps; otherwise
  `{:error, %PdfEx.Error{reason: :cannot_determine_page_count}}` is returned.
  A `:Count` that is missing or not a non-negative integer is reported as
  `{:ok, 0}`.
  """
  @spec page_count(Document.t()) :: {:ok, non_neg_integer()} | {:error, Error.t()}
  def page_count(%Document{} = doc) do
    with {:ok, catalog} <- resolve_dict(doc, doc.trailer[:Root]),
         {:ok, pages} <- resolve_dict(doc, catalog[:Pages]) do
      case pages[:Count] do
        count when is_integer(count) and count >= 0 -> {:ok, count}
        _ -> {:ok, 0}
      end
    else
      _ -> {:error, Error.new(:cannot_determine_page_count)}
    end
  end

  @doc """
  Walks the page tree, returning the leaf pages in document order with inherited attributes resolved.

  Delegates to `PdfEx.PageTree.walk_pages/1`. Any `{:error, reason}` it returns
  is converted to a `PdfEx.Error` the same way `open/1` does, so a non-atom
  reason yields a `:parse_error` instead of raising.
  """
  @spec pages(Document.t()) :: {:ok, [PageTree.Page.t()]} | {:error, Error.t()}
  def pages(%Document{} = doc) do
    case PageTree.walk_pages(doc) do
      {:ok, page_list} -> {:ok, page_list}
      {:error, reason} -> {:error, normalize_error(reason)}
    end
  end

  @doc """
  Extracts all text, in reading order, joined across pages by a page-break marker.

  Pages whose extracted text is the empty string are skipped, so the marker only
  appears between pages that produced text. Errors from `pages/1` are returned
  unchanged.
  """
  @spec extract_text(Document.t()) :: {:ok, String.t()} | {:error, Error.t()}
  def extract_text(%Document{} = doc) do
    with {:ok, page_list} <- pages(doc) do
      text =
        page_list
        |> Enum.map(&extract_page_text(doc, &1))
        |> Enum.reject(&(&1 == ""))
        |> Enum.join(@page_break)

      {:ok, text}
    end
  end

  @doc """
  Extracts text from a single 1-based `page_number`.

  Returns `{:error, %PdfEx.Error{reason: :page_out_of_range}}` when the document
  has fewer than `page_number` pages. Errors from `pages/1` are returned
  unchanged.
  """
  @spec extract_text(Document.t(), pos_integer()) :: {:ok, String.t()} | {:error, Error.t()}
  def extract_text(%Document{} = doc, page_number)
      when is_integer(page_number) and page_number >= 1 do
    with {:ok, page_list} <- pages(doc) do
      case Enum.at(page_list, page_number - 1) do
        nil ->
          {:error, Error.new(:page_out_of_range, "page #{page_number} does not exist")}

        page ->
          {:ok, extract_page_text(doc, page)}
      end
    end
  end

  # Decodes every content stream referenced by `page.contents`, joins them with
  # newlines, interprets the result against the page resources and hands the
  # fragments to `Layout.reconstruct/1`. Entries that are not references, or
  # that do not resolve to a stream, are skipped.
  defp extract_page_text(doc, page) do
    decoded_streams =
      for %Reference{id: id, gen: gen} <- page.contents,
          {:ok, %Stream{} = stream} <- [Resolver.resolve(doc, {id, gen})] do
        Filter.decode_lenient(stream)
      end

    decoded_streams
    |> Enum.join("\n")
    |> Interpreter.interpret(doc, page.resources)
    |> Layout.reconstruct()
  end

  # Resolves `ref` and accepts the result only when it is a plain map (not a
  # struct). Anything else, including a non-reference `ref`, yields `:error`.
  defp resolve_dict(doc, %Reference{id: id, gen: gen}) do
    case Resolver.resolve(doc, {id, gen}) do
      {:ok, dict} when is_map(dict) and not is_struct(dict) -> {:ok, dict}
      _ -> :error
    end
  end

  defp resolve_dict(_doc, _not_a_reference), do: :error

  # Reports `:stream` only when the token at `startxref` is a number and the
  # xref stream parser accepts the data at that offset; everything else
  # (including the `xref` keyword) is `:classic`.
  defp detect_xref_style(data, startxref) do
    with {:ok, {:number, _}, _} <- Lexer.tokenize_next(data, startxref),
         {:ok, _, _} <- StreamParser.parse(data, startxref) do
      :stream
    else
      _ -> :classic
    end
  end

  # Turns a failure reason into a `PdfEx.Error`, leaving existing errors as-is
  # so they are never wrapped a second time.
  defp normalize_error(%Error{} = error), do: error
  defp normalize_error(reason) when is_atom(reason), do: Error.new(reason)
  defp normalize_error(reason), do: Error.new(:parse_error, inspect(reason))

  # Renders "reason" or "reason: message" for the exception raised by `open!/1`.
  defp failure_text(reason, message) when message in [nil, ""], do: "#{reason}"
  defp failure_text(reason, message), do: "#{reason}: #{message}"

  defp verify_header(<<"%PDF-", _::binary>>), do: :ok
  defp verify_header(_), do: {:error, :not_a_pdf}

  # Encrypted documents are refused: any trailer with an `:Encrypt` key fails.
  defp verify_unencrypted(trailer) do
    if Map.has_key?(trailer, :Encrypt), do: {:error, :encrypted_pdf}, else: :ok
  end
end