defmodule PdfEx do
@moduledoc """
Pure-Elixir PDF parsing and lossless surgery engine.
No NIFs, no C bindings, no external binaries — one runtime dependency
(`:telemetry`). This module is the read-oriented facade (`open/1`,
`page_count/1`, `pages/1`, `extract_text/1,2`); editing, projection, and
serialization live in dedicated modules:
* `PdfEx.Editor` — structural page ops (insert / delete / reorder)
* `PdfEx.ContentEdit` — run-level text replacement and glyph deletion
* `PdfEx.Convert` — visual & semantic HTML projection + reverse mutation
* `PdfEx.Serializer` — incremental (lossless) and full re-serialization
* `PdfEx.Session` — supervised collaborative editing sessions
* `PdfEx.Font.Surgery` — TrueType glyph-subset surgery
Every read/edit API is a pure function over an immutable `PdfEx.Document`;
malformed input never raises, it returns `{:error, PdfEx.Error.t()}`.
## Usage
iex> {:ok, doc} = PdfEx.open(File.read!("document.pdf"))
iex> {:ok, pages} = PdfEx.pages(doc)
iex> {:ok, text} = PdfEx.extract_text(doc)
"""
alias PdfEx.{Document, Error, Filter}
alias PdfEx.XRef.{Bootstrap, Walker}
alias PdfEx.{PageTree, Resolver}
alias PdfEx.COS.{Lexer, Reference, Stream}
alias PdfEx.Content.{Interpreter, Layout}
@doc """
Opens PDF `data` into a `PdfEx.Document`.

The binary must begin with the `%PDF-` header. The `startxref` offset is
located with `PdfEx.XRef.Bootstrap.find_startxref/1`, and the cross-reference
map and trailer are built with `PdfEx.XRef.Walker.build_xref_map/2`. A trailer
that carries an `:Encrypt` entry is refused with reason `:encrypted_pdf`.

Returns `{:ok, document}` (with an empty object cache) on success, or
`{:error, %PdfEx.Error{}}` on failure:

  * an atom reason is wrapped with `PdfEx.Error.new/1`;
  * an already-built `PdfEx.Error` is returned untouched;
  * any other failure shape is reported as `:parse_error`, with the inspected
    term as the message, so an unexpected return value from a parsing step
    cannot surface as a `WithClauseError`.
"""
@spec open(binary()) :: {:ok, Document.t()} | {:error, Error.t()}
def open(data) when is_binary(data) do
  with :ok <- verify_header(data),
       {:ok, startxref} <- Bootstrap.find_startxref(data),
       {:ok, xref_map, trailer} <- Walker.build_xref_map(data, startxref),
       :ok <- verify_unencrypted(trailer) do
    doc = %Document{
      source: data,
      revision_xref_offset: startxref,
      xref_style: detect_xref_style(data, startxref),
      xref_map: xref_map,
      trailer: trailer,
      object_cache: %{}
    }

    {:ok, doc}
  else
    # Must come before the generic clauses: a struct is not an atom, so it
    # would otherwise be flattened into an inspected :parse_error message.
    {:error, %Error{} = error} ->
      {:error, error}

    {:error, reason} when is_atom(reason) ->
      {:error, Error.new(reason)}

    {:error, reason} ->
      {:error, Error.new(:parse_error, inspect(reason))}

    # Any other shape (e.g. a bare :error) would raise WithClauseError here;
    # report it as a parse failure instead.
    unexpected ->
      {:error, Error.new(:parse_error, inspect(unexpected))}
  end
end
@doc """
Raising variant of `open/1`.

Returns the `PdfEx.Document` itself on success. On failure it raises a
`RuntimeError` whose message contains the error's reason and message.
"""
@spec open!(binary()) :: Document.t()
def open!(data) do
  case open(data) do
    {:error, %Error{reason: reason, message: msg}} ->
      raise RuntimeError, message: "PdfEx.open! failed: #{reason} — #{msg}"

    {:ok, document} ->
      document
  end
end
@doc """
Returns the page count recorded in the page-tree root.

Follows the trailer's `:Root` reference to the catalog, then the catalog's
`:Pages` reference to the page-tree root, and reads its `:Count` entry.

A `:Count` that is missing, negative, or not an integer yields `{:ok, 0}`.
If either reference is absent, or does not resolve to a plain (non-struct)
map, the result is `{:error, %PdfEx.Error{}}` with reason
`:cannot_determine_page_count`.
"""
@spec page_count(Document.t()) :: {:ok, non_neg_integer()} | {:error, Error.t()}
def page_count(%Document{trailer: trailer} = doc) do
  with {:ok, catalog} <- resolve_dictionary(doc, trailer[:Root]),
       {:ok, pages_root} <- resolve_dictionary(doc, catalog[:Pages]) do
    {:ok, non_negative_count(pages_root[:Count])}
  else
    _ -> {:error, Error.new(:cannot_determine_page_count)}
  end
end

# Resolves an indirect reference and accepts the result only when it is a
# plain map; structs and any non-reference argument yield :error.
defp resolve_dictionary(doc, %Reference{id: id, gen: gen}) do
  case Resolver.resolve(doc, {id, gen}) do
    {:ok, dict} when is_map(dict) and not is_struct(dict) -> {:ok, dict}
    _ -> :error
  end
end

defp resolve_dictionary(_doc, _not_a_reference), do: :error

# Anything other than a non-negative integer is treated as zero pages.
defp non_negative_count(count) when is_integer(count) and count >= 0, do: count
defp non_negative_count(_), do: 0
@doc """
Walks the page tree and returns the leaf pages in document order with
inherited attributes resolved.

Delegates to `PdfEx.PageTree.walk_pages/1`. Failures are always returned as
`{:error, %PdfEx.Error{}}`:

  * an atom reason is wrapped with `PdfEx.Error.new/1`;
  * an already-built `PdfEx.Error` is returned untouched;
  * any other reason is reported as `:parse_error` with the inspected term as
    the message (the same convention `open/1` uses), rather than raising a
    `CaseClauseError`.
"""
@spec pages(Document.t()) :: {:ok, [PageTree.Page.t()]} | {:error, Error.t()}
def pages(%Document{} = doc) do
  case PageTree.walk_pages(doc) do
    {:ok, page_list} ->
      {:ok, page_list}

    {:error, %Error{} = error} ->
      {:error, error}

    {:error, reason} when is_atom(reason) ->
      {:error, Error.new(reason)}

    # Non-atom reasons (tuples, strings, ...) previously had no matching
    # clause and crashed the caller.
    {:error, reason} ->
      {:error, Error.new(:parse_error, inspect(reason))}
  end
end
@doc """
Extracts the text of every page, in page order.

Pages whose extracted text is the empty string are dropped; the remaining
page texts are joined with a `--- Page Break ---` marker line. Any error
from `pages/1` is returned unchanged.
"""
@spec extract_text(Document.t()) :: {:ok, String.t()} | {:error, Error.t()}
def extract_text(%Document{} = doc) do
  with {:ok, page_list} <- pages(doc) do
    non_empty_texts =
      Enum.flat_map(page_list, fn page ->
        case extract_page_text(doc, page) do
          "" -> []
          page_text -> [page_text]
        end
      end)

    {:ok, Enum.join(non_empty_texts, "\n--- Page Break ---\n")}
  end
end
@doc """
Extracts the text of one page, selected by its 1-based `page_number`.

Returns `{:ok, text}` for an existing page. A `page_number` beyond the last
page yields `{:error, %PdfEx.Error{}}` with reason `:page_out_of_range`; any
error from `pages/1` is returned unchanged. Only positive integers are
accepted by the guard.
"""
@spec extract_text(Document.t(), pos_integer()) :: {:ok, String.t()} | {:error, Error.t()}
def extract_text(%Document{} = doc, page_number)
    when is_integer(page_number) and page_number >= 1 do
  with {:ok, page_list} <- pages(doc),
       page when not is_nil(page) <- Enum.at(page_list, page_number - 1) do
    {:ok, extract_page_text(doc, page)}
  else
    # Enum.at/2 found nothing at that index.
    nil ->
      {:error, Error.new(:page_out_of_range, "page #{page_number} does not exist")}

    {:error, _} = failure ->
      failure
  end
end
# Produces the reconstructed text of a single page.
#
# Every entry of `page.contents` that is an indirect reference resolving to a
# content stream is decoded leniently; entries of any other shape, and
# references that fail to resolve to a stream, are skipped. The decoded
# streams are joined with newlines, interpreted against the page's resources,
# and handed to the layout reconstruction.
#
# Note: `Stream` here is the aliased `PdfEx.COS.Stream`, not Elixir's `Stream`.
defp extract_page_text(doc, page) do
  decoded_streams =
    for %Reference{id: id, gen: gen} <- page.contents,
        {:ok, %Stream{} = stream} <- [Resolver.resolve(doc, {id, gen})] do
      Filter.decode_lenient(stream)
    end

  decoded_streams
  |> Enum.join("\n")
  |> Interpreter.interpret(doc, page.resources)
  |> Layout.reconstruct()
end
# Classifies the cross-reference section found at `startxref`.
#
# Returns :stream only when the first token at that offset is a number AND
# `PdfEx.XRef.StreamParser.parse/2` succeeds there. Every other outcome
# (the `xref` keyword, a parse failure, or a tokenizer failure) is :classic.
# NOTE(review): the number is presumably the object number of an xref-stream
# object header; confirm against the lexer and stream parser.
defp detect_xref_style(data, startxref) do
  with {:ok, {:number, _}, _} <- Lexer.tokenize_next(data, startxref),
       {:ok, _, _} <- PdfEx.XRef.StreamParser.parse(data, startxref) do
    :stream
  else
    _ -> :classic
  end
end
# Accepts the input only when it starts with the literal `%PDF-` marker;
# anything else (including non-binaries) is rejected as :not_a_pdf.
defp verify_header(data) do
  case data do
    "%PDF-" <> _rest -> :ok
    _other -> {:error, :not_a_pdf}
  end
end
# Refuses documents whose trailer has an `:Encrypt` key, whatever its value.
defp verify_unencrypted(trailer) do
  case Map.fetch(trailer, :Encrypt) do
    {:ok, _encrypt_entry} -> {:error, :encrypted_pdf}
    :error -> :ok
  end
end
end