Skip to main content

lib/lite_parse.ex

defmodule LiteParse do
  @moduledoc """
  Elixir wrapper for the [LiteParse](https://github.com/run-llama/liteparse)
  Rust library, providing fast, local PDF and document parsing with spatial
  text extraction.

  Two entry points are exposed:

    * `parse/2` - parses a document located on disk, addressed by path.
    * `parse_input/2` - parses a document already held in memory as a binary.

  Both accept options either as a keyword list or as a `LiteParse.Config`
  struct; the options are converted with `LiteParse.Config.to_nif/1` before
  being handed to `LiteParse.Native`.
  """

  alias LiteParse.Config

  @type parse_result :: %{text: String.t(), page_count: non_neg_integer()}

  @doc """
  Parses a document file from disk and returns its extracted text and page count.

  `path` may be any `t:Path.t/0` value, i.e. a binary or chardata such as a
  charlist. Chardata paths are converted to a binary with
  `IO.chardata_to_string/1` before the native call, so invalid chardata raises
  `ArgumentError`. Any other `path` term raises `FunctionClauseError`.

  ## Options

  See `LiteParse.Config` for the full list. Pass options as a keyword list:

      LiteParse.parse("doc.pdf", max_pages: 100, ocr_enabled: false)

  Or as a reusable struct:

      config = LiteParse.Config.new(ocr_enabled: false)
      LiteParse.parse("doc.pdf", config)

  Returns `{:ok, %{text: binary, page_count: integer}}` on success or
  `{:error, reason}` if the file cannot be read or parsed.
  """
  @spec parse(Path.t(), keyword() | Config.t()) ::
          {:ok, parse_result()} | {:error, String.t()}
  def parse(path, opts \\ []) when is_binary(path) or is_list(path) do
    # `Path.t()` is `IO.chardata()`, so charlist / nested chardata paths are
    # legal per the spec. Normalize them to a binary here; binaries pass
    # through `IO.chardata_to_string/1` unchanged.
    path
    |> IO.chardata_to_string()
    |> LiteParse.Native.parse(Config.to_nif(opts))
  end

  @doc """
  Parses a document from in-memory binary data and returns its extracted text
  and page count. Useful when the document is not on disk (e.g. received from
  a network request or an upload).

  Mirrors the underlying `liteparse::LiteParse::parse_input` API with
  `PdfInput::Bytes`.

  `bytes` must be a binary; any other term raises `FunctionClauseError`.

  ## Options

  See `LiteParse.Config` for the full list. Pass options as a keyword list:

      LiteParse.parse_input(uploaded_pdf_binary, max_pages: 100, ocr_enabled: false)

  Or as a reusable struct:

      config = LiteParse.Config.new(ocr_enabled: false)
      LiteParse.parse_input(uploaded_pdf_binary, config)

  Returns `{:ok, %{text: binary, page_count: integer}}` on success or
  `{:error, reason}` if the data cannot be parsed.
  """
  @spec parse_input(binary(), keyword() | Config.t()) ::
          {:ok, parse_result()} | {:error, String.t()}
  def parse_input(bytes, opts \\ []) when is_binary(bytes) do
    LiteParse.Native.parse_input(bytes, Config.to_nif(opts))
  end
end