Skip to main content

lib/lite_parse/config.ex

defmodule LiteParse.Config do
  @moduledoc """
  Configuration schema for `LiteParse.parse/2` and `LiteParse.parse_input/2`.

  Mirrors the options exposed by the underlying `liteparse` Rust crate.
  Pass options as a keyword list, e.g.:

      LiteParse.parse("doc.pdf", max_pages: 100, ocr_enabled: false)

  Or build a reusable struct:

      config = LiteParse.Config.new(max_pages: 100, ocr_language: "spa")
      LiteParse.parse("a.pdf", config)
      LiteParse.parse_input(bin, config)
  """

  @default_num_workers max(System.schedulers_online() - 1, 1)

  @schema [
    ocr_language: [
      type: :string,
      default: "eng",
      doc: "OCR language code (Tesseract format: \"eng\", \"fra\", \"deu\", etc.)."
    ],
    ocr_enabled: [
      type: :boolean,
      default: true,
      doc: "Run OCR on text-sparse pages and embedded images."
    ],
    ocr_server_url: [
      type: {:or, [:string, nil]},
      default: nil,
      doc: "HTTP OCR server URL. When nil, uses built-in Tesseract if available."
    ],
    tessdata_path: [
      type: {:or, [:string, nil]},
      default: nil,
      doc: "Path to tessdata directory. Falls back to the TESSDATA_PREFIX env var."
    ],
    max_pages: [
      type: :non_neg_integer,
      default: 1000,
      doc: "Maximum number of pages to parse."
    ],
    target_pages: [
      type: {:or, [:string, nil]},
      default: nil,
      doc: "Range expression like \"1-5,10,15-20\". `nil` means all pages."
    ],
    dpi: [
      type: :float,
      default: 150.0,
      doc: "DPI for rendering pages (used for OCR and screenshots)."
    ],
    output_format: [
      type: {:in, [:json, :text]},
      default: :json,
      doc: "Output format hint passed to the parser."
    ],
    preserve_very_small_text: [
      type: :boolean,
      default: false,
      doc: "Keep very small text that would normally be filtered out."
    ],
    password: [
      type: {:or, [:string, nil]},
      default: nil,
      doc: "Password for encrypted/protected documents."
    ],
    quiet: [
      type: :boolean,
      default: true,
      doc: "Suppress progress output in the Rust layer. Elixir's Logger is unaffected."
    ],
    num_workers: [
      type: :non_neg_integer,
      default: @default_num_workers,
      doc: "Number of concurrent OCR workers."
    ]
  ]

  @type t :: %__MODULE__{
          ocr_language: String.t(),
          ocr_enabled: boolean(),
          ocr_server_url: String.t() | nil,
          tessdata_path: String.t() | nil,
          max_pages: non_neg_integer(),
          target_pages: String.t() | nil,
          dpi: float(),
          output_format: :json | :text,
          preserve_very_small_text: boolean(),
          password: String.t() | nil,
          quiet: boolean(),
          num_workers: non_neg_integer()
        }

  defstruct [
    :ocr_language,
    :ocr_enabled,
    :ocr_server_url,
    :tessdata_path,
    :max_pages,
    :target_pages,
    :dpi,
    :output_format,
    :preserve_very_small_text,
    :password,
    :quiet,
    :num_workers
  ]

  @doc """
  Builds a `%LiteParse.Config{}` from a keyword list, applying defaults
  and validating types.

  Raises `NimbleOptions.ValidationError` on bad input.

  ## Examples

      iex> LiteParse.Config.new(max_pages: 50)
      %LiteParse.Config{max_pages: 50, ocr_language: "eng", ...}
  """
  @spec new(keyword()) :: t()
  def new(opts \\ []) when is_list(opts) do
    struct!(__MODULE__, NimbleOptions.validate!(opts, @schema))
  end

  @doc """
  Returns the validated keyword list (with defaults applied) for the given
  options. Accepts a keyword list or a `%LiteParse.Config{}` struct.
  """
  @spec validate(keyword() | t()) :: keyword()
  def validate(opts) do
    NimbleOptions.validate!(opts, @schema)
  end

  @doc """
  Converts the validated options into the map shape expected by the NIF:
  string-keyed map with `output_format` serialised to a lowercase string.

  Accepts a keyword list or a `%LiteParse.Config{}` struct. When given a struct,
  it is trusted as already-validated and just converted; when given a keyword
  list, it goes through `NimbleOptions.validate!/2` first.

  Internal-facing but public to keep `LiteParse.parse/2` decoupled from
  the schema definition.
  """
  @spec to_nif(keyword() | t()) :: map()
  def to_nif(%__MODULE__{} = opts) do
    opts
    |> Map.from_struct()
    |> Map.update!(:output_format, &Atom.to_string/1)
  end

  def to_nif(opts) when is_list(opts) do
    opts
    |> validate()
    |> Map.new()
    |> Map.update!(:output_format, &Atom.to_string/1)
  end
end