Skip to main content

native/liteparse_native/src/lib.rs

use std::sync::OnceLock;

use rustler::{Binary, NifMap};
use tokio::runtime::Runtime;

use liteparse::types::PdfInput;
use liteparse::{LiteParse, LiteParseConfig, OutputFormat};

#[derive(NifMap)]
pub struct ParseOpts {
    pub ocr_language: String,
    pub ocr_enabled: bool,
    pub ocr_server_url: Option<String>,
    pub tessdata_path: Option<String>,
    pub max_pages: usize,
    pub target_pages: Option<String>,
    pub dpi: f64,
    pub output_format: String,
    pub preserve_very_small_text: bool,
    pub password: Option<String>,
    pub quiet: bool,
    pub num_workers: usize,
}

impl From<ParseOpts> for LiteParseConfig {
    fn from(opts: ParseOpts) -> Self {
        LiteParseConfig {
            ocr_language: opts.ocr_language,
            ocr_enabled: opts.ocr_enabled,
            ocr_server_url: opts.ocr_server_url,
            tessdata_path: opts.tessdata_path,
            max_pages: opts.max_pages,
            target_pages: opts.target_pages,
            dpi: opts.dpi as f32,
            output_format: match opts.output_format.as_str() {
                "text" => OutputFormat::Text,
                _ => OutputFormat::Json,
            },
            preserve_very_small_text: opts.preserve_very_small_text,
            password: opts.password,
            quiet: opts.quiet,
            num_workers: opts.num_workers,
        }
    }
}

#[derive(NifMap)]
pub struct ParseResponse {
    pub text: String,
    pub page_count: usize,
}

fn runtime() -> &'static Runtime {
    static RT: OnceLock<Runtime> = OnceLock::new();
    RT.get_or_init(|| Runtime::new().expect("failed to build tokio runtime"))
}

#[rustler::nif(schedule = "DirtyCpu")]
fn parse(path: String, opts: ParseOpts) -> Result<ParseResponse, String> {
    let parser = LiteParse::new(LiteParseConfig::from(opts));

    let result = runtime()
        .block_on(parser.parse(&path))
        .map_err(|e| e.to_string())?;

    Ok(ParseResponse {
        text: result.text,
        page_count: result.pages.len(),
    })
}

#[rustler::nif(schedule = "DirtyCpu")]
fn parse_input(bytes: Binary, opts: ParseOpts) -> Result<ParseResponse, String> {
    let parser = LiteParse::new(LiteParseConfig::from(opts));

    let result = runtime()
        .block_on(parser.parse_input(PdfInput::Bytes(bytes.to_vec())))
        .map_err(|e| e.to_string())?;

    Ok(ParseResponse {
        text: result.text,
        page_count: result.pages.len(),
    })
}

rustler::init!("Elixir.LiteParse.Native");