// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:06831f8166c6d860691af36ee02b72ae3246568eb2e5c67ed5d11da71d02afeb
// Re-generate with: alef generate
#![allow(dead_code, unused_imports, unused_variables)]
#![allow(
clippy::too_many_arguments,
clippy::let_unit_value,
clippy::needless_borrow,
clippy::map_identity,
clippy::just_underscores_and_digits,
clippy::unused_unit,
clippy::unnecessary_cast,
clippy::unwrap_or_default,
clippy::derivable_impls,
clippy::needless_borrows_for_generic_args,
clippy::unnecessary_fallible_conversions
)]
use rustler::Encoder;
use rustler::ResourceArc;
use std::collections::HashMap;
use std::sync::Arc;
/// Metadata about an extraction: optional cost, token counts and model name,
/// plus the number of chunks processed.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`;
/// serde (de)serialization and `Default` are derived as well.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ExtractionMeta {
// NOTE(review): currency/unit of `cost` is not visible in this file — confirm upstream.
pub cost: Option<f64>,
pub prompt_tokens: Option<u64>,
pub completion_tokens: Option<u64>,
pub model: Option<String>,
pub chunks_processed: usize,
}
impl ExtractionMeta {
    /// Builds an `ExtractionMeta` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode is not reported: the `Option` fields become `None`
    /// and `chunks_processed` takes its `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            cost: field!("cost"),
            prompt_tokens: field!("prompt_tokens"),
            completion_tokens: field!("completion_tokens"),
            model: field!("model"),
            chunks_processed: field!("chunks_processed").unwrap_or_default(),
        }
    }
}
/// Proxy settings: a URL plus optional username and password.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ProxyConfig {
pub url: String,
// Credentials are optional; `None` when not supplied.
pub username: Option<String>,
pub password: Option<String>,
}
impl ProxyConfig {
    /// Builds a `ProxyConfig` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing or undecodable `url` becomes the empty
    /// string (`String::default()`); missing or undecodable credentials become `None`.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let url = opts
            .get("url")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let username = opts.get("username").and_then(|term| term.decode().ok());
        let password = opts.get("password").and_then(|term| term.decode().ok());

        Self {
            url,
            username,
            password,
        }
    }
}
/// Content-processing options: output format, preprocessing preset, element
/// removal / strip / preserve / exclude lists, image skipping, depth limit and
/// line-wrapping settings.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
/// NOTE(review): the derived `Default` yields `false` / `0` for every flag and number,
/// which differs from the fallbacks `ContentConfig::new` applies (`true` / `80` for
/// several fields) — confirm which set of defaults is intended.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ContentConfig {
// NOTE(review): the accepted values of these two strings are not visible here.
pub output_format: String,
pub preprocessing_preset: String,
pub remove_navigation: bool,
pub remove_forms: bool,
pub strip_tags: Vec<String>,
pub preserve_tags: Vec<String>,
pub exclude_selectors: Vec<String>,
pub skip_images: bool,
pub max_depth: Option<usize>,
pub wrap: bool,
pub wrap_width: usize,
pub include_document_structure: bool,
}
impl ContentConfig {
    /// Builds a `ContentConfig` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode silently falls back to:
    /// - `true` for `remove_navigation`, `remove_forms`, `include_document_structure`;
    /// - `false` for `skip_images` and `wrap`;
    /// - `80` for `wrap_width`;
    /// - `None` for `max_depth`;
    /// - the type's `Default` for the string and list fields.
    ///
    /// NOTE(review): these fallbacks are not the same as the derived
    /// `ContentConfig::default()` (which gives `false` / `0`) — confirm that the
    /// divergence is intentional.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            output_format: field!("output_format").unwrap_or_default(),
            preprocessing_preset: field!("preprocessing_preset").unwrap_or_default(),
            remove_navigation: field!("remove_navigation").unwrap_or(true),
            remove_forms: field!("remove_forms").unwrap_or(true),
            strip_tags: field!("strip_tags").unwrap_or_default(),
            preserve_tags: field!("preserve_tags").unwrap_or_default(),
            exclude_selectors: field!("exclude_selectors").unwrap_or_default(),
            skip_images: field!("skip_images").unwrap_or(false),
            max_depth: field!("max_depth"),
            wrap: field!("wrap").unwrap_or(false),
            wrap_width: field!("wrap_width").unwrap_or(80),
            include_document_structure: field!("include_document_structure").unwrap_or(true),
        }
    }
}
/// Browser options: mode, backend, optional endpoint, timeout, wait behaviour,
/// optional proxy, URL block patterns, optional script / user-agent strings and
/// two boolean toggles.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
/// NOTE(review): the derived `Default` gives `timeout == 0` and
/// `session_affinity == false`, whereas `BrowserConfig::new` falls back to `30000`
/// and `true` — confirm which is intended.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BrowserConfig {
pub mode: BrowserMode,
pub backend: BrowserBackend,
pub endpoint: Option<String>,
// NOTE(review): unit not visible here; the 30000 fallback in `new` hints at milliseconds — confirm.
pub timeout: u64,
pub wait: BrowserWait,
pub wait_selector: Option<String>,
pub extra_wait: Option<u64>,
pub proxy: Option<ProxyConfig>,
pub block_url_patterns: Vec<String>,
pub eval_script: Option<String>,
pub robots_user_agent: Option<String>,
pub capture_network_events: bool,
pub session_affinity: bool,
}
impl BrowserConfig {
    /// Builds a `BrowserConfig` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode silently falls back to: `30000` for `timeout`,
    /// `false` for `capture_network_events`, `true` for `session_affinity`, `None`
    /// for the `Option` fields and the type's `Default` for everything else.
    ///
    /// NOTE(review): the `timeout` and `session_affinity` fallbacks differ from the
    /// derived `BrowserConfig::default()` (`0` / `false`) — confirm this is intended.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            mode: field!("mode").unwrap_or_default(),
            backend: field!("backend").unwrap_or_default(),
            endpoint: field!("endpoint"),
            timeout: field!("timeout").unwrap_or(30000),
            wait: field!("wait").unwrap_or_default(),
            wait_selector: field!("wait_selector"),
            extra_wait: field!("extra_wait"),
            proxy: field!("proxy"),
            block_url_patterns: field!("block_url_patterns").unwrap_or_default(),
            eval_script: field!("eval_script"),
            robots_user_agent: field!("robots_user_agent"),
            capture_network_events: field!("capture_network_events").unwrap_or(false),
            session_affinity: field!("session_affinity").unwrap_or(true),
        }
    }
}
/// Top-level crawl configuration: page/depth/concurrency limits, robots and domain
/// flags, path filters, custom headers, timeout / rate-limit / redirect / retry
/// settings, optional auth and proxy, nested `ContentConfig` and `BrowserConfig`,
/// asset and document download settings, and an `SsrfPolicy`.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
/// NOTE(review): the derived `Default` yields `0` / `false` for every number and flag,
/// which differs from several fallbacks used by `CrawlConfig::new` — confirm intent.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CrawlConfig {
pub max_depth: Option<usize>,
pub max_pages: Option<usize>,
pub max_concurrent: Option<usize>,
pub respect_robots_txt: bool,
pub soft_http_errors: bool,
pub user_agent: Option<String>,
pub stay_on_domain: bool,
pub allow_subdomains: bool,
pub include_paths: Vec<String>,
pub exclude_paths: Vec<String>,
pub custom_headers: HashMap<String, String>,
// NOTE(review): unit not visible here; the 30000 fallback in `new` hints at milliseconds — confirm.
pub request_timeout: u64,
pub rate_limit_ms: Option<u64>,
pub max_redirects: usize,
pub retry_count: usize,
pub retry_codes: Vec<u16>,
pub cookies_enabled: bool,
pub auth: Option<AuthConfig>,
pub max_body_size: Option<usize>,
pub remove_tags: Vec<String>,
pub content: ContentConfig,
pub map_limit: Option<usize>,
pub map_search: Option<String>,
pub download_assets: bool,
pub asset_types: Vec<AssetCategory>,
pub max_asset_size: Option<usize>,
pub browser: BrowserConfig,
pub proxy: Option<ProxyConfig>,
pub user_agents: Vec<String>,
pub capture_screenshot: bool,
pub follow_document_urls: bool,
pub document_url_depth: Option<u32>,
pub download_documents: bool,
pub document_max_size: Option<usize>,
pub document_mime_types: Vec<String>,
pub warc_output: Option<String>,
pub browser_profile: Option<String>,
pub save_browser_profile: bool,
pub ssrf: SsrfPolicy,
}
impl CrawlConfig {
    /// Builds a `CrawlConfig` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode silently falls back to:
    /// - `30000` for `request_timeout`, `10` for `max_redirects`, `0` for `retry_count`;
    /// - `true` for `download_documents`, `false` for every other boolean;
    /// - `None` for the `Option` fields;
    /// - the type's `Default` for lists, maps and the nested `content`, `browser`
    ///   and `ssrf` values.
    ///
    /// NOTE(review): `content` and `browser` fall back to the *derived* `Default` of
    /// `ContentConfig` / `BrowserConfig`, not to the fallbacks those types apply in
    /// their own `new` (e.g. `wrap_width` 0 vs 80, `timeout` 0 vs 30000) — confirm
    /// that this is intended.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            max_depth: field!("max_depth"),
            max_pages: field!("max_pages"),
            max_concurrent: field!("max_concurrent"),
            respect_robots_txt: field!("respect_robots_txt").unwrap_or(false),
            soft_http_errors: field!("soft_http_errors").unwrap_or(false),
            user_agent: field!("user_agent"),
            stay_on_domain: field!("stay_on_domain").unwrap_or(false),
            allow_subdomains: field!("allow_subdomains").unwrap_or(false),
            include_paths: field!("include_paths").unwrap_or_default(),
            exclude_paths: field!("exclude_paths").unwrap_or_default(),
            custom_headers: field!("custom_headers").unwrap_or_default(),
            request_timeout: field!("request_timeout").unwrap_or(30000),
            rate_limit_ms: field!("rate_limit_ms"),
            max_redirects: field!("max_redirects").unwrap_or(10),
            retry_count: field!("retry_count").unwrap_or(0),
            retry_codes: field!("retry_codes").unwrap_or_default(),
            cookies_enabled: field!("cookies_enabled").unwrap_or(false),
            auth: field!("auth"),
            max_body_size: field!("max_body_size"),
            remove_tags: field!("remove_tags").unwrap_or_default(),
            content: field!("content").unwrap_or_default(),
            map_limit: field!("map_limit"),
            map_search: field!("map_search"),
            download_assets: field!("download_assets").unwrap_or(false),
            asset_types: field!("asset_types").unwrap_or_default(),
            max_asset_size: field!("max_asset_size"),
            browser: field!("browser").unwrap_or_default(),
            proxy: field!("proxy"),
            user_agents: field!("user_agents").unwrap_or_default(),
            capture_screenshot: field!("capture_screenshot").unwrap_or(false),
            follow_document_urls: field!("follow_document_urls").unwrap_or(false),
            document_url_depth: field!("document_url_depth"),
            download_documents: field!("download_documents").unwrap_or(true),
            document_max_size: field!("document_max_size"),
            document_mime_types: field!("document_mime_types").unwrap_or_default(),
            warc_output: field!("warc_output"),
            browser_profile: field!("browser_profile"),
            save_browser_profile: field!("save_browser_profile").unwrap_or(false),
            ssrf: field!("ssrf").unwrap_or_default(),
        }
    }
}
/// Additional browser output: an optional eval result string, a list of
/// `ResponseMeta` network events and a list of `CookieInfo` cookies.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BrowserExtras {
pub eval_result: Option<String>,
pub network_events: Vec<ResponseMeta>,
pub cookies: Vec<CookieInfo>,
}
impl BrowserExtras {
    /// Builds a `BrowserExtras` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Missing or undecodable entries are tolerated:
    /// `eval_result` becomes `None`, the two lists become empty.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let eval_result = opts.get("eval_result").and_then(|term| term.decode().ok());
        let network_events = opts
            .get("network_events")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let cookies = opts
            .get("cookies")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self {
            eval_result,
            network_events,
            cookies,
        }
    }
}
/// Description of a downloaded document: URL, MIME type, size, optional filename,
/// content hash and a string-to-string header map.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct DownloadedDocument {
pub url: String,
pub mime_type: String,
// NOTE(review): presumably a byte count — confirm upstream.
pub size: usize,
pub filename: Option<String>,
// NOTE(review): hash algorithm/encoding is not visible in this file.
pub content_hash: String,
pub headers: HashMap<String, String>,
}
impl DownloadedDocument {
    /// Builds a `DownloadedDocument` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing key or a term that fails to decode is
    /// not reported: `filename` becomes `None`, every other field takes its
    /// `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            url: field!("url").unwrap_or_default(),
            mime_type: field!("mime_type").unwrap_or_default(),
            size: field!("size").unwrap_or_default(),
            filename: field!("filename"),
            content_hash: field!("content_hash").unwrap_or_default(),
            headers: field!("headers").unwrap_or_default(),
        }
    }
}
/// Outcome of an interaction: the list of per-action results plus the final HTML
/// and final URL strings.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct InteractionResult {
pub action_results: Vec<ActionResult>,
pub final_html: String,
pub final_url: String,
}
impl InteractionResult {
    /// Builds an `InteractionResult` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Any entry that is missing or fails to decode is
    /// replaced by the field type's `Default` (empty list / empty string).
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let action_results = opts
            .get("action_results")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let final_html = opts
            .get("final_html")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let final_url = opts
            .get("final_url")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self {
            action_results,
            final_html,
            final_url,
        }
    }
}
/// Result of a single action: its index and type, a success flag, and optional
/// data / error strings.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ActionResult {
pub action_index: usize,
pub action_type: String,
pub success: bool,
pub data: Option<String>,
pub error: Option<String>,
}
impl ActionResult {
    /// Builds an `ActionResult` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing key or a term that fails to decode is
    /// not reported: `data` and `error` become `None`, the remaining fields take
    /// their `Default` value (so `success` defaults to `false`).
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            action_index: field!("action_index").unwrap_or_default(),
            action_type: field!("action_type").unwrap_or_default(),
            success: field!("success").unwrap_or_default(),
            data: field!("data"),
            error: field!("error"),
        }
    }
}
/// Result record for a scrape: status code, final URL, content type, HTML and body
/// size, page metadata, discovered links / images / feeds / JSON-LD entries, a set
/// of boolean indicators, and optional response metadata, markdown, extraction
/// output, downloaded document and browser extras.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ScrapeResult {
pub status_code: u16,
pub final_url: String,
pub content_type: String,
pub html: String,
pub body_size: usize,
pub metadata: PageMetadata,
pub links: Vec<LinkInfo>,
pub images: Vec<ImageInfo>,
pub feeds: Vec<FeedInfo>,
pub json_ld: Vec<JsonLdEntry>,
pub is_allowed: bool,
pub crawl_delay: Option<u64>,
pub noindex_detected: bool,
pub nofollow_detected: bool,
pub x_robots_tag: Option<String>,
pub is_pdf: bool,
pub was_skipped: bool,
pub detected_charset: Option<String>,
pub auth_header_sent: bool,
pub response_meta: Option<ResponseMeta>,
pub assets: Vec<DownloadedAsset>,
pub js_render_hint: bool,
pub browser_used: bool,
pub markdown: Option<MarkdownResult>,
// NOTE(review): carried as a plain string; its format (e.g. JSON) is not visible here.
pub extracted_data: Option<String>,
pub extraction_meta: Option<ExtractionMeta>,
pub downloaded_document: Option<DownloadedDocument>,
pub browser: Option<BrowserExtras>,
}
impl ScrapeResult {
    /// Builds a `ScrapeResult` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode is not reported: `Option` fields become `None`
    /// and all other fields take their `Default` value (zero, `false`, empty
    /// string / list, default `PageMetadata`).
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            status_code: field!("status_code").unwrap_or_default(),
            final_url: field!("final_url").unwrap_or_default(),
            content_type: field!("content_type").unwrap_or_default(),
            html: field!("html").unwrap_or_default(),
            body_size: field!("body_size").unwrap_or_default(),
            metadata: field!("metadata").unwrap_or_default(),
            links: field!("links").unwrap_or_default(),
            images: field!("images").unwrap_or_default(),
            feeds: field!("feeds").unwrap_or_default(),
            json_ld: field!("json_ld").unwrap_or_default(),
            is_allowed: field!("is_allowed").unwrap_or_default(),
            crawl_delay: field!("crawl_delay"),
            noindex_detected: field!("noindex_detected").unwrap_or_default(),
            nofollow_detected: field!("nofollow_detected").unwrap_or_default(),
            x_robots_tag: field!("x_robots_tag"),
            is_pdf: field!("is_pdf").unwrap_or_default(),
            was_skipped: field!("was_skipped").unwrap_or_default(),
            detected_charset: field!("detected_charset"),
            auth_header_sent: field!("auth_header_sent").unwrap_or_default(),
            response_meta: field!("response_meta"),
            assets: field!("assets").unwrap_or_default(),
            js_render_hint: field!("js_render_hint").unwrap_or_default(),
            browser_used: field!("browser_used").unwrap_or_default(),
            markdown: field!("markdown"),
            extracted_data: field!("extracted_data"),
            extraction_meta: field!("extraction_meta"),
            downloaded_document: field!("downloaded_document"),
            browser: field!("browser"),
        }
    }
}
/// Result record for one crawled page: URL and normalized URL, status code,
/// content type, HTML and body size, page metadata, discovered links / images /
/// feeds / JSON-LD entries, depth, boolean indicators, and optional charset,
/// markdown, extraction output and downloaded document.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CrawlPageResult {
pub url: String,
pub normalized_url: String,
pub status_code: u16,
pub content_type: String,
pub html: String,
pub body_size: usize,
pub metadata: PageMetadata,
pub links: Vec<LinkInfo>,
pub images: Vec<ImageInfo>,
pub feeds: Vec<FeedInfo>,
pub json_ld: Vec<JsonLdEntry>,
pub depth: usize,
pub stayed_on_domain: bool,
pub was_skipped: bool,
pub is_pdf: bool,
pub detected_charset: Option<String>,
pub markdown: Option<MarkdownResult>,
// NOTE(review): carried as a plain string; its format (e.g. JSON) is not visible here.
pub extracted_data: Option<String>,
pub extraction_meta: Option<ExtractionMeta>,
pub downloaded_document: Option<DownloadedDocument>,
pub browser_used: bool,
}
impl CrawlPageResult {
    /// Builds a `CrawlPageResult` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode is not reported: `Option` fields become `None`
    /// and all other fields take their `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            url: field!("url").unwrap_or_default(),
            normalized_url: field!("normalized_url").unwrap_or_default(),
            status_code: field!("status_code").unwrap_or_default(),
            content_type: field!("content_type").unwrap_or_default(),
            html: field!("html").unwrap_or_default(),
            body_size: field!("body_size").unwrap_or_default(),
            metadata: field!("metadata").unwrap_or_default(),
            links: field!("links").unwrap_or_default(),
            images: field!("images").unwrap_or_default(),
            feeds: field!("feeds").unwrap_or_default(),
            json_ld: field!("json_ld").unwrap_or_default(),
            depth: field!("depth").unwrap_or_default(),
            stayed_on_domain: field!("stayed_on_domain").unwrap_or_default(),
            was_skipped: field!("was_skipped").unwrap_or_default(),
            is_pdf: field!("is_pdf").unwrap_or_default(),
            detected_charset: field!("detected_charset"),
            markdown: field!("markdown"),
            extracted_data: field!("extracted_data"),
            extraction_meta: field!("extraction_meta"),
            downloaded_document: field!("downloaded_document"),
            browser_used: field!("browser_used").unwrap_or_default(),
        }
    }
}
/// Aggregate crawl result: the list of page results, final URL, redirect count,
/// an optional error string, cookies and three boolean indicators.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CrawlResult {
pub pages: Vec<CrawlPageResult>,
pub final_url: String,
pub redirect_count: usize,
pub was_skipped: bool,
pub error: Option<String>,
pub cookies: Vec<CookieInfo>,
pub stayed_on_domain: bool,
pub browser_used: bool,
}
impl CrawlResult {
    /// Builds a `CrawlResult` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. A missing key or a
    /// term that fails to decode is not reported: `error` becomes `None` and all
    /// other fields take their `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            pages: field!("pages").unwrap_or_default(),
            final_url: field!("final_url").unwrap_or_default(),
            redirect_count: field!("redirect_count").unwrap_or_default(),
            was_skipped: field!("was_skipped").unwrap_or_default(),
            error: field!("error"),
            cookies: field!("cookies").unwrap_or_default(),
            stayed_on_domain: field!("stayed_on_domain").unwrap_or_default(),
            browser_used: field!("browser_used").unwrap_or_default(),
        }
    }
}
/// One URL entry with optional `lastmod`, `changefreq` and `priority` values, all
/// kept as strings.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct SitemapUrl {
pub url: String,
pub lastmod: Option<String>,
pub changefreq: Option<String>,
pub priority: Option<String>,
}
impl SitemapUrl {
    /// Builds a `SitemapUrl` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing or undecodable `url` becomes the empty
    /// string; the three optional strings become `None` in that case.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Every field of this struct is string-typed, so one typed reader covers them all.
        let text = |key: &str| -> Option<String> {
            opts.get(key).and_then(|term| term.decode().ok())
        };

        Self {
            url: text("url").unwrap_or_default(),
            lastmod: text("lastmod"),
            changefreq: text("changefreq"),
            priority: text("priority"),
        }
    }
}
/// A list of `SitemapUrl` entries.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct MapResult {
pub urls: Vec<SitemapUrl>,
}
impl MapResult {
    /// Builds a `MapResult` from a string-keyed map of terms.
    ///
    /// Reads the `"urls"` key; when it is missing or fails to decode the list is
    /// left empty.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let urls = opts
            .get("urls")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self { urls }
    }
}
/// Markdown output: the content string, an optional document-structure string,
/// table and warning lists, a `citations` flag and an optional `fit_content` string.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct MarkdownResult {
pub content: String,
// NOTE(review): serialized form of this string is not visible here.
pub document_structure: Option<String>,
pub tables: Vec<String>,
pub warnings: Vec<String>,
pub citations: bool,
pub fit_content: Option<String>,
}
impl MarkdownResult {
    /// Builds a `MarkdownResult` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing key or a term that fails to decode is
    /// not reported: `document_structure` and `fit_content` become `None`, the
    /// other fields take their `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            content: field!("content").unwrap_or_default(),
            document_structure: field!("document_structure"),
            tables: field!("tables").unwrap_or_default(),
            warnings: field!("warnings").unwrap_or_default(),
            citations: field!("citations").unwrap_or_default(),
            fit_content: field!("fit_content"),
        }
    }
}
/// One link: URL, text, a `LinkType`, an optional `rel` string and a `nofollow` flag.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct LinkInfo {
pub url: String,
pub text: String,
pub link_type: LinkType,
pub rel: Option<String>,
pub nofollow: bool,
}
impl LinkInfo {
    /// Builds a `LinkInfo` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing key or a term that fails to decode is
    /// not reported: `rel` becomes `None`, the other fields take their `Default`.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            url: field!("url").unwrap_or_default(),
            text: field!("text").unwrap_or_default(),
            link_type: field!("link_type").unwrap_or_default(),
            rel: field!("rel"),
            nofollow: field!("nofollow").unwrap_or_default(),
        }
    }
}
/// One image: URL, optional alt text, optional width / height and an `ImageSource`.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ImageInfo {
pub url: String,
pub alt: Option<String>,
// NOTE(review): presumably pixel dimensions — confirm upstream.
pub width: Option<u32>,
pub height: Option<u32>,
pub source: ImageSource,
}
impl ImageInfo {
    /// Builds an `ImageInfo` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing key or a term that fails to decode is
    /// not reported: `alt`, `width` and `height` become `None`, while `url` and
    /// `source` take their `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            url: field!("url").unwrap_or_default(),
            alt: field!("alt"),
            width: field!("width"),
            height: field!("height"),
            source: field!("source").unwrap_or_default(),
        }
    }
}
/// One feed: URL, optional title and a `FeedType`.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct FeedInfo {
pub url: String,
pub title: Option<String>,
pub feed_type: FeedType,
}
impl FeedInfo {
    /// Builds a `FeedInfo` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Missing or undecodable entries are tolerated:
    /// `title` becomes `None`, `url` and `feed_type` take their `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let url = opts
            .get("url")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let title = opts.get("title").and_then(|term| term.decode().ok());
        let feed_type = opts
            .get("feed_type")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self {
            url,
            title,
            feed_type,
        }
    }
}
/// One JSON-LD entry: its schema type, an optional name and the raw string.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct JsonLdEntry {
pub schema_type: String,
pub name: Option<String>,
// NOTE(review): presumably the unparsed JSON-LD text — confirm upstream.
pub raw: String,
}
impl JsonLdEntry {
    /// Builds a `JsonLdEntry` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Missing or undecodable entries are tolerated:
    /// `name` becomes `None`, `schema_type` and `raw` become empty strings.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let schema_type = opts
            .get("schema_type")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let name = opts.get("name").and_then(|term| term.decode().ok());
        let raw = opts
            .get("raw")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self {
            schema_type,
            name,
            raw,
        }
    }
}
/// One cookie: name and value, with optional domain and path.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CookieInfo {
pub name: String,
pub value: String,
pub domain: Option<String>,
pub path: Option<String>,
}
impl CookieInfo {
    /// Builds a `CookieInfo` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Missing or undecodable entries are tolerated:
    /// `name` and `value` become empty strings, `domain` and `path` become `None`.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Every field of this struct is string-typed, so one typed reader covers them all.
        let text = |key: &str| -> Option<String> {
            opts.get(key).and_then(|term| term.decode().ok())
        };

        Self {
            name: text("name").unwrap_or_default(),
            value: text("value").unwrap_or_default(),
            domain: text("domain"),
            path: text("path"),
        }
    }
}
/// Description of a downloaded asset: URL, content hash, optional MIME type, size,
/// an `AssetCategory` and an optional HTML tag name.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct DownloadedAsset {
pub url: String,
// NOTE(review): hash algorithm/encoding is not visible in this file.
pub content_hash: String,
pub mime_type: Option<String>,
// NOTE(review): presumably a byte count — confirm upstream.
pub size: usize,
pub asset_category: AssetCategory,
pub html_tag: Option<String>,
}
impl DownloadedAsset {
    /// Builds a `DownloadedAsset` from a string-keyed map of terms.
    ///
    /// Keys are the field names. A missing key or a term that fails to decode is
    /// not reported: `mime_type` and `html_tag` become `None`, the other fields
    /// take their `Default` value.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            url: field!("url").unwrap_or_default(),
            content_hash: field!("content_hash").unwrap_or_default(),
            mime_type: field!("mime_type"),
            size: field!("size").unwrap_or_default(),
            asset_category: field!("asset_category").unwrap_or_default(),
            html_tag: field!("html_tag"),
        }
    }
}
/// Article metadata: optional published / modified time, author and section
/// strings, plus a list of tags.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ArticleMetadata {
// Timestamps are kept as strings; their format is not visible in this file.
pub published_time: Option<String>,
pub modified_time: Option<String>,
pub author: Option<String>,
pub section: Option<String>,
pub tags: Vec<String>,
}
impl ArticleMetadata {
    /// Builds an `ArticleMetadata` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Missing or undecodable entries are tolerated: the
    /// four optional strings become `None` and `tags` becomes an empty list.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Typed reader for the optional string fields.
        let text = |key: &str| -> Option<String> {
            opts.get(key).and_then(|term| term.decode().ok())
        };

        Self {
            published_time: text("published_time"),
            modified_time: text("modified_time"),
            author: text("author"),
            section: text("section"),
            // `tags` is a list, so it is decoded directly instead of through `text`.
            tags: opts
                .get("tags")
                .and_then(|term| term.decode().ok())
                .unwrap_or_default(),
        }
    }
}
/// A language / URL pair.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct HreflangEntry {
pub lang: String,
pub url: String,
}
impl HreflangEntry {
    /// Builds a `HreflangEntry` from a string-keyed map of terms.
    ///
    /// Reads the `"lang"` and `"url"` keys; either one becomes the empty string
    /// when it is missing or fails to decode.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let lang = opts
            .get("lang")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let url = opts
            .get("url")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self { lang, url }
    }
}
/// One favicon: URL and `rel` strings, with optional sizes and MIME type.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct FaviconInfo {
pub url: String,
pub rel: String,
pub sizes: Option<String>,
pub mime_type: Option<String>,
}
impl FaviconInfo {
    /// Builds a `FaviconInfo` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Missing or undecodable entries are tolerated:
    /// `url` and `rel` become empty strings, `sizes` and `mime_type` become `None`.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Every field of this struct is string-typed, so one typed reader covers them all.
        let text = |key: &str| -> Option<String> {
            opts.get(key).and_then(|term| term.decode().ok())
        };

        Self {
            url: text("url").unwrap_or_default(),
            rel: text("rel").unwrap_or_default(),
            sizes: text("sizes"),
            mime_type: text("mime_type"),
        }
    }
}
/// One heading: a numeric level and its text.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct HeadingInfo {
// NOTE(review): presumably 1-6 for h1-h6; the range is not enforced in this file.
pub level: u8,
pub text: String,
}
impl HeadingInfo {
    /// Builds a `HeadingInfo` from a string-keyed map of terms.
    ///
    /// Reads the `"level"` and `"text"` keys; a missing or undecodable entry
    /// becomes `0` / the empty string respectively.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let level = opts
            .get("level")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let text = opts
            .get("text")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();

        Self { level, text }
    }
}
/// Optional response metadata strings: etag, last-modified, cache-control, server,
/// x-powered-by, content-language and content-encoding.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
/// NOTE(review): the field names mirror HTTP response headers — confirm the source
/// of the values upstream.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct ResponseMeta {
pub etag: Option<String>,
pub last_modified: Option<String>,
pub cache_control: Option<String>,
pub server: Option<String>,
pub x_powered_by: Option<String>,
pub content_language: Option<String>,
pub content_encoding: Option<String>,
}
impl ResponseMeta {
    /// Builds a `ResponseMeta` from a string-keyed map of terms.
    ///
    /// Keys are the field names. Every field is an optional string and becomes
    /// `None` when its key is missing or its term fails to decode.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // All seven fields share the type `Option<String>`, so one typed reader suffices.
        let text = |key: &str| -> Option<String> {
            opts.get(key).and_then(|term| term.decode().ok())
        };

        Self {
            etag: text("etag"),
            last_modified: text("last_modified"),
            cache_control: text("cache_control"),
            server: text("server"),
            x_powered_by: text("x_powered_by"),
            content_language: text("content_language"),
            content_encoding: text("content_encoding"),
        }
    }
}
/// Page-level metadata in which every field is optional: general document fields,
/// `og_*`, `twitter_*` and `dc_*` groups, nested article metadata, hreflang /
/// favicon / heading lists and a word count.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct PageMetadata {
// General document fields.
pub title: Option<String>,
pub description: Option<String>,
pub canonical_url: Option<String>,
pub keywords: Option<String>,
pub author: Option<String>,
pub viewport: Option<String>,
pub theme_color: Option<String>,
pub generator: Option<String>,
pub robots: Option<String>,
pub html_lang: Option<String>,
pub html_dir: Option<String>,
// `og_*` group (presumably Open Graph — confirm upstream).
pub og_title: Option<String>,
pub og_type: Option<String>,
pub og_image: Option<String>,
pub og_description: Option<String>,
pub og_url: Option<String>,
pub og_site_name: Option<String>,
pub og_locale: Option<String>,
pub og_video: Option<String>,
pub og_audio: Option<String>,
pub og_locale_alternates: Option<Vec<String>>,
// `twitter_*` group.
pub twitter_card: Option<String>,
pub twitter_title: Option<String>,
pub twitter_description: Option<String>,
pub twitter_image: Option<String>,
pub twitter_site: Option<String>,
pub twitter_creator: Option<String>,
// `dc_*` group (presumably Dublin Core — confirm upstream).
pub dc_title: Option<String>,
pub dc_creator: Option<String>,
pub dc_subject: Option<String>,
pub dc_description: Option<String>,
pub dc_publisher: Option<String>,
pub dc_date: Option<String>,
pub dc_type: Option<String>,
pub dc_format: Option<String>,
pub dc_identifier: Option<String>,
pub dc_language: Option<String>,
pub dc_rights: Option<String>,
// Structured values.
pub article: Option<ArticleMetadata>,
pub hreflangs: Option<Vec<HreflangEntry>>,
pub favicons: Option<Vec<FaviconInfo>>,
pub headings: Option<Vec<HeadingInfo>>,
pub word_count: Option<usize>,
}
impl PageMetadata {
    /// Builds a `PageMetadata` from a string-keyed map of terms.
    ///
    /// Each field is read from the key carrying its own name. Every field is
    /// optional and becomes `None` when its key is missing or its term fails to
    /// decode; no error is reported.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Expands to the lookup-then-decode chain for one key; a missing key and a
        // failed decode both collapse to `None`.
        macro_rules! field {
            ($key:literal) => {
                opts.get($key).and_then(|term| term.decode().ok())
            };
        }

        Self {
            title: field!("title"),
            description: field!("description"),
            canonical_url: field!("canonical_url"),
            keywords: field!("keywords"),
            author: field!("author"),
            viewport: field!("viewport"),
            theme_color: field!("theme_color"),
            generator: field!("generator"),
            robots: field!("robots"),
            html_lang: field!("html_lang"),
            html_dir: field!("html_dir"),
            og_title: field!("og_title"),
            og_type: field!("og_type"),
            og_image: field!("og_image"),
            og_description: field!("og_description"),
            og_url: field!("og_url"),
            og_site_name: field!("og_site_name"),
            og_locale: field!("og_locale"),
            og_video: field!("og_video"),
            og_audio: field!("og_audio"),
            og_locale_alternates: field!("og_locale_alternates"),
            twitter_card: field!("twitter_card"),
            twitter_title: field!("twitter_title"),
            twitter_description: field!("twitter_description"),
            twitter_image: field!("twitter_image"),
            twitter_site: field!("twitter_site"),
            twitter_creator: field!("twitter_creator"),
            dc_title: field!("dc_title"),
            dc_creator: field!("dc_creator"),
            dc_subject: field!("dc_subject"),
            dc_description: field!("dc_description"),
            dc_publisher: field!("dc_publisher"),
            dc_date: field!("dc_date"),
            dc_type: field!("dc_type"),
            dc_format: field!("dc_format"),
            dc_identifier: field!("dc_identifier"),
            dc_language: field!("dc_language"),
            dc_rights: field!("dc_rights"),
            article: field!("article"),
            hreflangs: field!("hreflangs"),
            favicons: field!("favicons"),
            headings: field!("headings"),
            word_count: field!("word_count"),
        }
    }
}
/// Request record carrying a single URL.
///
/// Crosses the NIF boundary as an Elixir map through the derived `rustler::NifMap`.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CrawlStreamRequest {
pub url: String,
}
impl CrawlStreamRequest {
pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
Self {
url: opts.get("url").and_then(|t| t.decode().ok()).unwrap_or_default(),
}
}
}
/// Binding-side request struct carrying the `urls` list for a batch crawl stream.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BatchCrawlStreamRequest {
    /// Values decoded from the `urls` option.
    pub urls: Vec<String>,
}

impl BatchCrawlStreamRequest {
    /// Build a request from a loosely-typed option map.
    ///
    /// A missing `urls` key, or a term that does not decode as a list of
    /// strings, yields an empty list.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let urls: Vec<String> = opts
            .get("urls")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        Self { urls }
    }
}
/// Text content paired with its list of citation references.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CitationResult {
    /// Value decoded from the `content` option.
    pub content: String,
    /// Values decoded from the `references` option.
    pub references: Vec<CitationReference>,
}

impl CitationResult {
    /// Build a result from a loosely-typed option map.
    ///
    /// Each field falls back to its `Default` (empty string / empty list)
    /// when the key is absent or the term fails to decode.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let content: String = opts
            .get("content")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let references: Vec<CitationReference> = opts
            .get("references")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        Self { content, references }
    }
}
/// One entry of a citation reference list: an index, a URL and its text.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct CitationReference {
    /// Value decoded from the `index` option.
    pub index: usize,
    /// Value decoded from the `url` option.
    pub url: String,
    /// Value decoded from the `text` option.
    pub text: String,
}

impl CitationReference {
    /// Build a reference from a loosely-typed option map.
    ///
    /// Absent or undecodable entries fall back to `0` / empty strings.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // Both string fields share the same lookup-and-decode shape.
        let text_of = |key: &str| -> String {
            opts.get(key)
                .and_then(|term| term.decode().ok())
                .unwrap_or_default()
        };
        let index: usize = opts
            .get("index")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        Self {
            index,
            url: text_of("url"),
            text: text_of("text"),
        }
    }
}
/// Opaque resource wrapping a shared `crawlberg::CrawlEngineHandle`.
///
/// Cloning only clones the inner `Arc`, so all clones refer to the same engine.
#[derive(Clone)]
pub struct CrawlEngineHandle {
// Shared reference to the core engine; private so it stays opaque to callers.
inner: Arc<crawlberg::CrawlEngineHandle>,
}
// SAFETY: See gen_opaque_resource in alef-backend-rustler for rationale.
impl std::panic::RefUnwindSafe for CrawlEngineHandle {}
// Marks the handle as a rustler resource type so it can be held in a `ResourceArc`.
// NOTE(review): unlike the stream handles in this file, this impl carries no
// `#[rustler::resource_impl()]` attribute — presumably registered elsewhere; confirm.
impl rustler::Resource for CrawlEngineHandle {}
/// Per-URL entry of a batch scrape: the URL plus an optional result and an optional error.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BatchScrapeResult {
    /// Value decoded from the `url` option.
    pub url: String,
    /// Value decoded from the `result` option, if present and decodable.
    pub result: Option<ScrapeResult>,
    /// Value decoded from the `error` option, if present and decodable.
    pub error: Option<String>,
}

impl BatchScrapeResult {
    /// Build an entry from a loosely-typed option map.
    ///
    /// `url` falls back to an empty string; `result` and `error` become
    /// `None` when their key is absent or the term fails to decode.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let url: String = opts
            .get("url")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let result: Option<ScrapeResult> = opts.get("result").and_then(|term| term.decode().ok());
        let error: Option<String> = opts.get("error").and_then(|term| term.decode().ok());
        Self { url, result, error }
    }
}
/// Per-URL entry of a batch crawl: the URL plus an optional result and an optional error.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BatchCrawlResult {
    /// Value decoded from the `url` option.
    pub url: String,
    /// Value decoded from the `result` option, if present and decodable.
    pub result: Option<CrawlResult>,
    /// Value decoded from the `error` option, if present and decodable.
    pub error: Option<String>,
}

impl BatchCrawlResult {
    /// Build an entry from a loosely-typed option map.
    ///
    /// `url` falls back to an empty string; `result` and `error` become
    /// `None` when their key is absent or the term fails to decode.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let url: String = opts
            .get("url")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let result: Option<CrawlResult> = opts.get("result").and_then(|term| term.decode().ok());
        let error: Option<String> = opts.get("error").and_then(|term| term.decode().ok());
        Self { url, result, error }
    }
}
/// Batch scrape entries together with total / completed / failed counters.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BatchScrapeResults {
    /// Values decoded from the `results` option.
    pub results: Vec<BatchScrapeResult>,
    /// Value decoded from the `total_count` option.
    pub total_count: usize,
    /// Value decoded from the `completed_count` option.
    pub completed_count: usize,
    /// Value decoded from the `failed_count` option.
    pub failed_count: usize,
}

impl BatchScrapeResults {
    /// Build the aggregate from a loosely-typed option map.
    ///
    /// Absent or undecodable entries fall back to an empty list / zero.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        // The three counters share one lookup shape; missing values count as zero.
        let count_of = |key: &str| -> usize {
            opts.get(key)
                .and_then(|term| term.decode().ok())
                .unwrap_or_default()
        };
        let results: Vec<BatchScrapeResult> = opts
            .get("results")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        Self {
            results,
            total_count: count_of("total_count"),
            completed_count: count_of("completed_count"),
            failed_count: count_of("failed_count"),
        }
    }
}
/// Batch crawl entries together with total / completed / failed counters.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct BatchCrawlResults {
    /// Values decoded from the `results` option.
    pub results: Vec<BatchCrawlResult>,
    /// Value decoded from the `total_count` option.
    pub total_count: usize,
    /// Value decoded from the `completed_count` option.
    pub completed_count: usize,
    /// Value decoded from the `failed_count` option.
    pub failed_count: usize,
}

impl BatchCrawlResults {
    /// Build the aggregate from a loosely-typed option map.
    ///
    /// Absent or undecodable entries fall back to an empty list / zero.
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let results: Vec<BatchCrawlResult> = opts
            .get("results")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let total_count: usize = opts
            .get("total_count")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let completed_count: usize = opts
            .get("completed_count")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        let failed_count: usize = opts
            .get("failed_count")
            .and_then(|term| term.decode().ok())
            .unwrap_or_default();
        Self {
            results,
            total_count,
            completed_count,
            failed_count,
        }
    }
}
/// SSRF policy settings: whether private addresses are denied and the redirect cap.
///
/// `Default` is implemented by hand (not derived) so that `SsrfPolicy::default()`
/// agrees with the fallbacks used by [`SsrfPolicy::new`]. A derived `Default`
/// would produce `deny_private = false` and `max_redirects = 0`, i.e. the
/// permissive setting, for any caller relying on `unwrap_or_default()`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, rustler::NifMap)]
pub struct SsrfPolicy {
    /// Value decoded from the `deny_private` option; `true` when unspecified.
    pub deny_private: bool,
    /// Value decoded from the `max_redirects` option; `5` when unspecified.
    pub max_redirects: u8,
}

impl Default for SsrfPolicy {
    /// Returns `deny_private = true`, `max_redirects = 5` — the same values
    /// `new` uses when an option is missing.
    fn default() -> Self {
        Self {
            deny_private: true,
            max_redirects: 5,
        }
    }
}

impl SsrfPolicy {
    /// Build a policy from a loosely-typed option map.
    ///
    /// Any key that is absent, or whose term fails to decode, takes the
    /// corresponding value from [`SsrfPolicy::default`].
    pub fn new(opts: std::collections::HashMap<String, rustler::Term>) -> Self {
        let fallback = Self::default();
        Self {
            deny_private: opts
                .get("deny_private")
                .and_then(|t| t.decode().ok())
                .unwrap_or(fallback.deny_private),
            max_redirects: opts
                .get("max_redirects")
                .and_then(|t| t.decode().ok())
                .unwrap_or(fallback.max_redirects),
        }
    }
}
/// Browser mode selector; the default is [`BrowserMode::Auto`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum BrowserMode {
    Auto,
    Always,
    Never,
    Stealth,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for BrowserMode {
    /// Returns `BrowserMode::Auto`.
    fn default() -> Self {
        BrowserMode::Auto
    }
}
/// Browser wait strategy selector; the default is [`BrowserWait::NetworkIdle`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum BrowserWait {
    NetworkIdle,
    Selector,
    Fixed,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for BrowserWait {
    /// Returns `BrowserWait::NetworkIdle`.
    fn default() -> Self {
        BrowserWait::NetworkIdle
    }
}
/// Browser backend selector; the default is [`BrowserBackend::Chromiumoxide`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum BrowserBackend {
    Chromiumoxide,
    Native,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for BrowserBackend {
    /// Returns `BrowserBackend::Chromiumoxide`.
    fn default() -> Self {
        BrowserBackend::Chromiumoxide
    }
}
/// Authentication settings, serialized by serde with an internal `type` tag.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, rustler::NifTaggedEnum)]
#[serde(tag = "type")]
pub enum AuthConfig {
    Basic { username: String, password: String },
    Bearer { token: String },
    Header { name: String, value: String },
}

#[allow(clippy::derivable_impls)]
impl Default for AuthConfig {
    /// Returns `AuthConfig::Basic` with empty username and password.
    fn default() -> Self {
        AuthConfig::Basic {
            username: String::new(),
            password: String::new(),
        }
    }
}
/// Link classification; the default is [`LinkType::Internal`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum LinkType {
    Internal,
    External,
    Anchor,
    Document,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for LinkType {
    /// Returns `LinkType::Internal`.
    fn default() -> Self {
        LinkType::Internal
    }
}
/// Image origin classification; the default is [`ImageSource::Img`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum ImageSource {
    Img,
    PictureSource,
    OgImage,
    TwitterImage,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for ImageSource {
    /// Returns `ImageSource::Img`.
    fn default() -> Self {
        ImageSource::Img
    }
}
/// Feed format classification; the default is [`FeedType::Rss`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum FeedType {
    Rss,
    Atom,
    JsonFeed,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for FeedType {
    /// Returns `FeedType::Rss`.
    fn default() -> Self {
        FeedType::Rss
    }
}
/// Asset classification.
///
/// Note that the default is [`AssetCategory::Image`], which is not the
/// first-declared variant.
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum AssetCategory {
    Document,
    Image,
    Audio,
    Video,
    Font,
    Stylesheet,
    Script,
    Archive,
    Data,
    Other,
}

#[allow(clippy::derivable_impls)]
impl Default for AssetCategory {
    /// Returns `AssetCategory::Image`.
    fn default() -> Self {
        AssetCategory::Image
    }
}
/// Event emitted by a crawl stream, serialized by serde with an internal
/// `type` tag whose values are the snake_case variant names.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, rustler::NifTaggedEnum)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CrawlEvent {
    Page { result: CrawlPageResult },
    Error { url: String, error: String },
    Complete { pages_crawled: usize },
}

#[allow(clippy::derivable_impls)]
impl Default for CrawlEvent {
    /// Returns `CrawlEvent::Page` holding a default `CrawlPageResult`.
    fn default() -> Self {
        let result: CrawlPageResult = Default::default();
        CrawlEvent::Page { result }
    }
}
/// A single browser action, serialized by serde with an internal `type` tag
/// whose values are the camelCase variant names.
///
/// Fields marked `#[serde(default)]` may be omitted from the JSON input and
/// then deserialize as `None`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, rustler::NifTaggedEnum)]
#[serde(tag = "type", rename_all = "camelCase")]
pub enum PageAction {
    Click {
        selector: String,
    },
    TypeText {
        selector: String,
        text: String,
    },
    Press {
        key: String,
    },
    Scroll {
        direction: ScrollDirection,
        #[serde(default)]
        selector: Option<String>,
        #[serde(default)]
        amount: Option<i64>,
    },
    Wait {
        #[serde(default)]
        milliseconds: Option<i64>,
        #[serde(default)]
        selector: Option<String>,
    },
    Screenshot {
        #[serde(default)]
        full_page: Option<bool>,
    },
    ExecuteJs {
        script: String,
    },
    Scrape,
}

// Hand-written rather than derived, hence the lint allowance.
#[allow(clippy::derivable_impls)]
impl Default for PageAction {
    /// Returns `PageAction::Scrape`, the only field-less variant.
    fn default() -> Self {
        PageAction::Scrape
    }
}
/// Scroll direction used by `PageAction::Scroll`; the default is [`ScrollDirection::Down`].
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize, rustler::NifUnitEnum)]
pub enum ScrollDirection {
    Up,
    Down,
}

#[allow(clippy::derivable_impls)]
impl Default for ScrollDirection {
    /// Returns `ScrollDirection::Down`.
    fn default() -> Self {
        ScrollDirection::Down
    }
}
/// Streaming handle for `CrawlEngineHandle::crawl_stream` — owns a Tokio runtime
/// plus the live `BoxStream`. Each call to `crawlenginehandle_crawl_stream_next` blocks the dirty-CPU
/// scheduler thread on a single `stream.next()` poll.
pub struct CrawlEngineHandleCrawlStreamHandle {
runtime: std::sync::Arc<tokio::runtime::Runtime>,
stream: std::sync::Mutex<
Option<
futures_util::stream::BoxStream<'static, std::result::Result<crawlberg::CrawlEvent, crawlberg::CrawlError>>,
>,
>,
}
#[rustler::resource_impl()]
impl rustler::Resource for CrawlEngineHandleCrawlStreamHandle {}
/// Open a streaming `crawl_stream` request. Returns an opaque iterator
/// resource which the Elixir wrapper drives via `Stream.unfold/2`.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn crawlenginehandle_crawl_stream_start(
resource: rustler::ResourceArc<CrawlEngineHandle>,
req: Option<String>,
) -> std::result::Result<rustler::ResourceArc<CrawlEngineHandleCrawlStreamHandle>, String> {
let core_req: crawlberg::CrawlStreamRequest = req
.map(|s| serde_json::from_str::<crawlberg::CrawlStreamRequest>(&s))
.transpose()
.map_err(|e| e.to_string())?
.unwrap_or_default();
let runtime = std::sync::Arc::new(
tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.map_err(|e| e.to_string())?,
);
let inner = resource.inner.clone();
let stream = runtime
.block_on(async move { inner.crawl_stream(core_req).await })
.map_err(|e| e.to_string())?;
let handle = CrawlEngineHandleCrawlStreamHandle {
runtime,
stream: std::sync::Mutex::new(Some(stream)),
};
Ok(rustler::ResourceArc::new(handle))
}
/// Pull the next chunk from a streaming handle. Returns the chunk JSON
/// (decoded by the Elixir wrapper via `Jason.decode!/1`) or `nil` to
/// signal end-of-stream. After end-of-stream the inner stream is dropped.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn crawlenginehandle_crawl_stream_next(
handle: rustler::ResourceArc<CrawlEngineHandleCrawlStreamHandle>,
) -> std::result::Result<Option<String>, String> {
use futures_util::StreamExt;
let runtime = handle.runtime.clone();
let mut guard = handle.stream.lock().map_err(|e| e.to_string())?;
let stream_ref = match guard.as_mut() {
Some(s) => s,
None => return Ok(None),
};
match runtime.block_on(stream_ref.next()) {
Some(Ok(chunk)) => {
let json = serde_json::to_string(&chunk).map_err(|e| e.to_string())?;
Ok(Some(json))
}
Some(Err(e)) => {
*guard = None;
Err(e.to_string())
}
None => {
*guard = None;
Ok(None)
}
}
}
/// Streaming handle for `CrawlEngineHandle::batch_crawl_stream` — owns a Tokio runtime
/// plus the live `BoxStream`. Each call to `crawlenginehandle_batch_crawl_stream_next` blocks the dirty-CPU
/// scheduler thread on a single `stream.next()` poll.
pub struct CrawlEngineHandleBatchCrawlStreamHandle {
runtime: std::sync::Arc<tokio::runtime::Runtime>,
stream: std::sync::Mutex<
Option<
futures_util::stream::BoxStream<'static, std::result::Result<crawlberg::CrawlEvent, crawlberg::CrawlError>>,
>,
>,
}
#[rustler::resource_impl()]
impl rustler::Resource for CrawlEngineHandleBatchCrawlStreamHandle {}
/// Open a streaming `batch_crawl_stream` request. Returns an opaque iterator
/// resource which the Elixir wrapper drives via `Stream.unfold/2`.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn crawlenginehandle_batch_crawl_stream_start(
resource: rustler::ResourceArc<CrawlEngineHandle>,
req: Option<String>,
) -> std::result::Result<rustler::ResourceArc<CrawlEngineHandleBatchCrawlStreamHandle>, String> {
let core_req: crawlberg::BatchCrawlStreamRequest = req
.map(|s| serde_json::from_str::<crawlberg::BatchCrawlStreamRequest>(&s))
.transpose()
.map_err(|e| e.to_string())?
.unwrap_or_default();
let runtime = std::sync::Arc::new(
tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.map_err(|e| e.to_string())?,
);
let inner = resource.inner.clone();
let stream = runtime
.block_on(async move { inner.batch_crawl_stream(core_req).await })
.map_err(|e| e.to_string())?;
let handle = CrawlEngineHandleBatchCrawlStreamHandle {
runtime,
stream: std::sync::Mutex::new(Some(stream)),
};
Ok(rustler::ResourceArc::new(handle))
}
/// Pull the next chunk from a streaming handle. Returns the chunk JSON
/// (decoded by the Elixir wrapper via `Jason.decode!/1`) or `nil` to
/// signal end-of-stream. After end-of-stream the inner stream is dropped.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn crawlenginehandle_batch_crawl_stream_next(
handle: rustler::ResourceArc<CrawlEngineHandleBatchCrawlStreamHandle>,
) -> std::result::Result<Option<String>, String> {
use futures_util::StreamExt;
let runtime = handle.runtime.clone();
let mut guard = handle.stream.lock().map_err(|e| e.to_string())?;
let stream_ref = match guard.as_mut() {
Some(s) => s,
None => return Ok(None),
};
match runtime.block_on(stream_ref.next()) {
Some(Ok(chunk)) => {
let json = serde_json::to_string(&chunk).map_err(|e| e.to_string())?;
Ok(Some(json))
}
Some(Err(e)) => {
*guard = None;
Err(e.to_string())
}
None => {
*guard = None;
Ok(None)
}
}
}
/// Convert markdown links to numbered citations.
///
/// `[Example](https://example.com)` becomes `Example[1]`
/// with `[1]: https://example.com` in the reference list.
/// Images (`![alt](url)`) are preserved unchanged.
#[rustler::nif]
pub fn generate_citations(markdown: String) -> CitationResult {
    let cited = crawlberg::generate_citations(&markdown);
    cited.into()
}
/// Create a new crawl engine with the given configuration.
///
/// `config` is an optional JSON-encoded `crawlberg::CrawlConfig`. When it is
/// `None`, `None` is forwarded to `crawlberg::create_engine`
/// (documented upstream as using `CrawlConfig::default()`).
///
/// # Errors
///
/// Returns the stringified error if the JSON cannot be parsed or the engine
/// rejects the configuration.
#[rustler::nif]
pub fn create_engine(config: Option<String>) -> Result<ResourceArc<CrawlEngineHandle>, String> {
    let config_core: Option<crawlberg::CrawlConfig> = match config {
        Some(raw) => {
            let parsed = serde_json::from_str::<crawlberg::CrawlConfig>(&raw)
                .map_err(|e| e.to_string())?;
            Some(parsed)
        }
        None => None,
    };
    let engine = crawlberg::create_engine(config_core).map_err(|e| e.to_string())?;
    let handle = CrawlEngineHandle {
        inner: Arc::new(engine),
    };
    Ok(ResourceArc::new(handle))
}
/// Scrape a single URL, returning extracted page data.
///
/// The work runs on a dedicated thread with a 32 MiB stack that drives a
/// fresh Tokio runtime; the calling dirty-CPU scheduler thread blocks until
/// that thread finishes.
///
/// # Errors
///
/// Returns the stringified error when the thread or runtime cannot be
/// created, when the scrape fails, or when the worker panics.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn scrape_async(engine: rustler::ResourceArc<CrawlEngineHandle>, url: String) -> Result<ScrapeResult, String> {
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    let worker = move || -> Result<ScrapeResult, String> {
        let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?;
        let scraped = rt
            .block_on(async { crawlberg::scrape(&engine.inner, &url).await })
            .map_err(|e| e.to_string())?;
        Ok(scraped.into())
    };

    let guarded = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        move || -> Result<Result<ScrapeResult, String>, String> {
            let join_handle = std::thread::Builder::new()
                .stack_size(WORKER_STACK_BYTES)
                .spawn(worker)
                .map_err(|e| e.to_string())?;
            join_handle.join().map_err(|_| "thread panicked".to_string())
        },
    ));

    match guarded {
        Ok(Ok(outcome)) => outcome,
        Ok(Err(message)) => Err(message),
        Err(_) => Err("thread panic during async operation".to_string()),
    }
}
/// Crawl a website starting from `url`, following links up to the configured depth.
///
/// The work runs on a dedicated thread with a 32 MiB stack that drives a
/// fresh Tokio runtime; the calling dirty-CPU scheduler thread blocks until
/// that thread finishes.
///
/// # Errors
///
/// Returns the stringified error when the thread or runtime cannot be
/// created, when the crawl fails, or when the worker panics.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn crawl_async(engine: rustler::ResourceArc<CrawlEngineHandle>, url: String) -> Result<CrawlResult, String> {
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    let worker = move || -> Result<CrawlResult, String> {
        let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?;
        let crawled = rt
            .block_on(async { crawlberg::crawl(&engine.inner, &url).await })
            .map_err(|e| e.to_string())?;
        Ok(crawled.into())
    };

    let guarded = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        move || -> Result<Result<CrawlResult, String>, String> {
            let join_handle = std::thread::Builder::new()
                .stack_size(WORKER_STACK_BYTES)
                .spawn(worker)
                .map_err(|e| e.to_string())?;
            join_handle.join().map_err(|_| "thread panicked".to_string())
        },
    ));

    match guarded {
        Ok(Ok(outcome)) => outcome,
        Ok(Err(message)) => Err(message),
        Err(_) => Err("thread panic during async operation".to_string()),
    }
}
/// Discover all pages on a website by following links and sitemaps.
///
/// The work runs on a dedicated thread with a 32 MiB stack that drives a
/// fresh Tokio runtime; the calling dirty-CPU scheduler thread blocks until
/// that thread finishes.
///
/// # Errors
///
/// Returns the stringified error when the thread or runtime cannot be
/// created, when the mapping fails, or when the worker panics.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn map_urls_async(engine: rustler::ResourceArc<CrawlEngineHandle>, url: String) -> Result<MapResult, String> {
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    let worker = move || -> Result<MapResult, String> {
        let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?;
        let mapped = rt
            .block_on(async { crawlberg::map_urls(&engine.inner, &url).await })
            .map_err(|e| e.to_string())?;
        Ok(mapped.into())
    };

    let guarded = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        move || -> Result<Result<MapResult, String>, String> {
            let join_handle = std::thread::Builder::new()
                .stack_size(WORKER_STACK_BYTES)
                .spawn(worker)
                .map_err(|e| e.to_string())?;
            join_handle.join().map_err(|_| "thread panicked".to_string())
        },
    ));

    match guarded {
        Ok(Ok(outcome)) => outcome,
        Ok(Err(message)) => Err(message),
        Err(_) => Err("thread panic during async operation".to_string()),
    }
}
/// Execute browser actions on a single page.
///
/// `actions` is an optional JSON array of `crawlberg::PageAction`; `None`
/// means an empty action list. The JSON is parsed before any thread is
/// spawned. The work then runs on a dedicated thread with a 32 MiB stack
/// that drives a fresh Tokio runtime.
///
/// # Errors
///
/// Returns the stringified error when the JSON is invalid, when the thread or
/// runtime cannot be created, when the interaction fails, or when the worker
/// panics.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn interact_async(
    engine: rustler::ResourceArc<CrawlEngineHandle>,
    url: String,
    actions: Option<String>,
) -> Result<InteractionResult, String> {
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    let actions_core: Vec<crawlberg::PageAction> = match actions {
        Some(raw) => serde_json::from_str::<Vec<crawlberg::PageAction>>(&raw)
            .map_err(|e| e.to_string())?,
        None => Vec::new(),
    };

    let worker = move || -> Result<InteractionResult, String> {
        let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?;
        let interaction = rt
            .block_on(async { crawlberg::interact(&engine.inner, &url, actions_core).await })
            .map_err(|e| e.to_string())?;
        Ok(interaction.into())
    };

    let guarded = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        move || -> Result<Result<InteractionResult, String>, String> {
            let join_handle = std::thread::Builder::new()
                .stack_size(WORKER_STACK_BYTES)
                .spawn(worker)
                .map_err(|e| e.to_string())?;
            join_handle.join().map_err(|_| "thread panicked".to_string())
        },
    ));

    match guarded {
        Ok(Ok(outcome)) => outcome,
        Ok(Err(message)) => Err(message),
        Err(_) => Err("thread panic during async operation".to_string()),
    }
}
/// Scrape multiple URLs concurrently.
///
/// The work runs on a dedicated thread with a 32 MiB stack that drives a
/// fresh Tokio runtime; the calling dirty-CPU scheduler thread blocks until
/// that thread finishes.
///
/// # Errors
///
/// Returns the stringified error when the thread or runtime cannot be
/// created, when the batch call fails, or when the worker panics.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn batch_scrape_async(
    engine: rustler::ResourceArc<CrawlEngineHandle>,
    urls: Vec<String>,
) -> Result<BatchScrapeResults, String> {
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    let worker = move || -> Result<BatchScrapeResults, String> {
        let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?;
        let scraped = rt
            .block_on(async { crawlberg::batch_scrape(&engine.inner, urls).await })
            .map_err(|e| e.to_string())?;
        Ok(scraped.into())
    };

    let guarded = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        move || -> Result<Result<BatchScrapeResults, String>, String> {
            let join_handle = std::thread::Builder::new()
                .stack_size(WORKER_STACK_BYTES)
                .spawn(worker)
                .map_err(|e| e.to_string())?;
            join_handle.join().map_err(|_| "thread panicked".to_string())
        },
    ));

    match guarded {
        Ok(Ok(outcome)) => outcome,
        Ok(Err(message)) => Err(message),
        Err(_) => Err("thread panic during async operation".to_string()),
    }
}
/// Crawl multiple seed URLs concurrently, each following links to configured depth.
///
/// The work runs on a dedicated thread with a 32 MiB stack that drives a
/// fresh Tokio runtime; the calling dirty-CPU scheduler thread blocks until
/// that thread finishes.
///
/// # Errors
///
/// Returns the stringified error when the thread or runtime cannot be
/// created, when the batch call fails, or when the worker panics.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn batch_crawl_async(
    engine: rustler::ResourceArc<CrawlEngineHandle>,
    urls: Vec<String>,
) -> Result<BatchCrawlResults, String> {
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    let worker = move || -> Result<BatchCrawlResults, String> {
        let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?;
        let crawled = rt
            .block_on(async { crawlberg::batch_crawl(&engine.inner, urls).await })
            .map_err(|e| e.to_string())?;
        Ok(crawled.into())
    };

    let guarded = std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        move || -> Result<Result<BatchCrawlResults, String>, String> {
            let join_handle = std::thread::Builder::new()
                .stack_size(WORKER_STACK_BYTES)
                .spawn(worker)
                .map_err(|e| e.to_string())?;
            join_handle.join().map_err(|_| "thread panicked".to_string())
        },
    ));

    match guarded {
        Ok(Ok(outcome)) => outcome,
        Ok(Err(message)) => Err(message),
        Err(_) => Err("thread panic during async operation".to_string()),
    }
}
/// Return `crawlberg::ContentConfig::default()` converted to the binding type.
#[rustler::nif]
pub fn contentconfig_default() -> ContentConfig {
    let core_default = crawlberg::ContentConfig::default();
    core_default.into()
}
/// Return `crawlberg::BrowserConfig::default()` converted to the binding type.
#[rustler::nif]
pub fn browserconfig_default() -> BrowserConfig {
    let core_default = crawlberg::BrowserConfig::default();
    core_default.into()
}
/// Return `crawlberg::CrawlConfig::default()` converted to the binding type.
#[rustler::nif]
pub fn crawlconfig_default() -> CrawlConfig {
    let core_default = crawlberg::CrawlConfig::default();
    core_default.into()
}
/// Validate the configuration, returning an error if any values are invalid.
///
/// The binding struct is first converted to `crawlberg::CrawlConfig`; the
/// core type's `validate` does the actual checking and its error is
/// returned as a string.
#[rustler::nif]
pub fn crawlconfig_validate(obj: CrawlConfig) -> Result<(), String> {
    let core_config = crawlberg::CrawlConfig::from(obj);
    core_config.validate().map_err(|e| e.to_string())
}
/// Returns the count of unique normalized URLs encountered during crawling.
///
/// The binding struct is converted to `crawlberg::CrawlResult` and the count
/// is computed by the core type.
#[rustler::nif]
pub fn crawlresult_unique_normalized_urls(obj: CrawlResult) -> usize {
    let core_result = crawlberg::CrawlResult::from(obj);
    core_result.unique_normalized_urls()
}
/// Return `crawlberg::SsrfPolicy::default()` converted to the binding type.
#[rustler::nif]
pub fn ssrfpolicy_default() -> SsrfPolicy {
    let core_default = crawlberg::SsrfPolicy::default();
    core_default.into()
}
/// Create a policy from environment variables.
///
/// This is a thin wrapper over `crawlberg::SsrfPolicy::from_env`; the
/// behaviour described below is that of the core function.
///
/// On native platforms, reads `CRAWLBERG_ALLOW_PRIVATE_NETWORK` — if set to "1" or "true"
/// (case-insensitive), sets `deny_private = false`. Otherwise, defaults to `deny_private = true`.
///
/// On wasm32 targets (browser/Node.js), environment variables are not accessible to the
/// compiled module. Defaults to `deny_private = false` because:
/// - Outbound requests in a browser go through the fetch API, which enforces its own network policies.
/// - Rust-side SSRF checking is unenforceable and redundant in a wasm32 context.
/// - For testing and localhost access, the host's network sandbox is the enforcing boundary.
#[rustler::nif]
pub fn ssrfpolicy_from_env() -> SsrfPolicy {
    let from_environment = crawlberg::SsrfPolicy::from_env();
    from_environment.into()
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ExtractionMeta> for crawlberg::ExtractionMeta {
fn from(val: ExtractionMeta) -> Self {
Self {
cost: val.cost,
prompt_tokens: val.prompt_tokens,
completion_tokens: val.completion_tokens,
model: val.model,
chunks_processed: val.chunks_processed,
}
}
}
/// Core → binding: numeric fields are copied; `model` is re-rendered via `to_string`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ExtractionMeta> for ExtractionMeta {
    fn from(val: crawlberg::ExtractionMeta) -> Self {
        let model = val.model.map(|name| name.to_string());
        Self {
            cost: val.cost,
            prompt_tokens: val.prompt_tokens,
            completion_tokens: val.completion_tokens,
            model,
            chunks_processed: val.chunks_processed,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ProxyConfig> for crawlberg::ProxyConfig {
fn from(val: ProxyConfig) -> Self {
Self {
url: val.url,
username: val.username,
password: val.password,
}
}
}
/// Core → binding: each field is re-rendered as an owned `String` via `to_string`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ProxyConfig> for ProxyConfig {
    fn from(val: crawlberg::ProxyConfig) -> Self {
        let url = val.url.to_string();
        let username = val.username.map(|user| user.to_string());
        let password = val.password.map(|secret| secret.to_string());
        Self {
            url,
            username,
            password,
        }
    }
}
/// Binding → core: scalar fields move across as-is; the three tag/selector
/// lists are re-collected into whatever collection the core struct declares.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ContentConfig> for crawlberg::ContentConfig {
    fn from(val: ContentConfig) -> Self {
        let strip_tags = val.strip_tags.into_iter().collect();
        let preserve_tags = val.preserve_tags.into_iter().collect();
        let exclude_selectors = val.exclude_selectors.into_iter().collect();
        Self {
            output_format: val.output_format,
            preprocessing_preset: val.preprocessing_preset,
            remove_navigation: val.remove_navigation,
            remove_forms: val.remove_forms,
            strip_tags,
            preserve_tags,
            exclude_selectors,
            skip_images: val.skip_images,
            max_depth: val.max_depth,
            wrap: val.wrap,
            wrap_width: val.wrap_width,
            include_document_structure: val.include_document_structure,
        }
    }
}
/// Core → binding: `output_format` and `preprocessing_preset` are rendered
/// via `to_string`; the three tag/selector lists are re-collected; all other
/// fields move across as-is.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ContentConfig> for ContentConfig {
    fn from(val: crawlberg::ContentConfig) -> Self {
        let output_format = val.output_format.to_string();
        let preprocessing_preset = val.preprocessing_preset.to_string();
        let strip_tags = val.strip_tags.into_iter().collect();
        let preserve_tags = val.preserve_tags.into_iter().collect();
        let exclude_selectors = val.exclude_selectors.into_iter().collect();
        Self {
            output_format,
            preprocessing_preset,
            remove_navigation: val.remove_navigation,
            remove_forms: val.remove_forms,
            strip_tags,
            preserve_tags,
            exclude_selectors,
            skip_images: val.skip_images,
            max_depth: val.max_depth,
            wrap: val.wrap,
            wrap_width: val.wrap_width,
            include_document_structure: val.include_document_structure,
        }
    }
}
/// Binding → core: the millisecond counts `timeout` and `extra_wait` become
/// `Duration`s; enum and nested-struct fields go through `Into`; everything
/// else moves across as-is.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BrowserConfig> for crawlberg::BrowserConfig {
    fn from(val: BrowserConfig) -> Self {
        use std::time::Duration;

        let timeout = Duration::from_millis(val.timeout);
        let extra_wait = val.extra_wait.map(Duration::from_millis);
        Self {
            mode: val.mode.into(),
            backend: val.backend.into(),
            endpoint: val.endpoint,
            timeout,
            wait: val.wait.into(),
            wait_selector: val.wait_selector,
            extra_wait,
            proxy: val.proxy.map(Into::into),
            block_url_patterns: val.block_url_patterns.into_iter().collect(),
            eval_script: val.eval_script,
            robots_user_agent: val.robots_user_agent,
            capture_network_events: val.capture_network_events,
            session_affinity: val.session_affinity,
        }
    }
}
/// Core → binding: `timeout` and `extra_wait` are flattened to millisecond
/// counts; optional strings are re-rendered via `to_string`; enum and
/// nested-struct fields go through `Into`.
///
/// Durations whose millisecond count exceeds `u64::MAX` saturate to
/// `u64::MAX` instead of silently wrapping.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BrowserConfig> for BrowserConfig {
    fn from(val: crawlberg::BrowserConfig) -> Self {
        // `Duration::as_millis` yields a `u128`; an `as u64` cast would wrap
        // for very large durations (e.g. `Duration::MAX`), so clamp instead.
        fn saturating_millis(duration: std::time::Duration) -> u64 {
            u64::try_from(duration.as_millis()).unwrap_or(u64::MAX)
        }

        Self {
            mode: val.mode.into(),
            backend: val.backend.into(),
            endpoint: val.endpoint.map(|v| v.to_string()),
            timeout: saturating_millis(val.timeout),
            wait: val.wait.into(),
            wait_selector: val.wait_selector.map(|v| v.to_string()),
            extra_wait: val.extra_wait.map(saturating_millis),
            proxy: val.proxy.map(Into::into),
            block_url_patterns: val.block_url_patterns.into_iter().collect(),
            eval_script: val.eval_script.map(|v| v.to_string()),
            robots_user_agent: val.robots_user_agent.map(|v| v.to_string()),
            capture_network_events: val.capture_network_events,
            session_affinity: val.session_affinity,
        }
    }
}
/// Binding → core conversion for the crawl configuration.
///
/// Scalar fields move across as-is, list/map fields are re-collected into the
/// core struct's collection types, nested configs and enums go through
/// `Into`, and `request_timeout` (a millisecond count) becomes a `Duration`.
/// Any core field not listed here takes its value from
/// `crawlberg::CrawlConfig::default()`.
#[allow(clippy::needless_update)]
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CrawlConfig> for crawlberg::CrawlConfig {
fn from(val: CrawlConfig) -> Self {
Self {
max_depth: val.max_depth,
max_pages: val.max_pages,
max_concurrent: val.max_concurrent,
respect_robots_txt: val.respect_robots_txt,
soft_http_errors: val.soft_http_errors,
user_agent: val.user_agent,
stay_on_domain: val.stay_on_domain,
allow_subdomains: val.allow_subdomains,
include_paths: val.include_paths.into_iter().collect(),
exclude_paths: val.exclude_paths.into_iter().collect(),
// Header names and values are converted pairwise into the core key/value types.
custom_headers: val
.custom_headers
.into_iter()
.map(|(k, v)| (k.into(), v.into()))
.collect(),
// The binding carries the timeout as milliseconds.
request_timeout: std::time::Duration::from_millis(val.request_timeout),
rate_limit_ms: val.rate_limit_ms,
max_redirects: val.max_redirects,
retry_count: val.retry_count,
retry_codes: val.retry_codes.into_iter().collect(),
cookies_enabled: val.cookies_enabled,
auth: val.auth.map(Into::into),
max_body_size: val.max_body_size,
remove_tags: val.remove_tags.into_iter().collect(),
content: val.content.into(),
map_limit: val.map_limit,
map_search: val.map_search,
download_assets: val.download_assets,
asset_types: val.asset_types.into_iter().map(Into::into).collect(),
max_asset_size: val.max_asset_size,
browser: val.browser.into(),
proxy: val.proxy.map(Into::into),
user_agents: val.user_agents.into_iter().collect(),
capture_screenshot: val.capture_screenshot,
follow_document_urls: val.follow_document_urls,
document_url_depth: val.document_url_depth,
download_documents: val.download_documents,
document_max_size: val.document_max_size,
document_mime_types: val.document_mime_types.into_iter().collect(),
warc_output: val.warc_output.map(Into::into),
browser_profile: val.browser_profile,
save_browser_profile: val.save_browser_profile,
ssrf: val.ssrf.into(),
// Fields of the core struct that have no binding counterpart keep their defaults.
..Default::default()
}
}
}
/// Core → binding conversion for the crawl configuration.
///
/// Scalar fields move across as-is, list/map fields are re-collected,
/// optional strings are re-rendered via `to_string`, nested configs and enums
/// go through `Into`, `warc_output` is rendered lossily from its path, and
/// `request_timeout` is flattened to a millisecond count.
///
/// A `request_timeout` whose millisecond count exceeds `u64::MAX` saturates
/// to `u64::MAX` instead of silently wrapping.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CrawlConfig> for CrawlConfig {
    fn from(val: crawlberg::CrawlConfig) -> Self {
        // `Duration::as_millis` yields a `u128`; an `as u64` cast would wrap
        // for very large durations (e.g. `Duration::MAX`), so clamp instead.
        let request_timeout = u64::try_from(val.request_timeout.as_millis()).unwrap_or(u64::MAX);

        Self {
            max_depth: val.max_depth,
            max_pages: val.max_pages,
            max_concurrent: val.max_concurrent,
            respect_robots_txt: val.respect_robots_txt,
            soft_http_errors: val.soft_http_errors,
            user_agent: val.user_agent.map(|v| v.to_string()),
            stay_on_domain: val.stay_on_domain,
            allow_subdomains: val.allow_subdomains,
            include_paths: val.include_paths.into_iter().collect(),
            exclude_paths: val.exclude_paths.into_iter().collect(),
            custom_headers: val
                .custom_headers
                .into_iter()
                .map(|(k, v)| (k.into(), v.into()))
                .collect(),
            request_timeout,
            rate_limit_ms: val.rate_limit_ms,
            max_redirects: val.max_redirects,
            retry_count: val.retry_count,
            retry_codes: val.retry_codes.into_iter().collect(),
            cookies_enabled: val.cookies_enabled,
            auth: val.auth.map(Into::into),
            max_body_size: val.max_body_size,
            remove_tags: val.remove_tags.into_iter().collect(),
            content: val.content.into(),
            map_limit: val.map_limit,
            map_search: val.map_search.map(|v| v.to_string()),
            download_assets: val.download_assets,
            asset_types: val.asset_types.into_iter().map(Into::into).collect(),
            max_asset_size: val.max_asset_size,
            browser: val.browser.into(),
            proxy: val.proxy.map(Into::into),
            user_agents: val.user_agents.into_iter().collect(),
            capture_screenshot: val.capture_screenshot,
            follow_document_urls: val.follow_document_urls,
            document_url_depth: val.document_url_depth,
            download_documents: val.download_documents,
            document_max_size: val.document_max_size,
            document_mime_types: val.document_mime_types.into_iter().collect(),
            // Non-UTF-8 path components are replaced rather than rejected.
            warc_output: val.warc_output.map(|p| p.to_string_lossy().to_string()),
            browser_profile: val.browser_profile.map(|v| v.to_string()),
            save_browser_profile: val.save_browser_profile,
            ssrf: val.ssrf.into(),
        }
    }
}
/// Binding → core: `eval_result` is parsed from its JSON text (a string that
/// fails to parse becomes `None`); the event and cookie lists are converted
/// element-wise through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BrowserExtras> for crawlberg::BrowserExtras {
    fn from(val: BrowserExtras) -> Self {
        Self {
            eval_result: match val.eval_result.as_ref() {
                Some(raw_json) => serde_json::from_str(raw_json).ok(),
                None => None,
            },
            network_events: val.network_events.into_iter().map(Into::into).collect(),
            cookies: val.cookies.into_iter().map(Into::into).collect(),
        }
    }
}
/// Core → binding: `eval_result` is rendered to text via `ToString`; the
/// event and cookie lists are converted element-wise through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BrowserExtras> for BrowserExtras {
    fn from(val: crawlberg::BrowserExtras) -> Self {
        let eval_result = val.eval_result.as_ref().map(ToString::to_string);
        let network_events = val.network_events.into_iter().map(Into::into).collect();
        let cookies = val.cookies.into_iter().map(Into::into).collect();
        Self {
            eval_result,
            network_events,
            cookies,
        }
    }
}
/// Binding → core: string-like fields and header pairs go through `Into`;
/// any core field not listed here takes its value from
/// `crawlberg::DownloadedDocument::default()`.
#[allow(clippy::needless_update)]
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<DownloadedDocument> for crawlberg::DownloadedDocument {
    fn from(val: DownloadedDocument) -> Self {
        Self {
            url: val.url,
            mime_type: val.mime_type.into(),
            size: val.size,
            filename: val.filename.map(Into::into),
            content_hash: val.content_hash.into(),
            headers: val
                .headers
                .into_iter()
                .map(|(name, value)| (name.into(), value.into()))
                .collect(),
            ..Default::default()
        }
    }
}
/// Core → binding: string-like fields are re-rendered via `to_string`;
/// header pairs are converted through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::DownloadedDocument> for DownloadedDocument {
    fn from(val: crawlberg::DownloadedDocument) -> Self {
        let url = val.url.to_string();
        let mime_type = val.mime_type.to_string();
        let filename = val.filename.as_ref().map(|name| name.to_string());
        let content_hash = val.content_hash.to_string();
        Self {
            url,
            mime_type,
            size: val.size,
            filename,
            content_hash,
            headers: val
                .headers
                .into_iter()
                .map(|(name, value)| (name.into(), value.into()))
                .collect(),
        }
    }
}
/// Binding → core: action results are converted element-wise through `Into`;
/// any core field not listed here takes its value from
/// `crawlberg::InteractionResult::default()`.
#[allow(clippy::needless_update)]
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<InteractionResult> for crawlberg::InteractionResult {
    fn from(val: InteractionResult) -> Self {
        let action_results = val.action_results.into_iter().map(Into::into).collect();
        Self {
            action_results,
            final_html: val.final_html,
            final_url: val.final_url,
            ..Default::default()
        }
    }
}
/// Core → binding: action results are converted element-wise through `Into`;
/// `final_html` and `final_url` are re-rendered via `to_string`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::InteractionResult> for InteractionResult {
    fn from(val: crawlberg::InteractionResult) -> Self {
        let final_html = val.final_html.to_string();
        let final_url = val.final_url.to_string();
        let action_results = val.action_results.into_iter().map(Into::into).collect();
        Self {
            action_results,
            final_html,
            final_url,
        }
    }
}
/// Binding → core: `action_type` goes through `Into`; `data` is parsed from
/// its JSON text, and a string that fails to parse becomes `None`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ActionResult> for crawlberg::ActionResult {
    fn from(val: ActionResult) -> Self {
        Self {
            action_index: val.action_index,
            action_type: val.action_type.into(),
            success: val.success,
            data: match val.data.as_ref() {
                Some(raw_json) => serde_json::from_str(raw_json).ok(),
                None => None,
            },
            error: val.error,
        }
    }
}
/// Core → binding: `action_type`, `data` and `error` are rendered to text
/// via `to_string`; the index and success flag are copied.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ActionResult> for ActionResult {
    fn from(val: crawlberg::ActionResult) -> Self {
        let action_type = val.action_type.to_string();
        let data = val.data.as_ref().map(ToString::to_string);
        let error = val.error.map(|message| message.to_string());
        Self {
            action_index: val.action_index,
            action_type,
            success: val.success,
            data,
            error,
        }
    }
}
/// Binding → core conversion for a scrape result.
///
/// Scalar fields move across as-is, list fields are converted element-wise
/// through `Into`, optional nested structs go through `Into`, and
/// `extracted_data` is parsed from its JSON text. Any core field not listed
/// here takes its value from `crawlberg::ScrapeResult::default()`.
#[allow(clippy::needless_update)]
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ScrapeResult> for crawlberg::ScrapeResult {
fn from(val: ScrapeResult) -> Self {
Self {
status_code: val.status_code,
final_url: val.final_url,
content_type: val.content_type,
html: val.html,
body_size: val.body_size,
metadata: val.metadata.into(),
links: val.links.into_iter().map(Into::into).collect(),
images: val.images.into_iter().map(Into::into).collect(),
feeds: val.feeds.into_iter().map(Into::into).collect(),
json_ld: val.json_ld.into_iter().map(Into::into).collect(),
is_allowed: val.is_allowed,
crawl_delay: val.crawl_delay,
noindex_detected: val.noindex_detected,
nofollow_detected: val.nofollow_detected,
x_robots_tag: val.x_robots_tag,
is_pdf: val.is_pdf,
was_skipped: val.was_skipped,
detected_charset: val.detected_charset,
auth_header_sent: val.auth_header_sent,
response_meta: val.response_meta.map(Into::into),
assets: val.assets.into_iter().map(Into::into).collect(),
js_render_hint: val.js_render_hint,
browser_used: val.browser_used,
markdown: val.markdown.map(Into::into),
// A string that is not valid JSON for the target type is dropped (becomes `None`).
extracted_data: val.extracted_data.as_ref().and_then(|s| serde_json::from_str(s).ok()),
extraction_meta: val.extraction_meta.map(Into::into),
downloaded_document: val.downloaded_document.map(Into::into),
browser: val.browser.map(Into::into),
// Fields of the core struct that have no binding counterpart keep their defaults.
..Default::default()
}
}
}
/// Builds this file's `ScrapeResult` from a `crawlberg::ScrapeResult`.
///
/// Nested values go through `Into`; text-like fields and `extracted_data` are
/// rendered with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ScrapeResult> for ScrapeResult {
    fn from(val: crawlberg::ScrapeResult) -> Self {
        // Nested conversions are done first; scalar and text fields are handled inline below.
        let metadata = val.metadata.into();
        let links = val.links.into_iter().map(Into::into).collect();
        let images = val.images.into_iter().map(Into::into).collect();
        let feeds = val.feeds.into_iter().map(Into::into).collect();
        let json_ld = val.json_ld.into_iter().map(Into::into).collect();
        let response_meta = val.response_meta.map(Into::into);
        let assets = val.assets.into_iter().map(Into::into).collect();
        let markdown = val.markdown.map(Into::into);
        let extracted_data = val.extracted_data.as_ref().map(ToString::to_string);
        let extraction_meta = val.extraction_meta.map(Into::into);
        let downloaded_document = val.downloaded_document.map(Into::into);
        let browser = val.browser.map(Into::into);

        Self {
            status_code: val.status_code,
            final_url: val.final_url.to_string(),
            content_type: val.content_type.to_string(),
            html: val.html.to_string(),
            body_size: val.body_size,
            metadata,
            links,
            images,
            feeds,
            json_ld,
            is_allowed: val.is_allowed,
            crawl_delay: val.crawl_delay,
            noindex_detected: val.noindex_detected,
            nofollow_detected: val.nofollow_detected,
            x_robots_tag: val.x_robots_tag.map(|tag| tag.to_string()),
            is_pdf: val.is_pdf,
            was_skipped: val.was_skipped,
            detected_charset: val.detected_charset.map(|charset| charset.to_string()),
            auth_header_sent: val.auth_header_sent,
            response_meta,
            assets,
            js_render_hint: val.js_render_hint,
            browser_used: val.browser_used,
            markdown,
            extracted_data,
            extraction_meta,
            downloaded_document,
            browser,
        }
    }
}
/// Converts this file's `CrawlPageResult` into a `crawlberg::CrawlPageResult`.
///
/// Nested values go through `Into`; `extracted_data` is re-parsed from its JSON
/// text, and a parse failure yields `None`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CrawlPageResult> for crawlberg::CrawlPageResult {
    fn from(val: CrawlPageResult) -> Self {
        // Nested conversions are done first; pass-through fields are moved in the literal below.
        let metadata = val.metadata.into();
        let links = val.links.into_iter().map(Into::into).collect();
        let images = val.images.into_iter().map(Into::into).collect();
        let feeds = val.feeds.into_iter().map(Into::into).collect();
        let json_ld = val.json_ld.into_iter().map(Into::into).collect();
        let markdown = val.markdown.map(Into::into);
        let extracted_data = val
            .extracted_data
            .as_deref()
            .and_then(|raw| serde_json::from_str(raw).ok());
        let extraction_meta = val.extraction_meta.map(Into::into);
        let downloaded_document = val.downloaded_document.map(Into::into);

        Self {
            url: val.url,
            normalized_url: val.normalized_url,
            status_code: val.status_code,
            content_type: val.content_type,
            html: val.html,
            body_size: val.body_size,
            metadata,
            links,
            images,
            feeds,
            json_ld,
            depth: val.depth,
            stayed_on_domain: val.stayed_on_domain,
            was_skipped: val.was_skipped,
            is_pdf: val.is_pdf,
            detected_charset: val.detected_charset,
            markdown,
            extracted_data,
            extraction_meta,
            downloaded_document,
            browser_used: val.browser_used,
        }
    }
}
/// Builds this file's `CrawlPageResult` from a `crawlberg::CrawlPageResult`.
///
/// Nested values go through `Into`; text-like fields and `extracted_data` are
/// rendered with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CrawlPageResult> for CrawlPageResult {
    fn from(val: crawlberg::CrawlPageResult) -> Self {
        // Nested conversions are done first; scalar and text fields are handled inline below.
        let metadata = val.metadata.into();
        let links = val.links.into_iter().map(Into::into).collect();
        let images = val.images.into_iter().map(Into::into).collect();
        let feeds = val.feeds.into_iter().map(Into::into).collect();
        let json_ld = val.json_ld.into_iter().map(Into::into).collect();
        let markdown = val.markdown.map(Into::into);
        let extracted_data = val.extracted_data.as_ref().map(ToString::to_string);
        let extraction_meta = val.extraction_meta.map(Into::into);
        let downloaded_document = val.downloaded_document.map(Into::into);

        Self {
            url: val.url.to_string(),
            normalized_url: val.normalized_url.to_string(),
            status_code: val.status_code,
            content_type: val.content_type.to_string(),
            html: val.html.to_string(),
            body_size: val.body_size,
            metadata,
            links,
            images,
            feeds,
            json_ld,
            depth: val.depth,
            stayed_on_domain: val.stayed_on_domain,
            was_skipped: val.was_skipped,
            is_pdf: val.is_pdf,
            detected_charset: val.detected_charset.map(|charset| charset.to_string()),
            markdown,
            extracted_data,
            extraction_meta,
            downloaded_document,
            browser_used: val.browser_used,
        }
    }
}
#[allow(clippy::needless_update)]
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CrawlResult> for crawlberg::CrawlResult {
fn from(val: CrawlResult) -> Self {
Self {
pages: val.pages.into_iter().map(Into::into).collect(),
final_url: val.final_url,
redirect_count: val.redirect_count,
was_skipped: val.was_skipped,
error: val.error,
cookies: val.cookies.into_iter().map(Into::into).collect(),
stayed_on_domain: val.stayed_on_domain,
browser_used: val.browser_used,
..Default::default()
}
}
}
/// Builds this file's `CrawlResult` from a `crawlberg::CrawlResult`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CrawlResult> for CrawlResult {
    fn from(val: crawlberg::CrawlResult) -> Self {
        let pages = val.pages.into_iter().map(Into::into).collect();
        let final_url = val.final_url.to_string();
        let redirect_count = val.redirect_count;
        let was_skipped = val.was_skipped;
        let error = val.error.map(|message| message.to_string());
        let cookies = val.cookies.into_iter().map(Into::into).collect();
        let stayed_on_domain = val.stayed_on_domain;
        let browser_used = val.browser_used;
        Self {
            pages,
            final_url,
            redirect_count,
            was_skipped,
            error,
            cookies,
            stayed_on_domain,
            browser_used,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<SitemapUrl> for crawlberg::SitemapUrl {
fn from(val: SitemapUrl) -> Self {
Self {
url: val.url,
lastmod: val.lastmod,
changefreq: val.changefreq,
priority: val.priority,
}
}
}
/// Builds this file's `SitemapUrl` from a `crawlberg::SitemapUrl`, rendering each field with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::SitemapUrl> for SitemapUrl {
    fn from(val: crawlberg::SitemapUrl) -> Self {
        let url = val.url.to_string();
        let lastmod = val.lastmod.map(|stamp| stamp.to_string());
        let changefreq = val.changefreq.map(|freq| freq.to_string());
        let priority = val.priority.map(|prio| prio.to_string());
        Self {
            url,
            lastmod,
            changefreq,
            priority,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<MapResult> for crawlberg::MapResult {
fn from(val: MapResult) -> Self {
Self {
urls: val.urls.into_iter().map(Into::into).collect(),
}
}
}
/// Builds this file's `MapResult` from a `crawlberg::MapResult`, converting each URL entry with `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::MapResult> for MapResult {
    fn from(val: crawlberg::MapResult) -> Self {
        let urls = val.urls.into_iter().map(Into::into).collect();
        Self { urls }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<MarkdownResult> for crawlberg::MarkdownResult {
fn from(val: MarkdownResult) -> Self {
Self {
content: val.content,
document_structure: val
.document_structure
.as_ref()
.and_then(|s| serde_json::from_str(s).ok()),
tables: val
.tables
.into_iter()
.filter_map(|s| serde_json::from_str(&s).ok())
.collect(),
warnings: val.warnings.into_iter().collect(),
citations: val.citations,
fit_content: val.fit_content,
}
}
}
/// Builds this file's `MarkdownResult` from a `crawlberg::MarkdownResult`.
///
/// `document_structure` and each `tables` entry are rendered with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::MarkdownResult> for MarkdownResult {
    fn from(val: crawlberg::MarkdownResult) -> Self {
        let content = val.content.to_string();
        let document_structure = val.document_structure.as_ref().map(ToString::to_string);
        let tables = val.tables.iter().map(ToString::to_string).collect();
        let warnings = val.warnings.into_iter().collect();
        let citations = val.citations;
        let fit_content = val.fit_content.map(|fitted| fitted.to_string());
        Self {
            content,
            document_structure,
            tables,
            warnings,
            citations,
            fit_content,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<LinkInfo> for crawlberg::LinkInfo {
fn from(val: LinkInfo) -> Self {
Self {
url: val.url,
text: val.text,
link_type: val.link_type.into(),
rel: val.rel,
nofollow: val.nofollow,
}
}
}
/// Builds this file's `LinkInfo` from a `crawlberg::LinkInfo`; `link_type` goes through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::LinkInfo> for LinkInfo {
    fn from(val: crawlberg::LinkInfo) -> Self {
        let url = val.url.to_string();
        let text = val.text.to_string();
        let link_type = val.link_type.into();
        let rel = val.rel.map(|relation| relation.to_string());
        let nofollow = val.nofollow;
        Self {
            url,
            text,
            link_type,
            rel,
            nofollow,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ImageInfo> for crawlberg::ImageInfo {
fn from(val: ImageInfo) -> Self {
Self {
url: val.url,
alt: val.alt,
width: val.width,
height: val.height,
source: val.source.into(),
}
}
}
/// Builds this file's `ImageInfo` from a `crawlberg::ImageInfo`; `source` goes through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ImageInfo> for ImageInfo {
    fn from(val: crawlberg::ImageInfo) -> Self {
        let url = val.url.to_string();
        let alt = val.alt.map(|alt_text| alt_text.to_string());
        let width = val.width;
        let height = val.height;
        let source = val.source.into();
        Self {
            url,
            alt,
            width,
            height,
            source,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<FeedInfo> for crawlberg::FeedInfo {
fn from(val: FeedInfo) -> Self {
Self {
url: val.url,
title: val.title,
feed_type: val.feed_type.into(),
}
}
}
/// Builds this file's `FeedInfo` from a `crawlberg::FeedInfo`; `feed_type` goes through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::FeedInfo> for FeedInfo {
    fn from(val: crawlberg::FeedInfo) -> Self {
        let url = val.url.to_string();
        let title = val.title.map(|heading| heading.to_string());
        let feed_type = val.feed_type.into();
        Self {
            url,
            title,
            feed_type,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<JsonLdEntry> for crawlberg::JsonLdEntry {
fn from(val: JsonLdEntry) -> Self {
Self {
schema_type: val.schema_type,
name: val.name,
raw: val.raw,
}
}
}
/// Builds this file's `JsonLdEntry` from a `crawlberg::JsonLdEntry`, rendering each field with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::JsonLdEntry> for JsonLdEntry {
    fn from(val: crawlberg::JsonLdEntry) -> Self {
        let schema_type = val.schema_type.to_string();
        let name = val.name.map(|label| label.to_string());
        let raw = val.raw.to_string();
        Self {
            schema_type,
            name,
            raw,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CookieInfo> for crawlberg::CookieInfo {
fn from(val: CookieInfo) -> Self {
Self {
name: val.name,
value: val.value,
domain: val.domain,
path: val.path,
}
}
}
/// Builds this file's `CookieInfo` from a `crawlberg::CookieInfo`, rendering each field with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CookieInfo> for CookieInfo {
    fn from(val: crawlberg::CookieInfo) -> Self {
        let name = val.name.to_string();
        let value = val.value.to_string();
        let domain = val.domain.map(|host| host.to_string());
        let path = val.path.map(|route| route.to_string());
        Self {
            name,
            value,
            domain,
            path,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<DownloadedAsset> for crawlberg::DownloadedAsset {
fn from(val: DownloadedAsset) -> Self {
Self {
url: val.url,
content_hash: val.content_hash,
mime_type: val.mime_type,
size: val.size,
asset_category: val.asset_category.into(),
html_tag: val.html_tag,
}
}
}
/// Builds this file's `DownloadedAsset` from a `crawlberg::DownloadedAsset`; `asset_category` goes through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::DownloadedAsset> for DownloadedAsset {
    fn from(val: crawlberg::DownloadedAsset) -> Self {
        let url = val.url.to_string();
        let content_hash = val.content_hash.to_string();
        let mime_type = val.mime_type.map(|mime| mime.to_string());
        let size = val.size;
        let asset_category = val.asset_category.into();
        let html_tag = val.html_tag.map(|tag| tag.to_string());
        Self {
            url,
            content_hash,
            mime_type,
            size,
            asset_category,
            html_tag,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ArticleMetadata> for crawlberg::ArticleMetadata {
fn from(val: ArticleMetadata) -> Self {
Self {
published_time: val.published_time,
modified_time: val.modified_time,
author: val.author,
section: val.section,
tags: val.tags.into_iter().collect(),
}
}
}
/// Builds this file's `ArticleMetadata` from a `crawlberg::ArticleMetadata`, rendering optional text fields with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ArticleMetadata> for ArticleMetadata {
    fn from(val: crawlberg::ArticleMetadata) -> Self {
        let published_time = val.published_time.map(|stamp| stamp.to_string());
        let modified_time = val.modified_time.map(|stamp| stamp.to_string());
        let author = val.author.map(|who| who.to_string());
        let section = val.section.map(|part| part.to_string());
        let tags = val.tags.into_iter().collect();
        Self {
            published_time,
            modified_time,
            author,
            section,
            tags,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<HreflangEntry> for crawlberg::HreflangEntry {
fn from(val: HreflangEntry) -> Self {
Self {
lang: val.lang,
url: val.url,
}
}
}
/// Builds this file's `HreflangEntry` from a `crawlberg::HreflangEntry`, rendering both fields with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::HreflangEntry> for HreflangEntry {
    fn from(val: crawlberg::HreflangEntry) -> Self {
        let lang = val.lang.to_string();
        let url = val.url.to_string();
        Self { lang, url }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<FaviconInfo> for crawlberg::FaviconInfo {
fn from(val: FaviconInfo) -> Self {
Self {
url: val.url,
rel: val.rel,
sizes: val.sizes,
mime_type: val.mime_type,
}
}
}
/// Builds this file's `FaviconInfo` from a `crawlberg::FaviconInfo`, rendering each field with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::FaviconInfo> for FaviconInfo {
    fn from(val: crawlberg::FaviconInfo) -> Self {
        let url = val.url.to_string();
        let rel = val.rel.to_string();
        let sizes = val.sizes.map(|dims| dims.to_string());
        let mime_type = val.mime_type.map(|mime| mime.to_string());
        Self {
            url,
            rel,
            sizes,
            mime_type,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<HeadingInfo> for crawlberg::HeadingInfo {
fn from(val: HeadingInfo) -> Self {
Self {
level: val.level,
text: val.text,
}
}
}
/// Builds this file's `HeadingInfo` from a `crawlberg::HeadingInfo`; `text` is rendered with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::HeadingInfo> for HeadingInfo {
    fn from(val: crawlberg::HeadingInfo) -> Self {
        let level = val.level;
        let text = val.text.to_string();
        Self { level, text }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<ResponseMeta> for crawlberg::ResponseMeta {
fn from(val: ResponseMeta) -> Self {
Self {
etag: val.etag,
last_modified: val.last_modified,
cache_control: val.cache_control,
server: val.server,
x_powered_by: val.x_powered_by,
content_language: val.content_language,
content_encoding: val.content_encoding,
}
}
}
/// Builds this file's `ResponseMeta` from a `crawlberg::ResponseMeta`, rendering each present value with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::ResponseMeta> for ResponseMeta {
    fn from(val: crawlberg::ResponseMeta) -> Self {
        let etag = val.etag.map(|header| header.to_string());
        let last_modified = val.last_modified.map(|header| header.to_string());
        let cache_control = val.cache_control.map(|header| header.to_string());
        let server = val.server.map(|header| header.to_string());
        let x_powered_by = val.x_powered_by.map(|header| header.to_string());
        let content_language = val.content_language.map(|header| header.to_string());
        let content_encoding = val.content_encoding.map(|header| header.to_string());
        Self {
            etag,
            last_modified,
            cache_control,
            server,
            x_powered_by,
            content_language,
            content_encoding,
        }
    }
}
/// Converts this file's `PageMetadata` into a `crawlberg::PageMetadata`.
///
/// Most fields are moved across unchanged. `og_locale_alternates` is
/// re-collected, and `article`, `hreflangs`, `favicons` and `headings` are
/// converted with `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<PageMetadata> for crawlberg::PageMetadata {
    fn from(val: PageMetadata) -> Self {
        // The few fields that need real conversion are handled first.
        let og_locale_alternates = val
            .og_locale_alternates
            .map(|alternates| alternates.into_iter().collect());
        let article = val.article.map(Into::into);
        let hreflangs = val
            .hreflangs
            .map(|entries| entries.into_iter().map(Into::into).collect());
        let favicons = val
            .favicons
            .map(|icons| icons.into_iter().map(Into::into).collect());
        let headings = val
            .headings
            .map(|items| items.into_iter().map(Into::into).collect());

        Self {
            title: val.title,
            description: val.description,
            canonical_url: val.canonical_url,
            keywords: val.keywords,
            author: val.author,
            viewport: val.viewport,
            theme_color: val.theme_color,
            generator: val.generator,
            robots: val.robots,
            html_lang: val.html_lang,
            html_dir: val.html_dir,
            og_title: val.og_title,
            og_type: val.og_type,
            og_image: val.og_image,
            og_description: val.og_description,
            og_url: val.og_url,
            og_site_name: val.og_site_name,
            og_locale: val.og_locale,
            og_video: val.og_video,
            og_audio: val.og_audio,
            og_locale_alternates,
            twitter_card: val.twitter_card,
            twitter_title: val.twitter_title,
            twitter_description: val.twitter_description,
            twitter_image: val.twitter_image,
            twitter_site: val.twitter_site,
            twitter_creator: val.twitter_creator,
            dc_title: val.dc_title,
            dc_creator: val.dc_creator,
            dc_subject: val.dc_subject,
            dc_description: val.dc_description,
            dc_publisher: val.dc_publisher,
            dc_date: val.dc_date,
            dc_type: val.dc_type,
            dc_format: val.dc_format,
            dc_identifier: val.dc_identifier,
            dc_language: val.dc_language,
            dc_rights: val.dc_rights,
            article,
            hreflangs,
            favicons,
            headings,
            word_count: val.word_count,
        }
    }
}
/// Builds this file's `PageMetadata` from a `crawlberg::PageMetadata`.
///
/// Optional text fields are rendered with `to_string()`. `og_locale_alternates`
/// is re-collected, and `article`, `hreflangs`, `favicons` and `headings` are
/// converted with `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::PageMetadata> for PageMetadata {
    fn from(val: crawlberg::PageMetadata) -> Self {
        /// Renders an optional value to an optional `String` via `to_string()`.
        fn text<T: ToString>(field: Option<T>) -> Option<String> {
            field.map(|inner| inner.to_string())
        }

        Self {
            title: text(val.title),
            description: text(val.description),
            canonical_url: text(val.canonical_url),
            keywords: text(val.keywords),
            author: text(val.author),
            viewport: text(val.viewport),
            theme_color: text(val.theme_color),
            generator: text(val.generator),
            robots: text(val.robots),
            html_lang: text(val.html_lang),
            html_dir: text(val.html_dir),
            og_title: text(val.og_title),
            og_type: text(val.og_type),
            og_image: text(val.og_image),
            og_description: text(val.og_description),
            og_url: text(val.og_url),
            og_site_name: text(val.og_site_name),
            og_locale: text(val.og_locale),
            og_video: text(val.og_video),
            og_audio: text(val.og_audio),
            og_locale_alternates: val
                .og_locale_alternates
                .map(|alternates| alternates.into_iter().collect()),
            twitter_card: text(val.twitter_card),
            twitter_title: text(val.twitter_title),
            twitter_description: text(val.twitter_description),
            twitter_image: text(val.twitter_image),
            twitter_site: text(val.twitter_site),
            twitter_creator: text(val.twitter_creator),
            dc_title: text(val.dc_title),
            dc_creator: text(val.dc_creator),
            dc_subject: text(val.dc_subject),
            dc_description: text(val.dc_description),
            dc_publisher: text(val.dc_publisher),
            dc_date: text(val.dc_date),
            dc_type: text(val.dc_type),
            dc_format: text(val.dc_format),
            dc_identifier: text(val.dc_identifier),
            dc_language: text(val.dc_language),
            dc_rights: text(val.dc_rights),
            article: val.article.map(Into::into),
            hreflangs: val
                .hreflangs
                .map(|entries| entries.into_iter().map(Into::into).collect()),
            favicons: val
                .favicons
                .map(|icons| icons.into_iter().map(Into::into).collect()),
            headings: val
                .headings
                .map(|items| items.into_iter().map(Into::into).collect()),
            word_count: val.word_count,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CrawlStreamRequest> for crawlberg::CrawlStreamRequest {
fn from(val: CrawlStreamRequest) -> Self {
Self { url: val.url }
}
}
/// Builds this file's `CrawlStreamRequest` from a `crawlberg::CrawlStreamRequest`; `url` is rendered with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CrawlStreamRequest> for CrawlStreamRequest {
    fn from(val: crawlberg::CrawlStreamRequest) -> Self {
        let url = val.url.to_string();
        Self { url }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BatchCrawlStreamRequest> for crawlberg::BatchCrawlStreamRequest {
fn from(val: BatchCrawlStreamRequest) -> Self {
Self {
urls: val.urls.into_iter().collect(),
}
}
}
/// Builds this file's `BatchCrawlStreamRequest` from a `crawlberg::BatchCrawlStreamRequest`, re-collecting `urls`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BatchCrawlStreamRequest> for BatchCrawlStreamRequest {
    fn from(val: crawlberg::BatchCrawlStreamRequest) -> Self {
        let urls = val.urls.into_iter().collect();
        Self { urls }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CitationResult> for crawlberg::CitationResult {
fn from(val: CitationResult) -> Self {
Self {
content: val.content,
references: val.references.into_iter().map(Into::into).collect(),
}
}
}
/// Builds this file's `CitationResult` from a `crawlberg::CitationResult`, converting each reference with `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CitationResult> for CitationResult {
    fn from(val: crawlberg::CitationResult) -> Self {
        let content = val.content.to_string();
        let references = val.references.into_iter().map(Into::into).collect();
        Self {
            content,
            references,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<CitationReference> for crawlberg::CitationReference {
fn from(val: CitationReference) -> Self {
Self {
index: val.index,
url: val.url,
text: val.text,
}
}
}
/// Builds this file's `CitationReference` from a `crawlberg::CitationReference`; `url` and `text` are rendered with `to_string()`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::CitationReference> for CitationReference {
    fn from(val: crawlberg::CitationReference) -> Self {
        let index = val.index;
        let url = val.url.to_string();
        let text = val.text.to_string();
        Self { index, url, text }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BatchScrapeResult> for crawlberg::BatchScrapeResult {
fn from(val: BatchScrapeResult) -> Self {
Self {
url: val.url,
result: val.result.map(Into::into),
error: val.error,
}
}
}
/// Builds this file's `BatchScrapeResult` from a `crawlberg::BatchScrapeResult`; a present `result` goes through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BatchScrapeResult> for BatchScrapeResult {
    fn from(val: crawlberg::BatchScrapeResult) -> Self {
        let url = val.url.to_string();
        let result = val.result.map(Into::into);
        let error = val.error.map(|message| message.to_string());
        Self { url, result, error }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BatchCrawlResult> for crawlberg::BatchCrawlResult {
fn from(val: BatchCrawlResult) -> Self {
Self {
url: val.url,
result: val.result.map(Into::into),
error: val.error,
}
}
}
/// Builds this file's `BatchCrawlResult` from a `crawlberg::BatchCrawlResult`; a present `result` goes through `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BatchCrawlResult> for BatchCrawlResult {
    fn from(val: crawlberg::BatchCrawlResult) -> Self {
        let url = val.url.to_string();
        let result = val.result.map(Into::into);
        let error = val.error.map(|message| message.to_string());
        Self { url, result, error }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BatchScrapeResults> for crawlberg::BatchScrapeResults {
fn from(val: BatchScrapeResults) -> Self {
Self {
results: val.results.into_iter().map(Into::into).collect(),
total_count: val.total_count,
completed_count: val.completed_count,
failed_count: val.failed_count,
}
}
}
/// Builds this file's `BatchScrapeResults` from a `crawlberg::BatchScrapeResults`, converting each entry with `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BatchScrapeResults> for BatchScrapeResults {
    fn from(val: crawlberg::BatchScrapeResults) -> Self {
        let results = val.results.into_iter().map(Into::into).collect();
        let total_count = val.total_count;
        let completed_count = val.completed_count;
        let failed_count = val.failed_count;
        Self {
            results,
            total_count,
            completed_count,
            failed_count,
        }
    }
}
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<BatchCrawlResults> for crawlberg::BatchCrawlResults {
fn from(val: BatchCrawlResults) -> Self {
Self {
results: val.results.into_iter().map(Into::into).collect(),
total_count: val.total_count,
completed_count: val.completed_count,
failed_count: val.failed_count,
}
}
}
/// Builds this file's `BatchCrawlResults` from a `crawlberg::BatchCrawlResults`, converting each entry with `Into`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::BatchCrawlResults> for BatchCrawlResults {
    fn from(val: crawlberg::BatchCrawlResults) -> Self {
        let results = val.results.into_iter().map(Into::into).collect();
        let total_count = val.total_count;
        let completed_count = val.completed_count;
        let failed_count = val.failed_count;
        Self {
            results,
            total_count,
            completed_count,
            failed_count,
        }
    }
}
#[allow(clippy::needless_update)]
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<SsrfPolicy> for crawlberg::SsrfPolicy {
fn from(val: SsrfPolicy) -> Self {
Self {
deny_private: val.deny_private,
max_redirects: val.max_redirects,
..Default::default()
}
}
}
/// Builds this file's `SsrfPolicy` from the `deny_private` and `max_redirects` fields of a `crawlberg::SsrfPolicy`.
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
impl From<crawlberg::SsrfPolicy> for SsrfPolicy {
    fn from(val: crawlberg::SsrfPolicy) -> Self {
        let deny_private = val.deny_private;
        let max_redirects = val.max_redirects;
        Self {
            deny_private,
            max_redirects,
        }
    }
}
/// Maps each `BrowserMode` variant onto the `crawlberg::BrowserMode` variant of the same name.
impl From<BrowserMode> for crawlberg::BrowserMode {
    fn from(val: BrowserMode) -> Self {
        match val {
            BrowserMode::Auto => crawlberg::BrowserMode::Auto,
            BrowserMode::Always => crawlberg::BrowserMode::Always,
            BrowserMode::Never => crawlberg::BrowserMode::Never,
            BrowserMode::Stealth => crawlberg::BrowserMode::Stealth,
        }
    }
}
impl From<crawlberg::BrowserMode> for BrowserMode {
fn from(val: crawlberg::BrowserMode) -> Self {
match val {
crawlberg::BrowserMode::Auto => Self::Auto,
crawlberg::BrowserMode::Always => Self::Always,
crawlberg::BrowserMode::Never => Self::Never,
crawlberg::BrowserMode::Stealth => Self::Stealth,
}
}
}
/// Maps each `BrowserWait` variant onto the `crawlberg::BrowserWait` variant of the same name.
impl From<BrowserWait> for crawlberg::BrowserWait {
    fn from(val: BrowserWait) -> Self {
        match val {
            BrowserWait::NetworkIdle => crawlberg::BrowserWait::NetworkIdle,
            BrowserWait::Selector => crawlberg::BrowserWait::Selector,
            BrowserWait::Fixed => crawlberg::BrowserWait::Fixed,
        }
    }
}
impl From<crawlberg::BrowserWait> for BrowserWait {
fn from(val: crawlberg::BrowserWait) -> Self {
match val {
crawlberg::BrowserWait::NetworkIdle => Self::NetworkIdle,
crawlberg::BrowserWait::Selector => Self::Selector,
crawlberg::BrowserWait::Fixed => Self::Fixed,
}
}
}
/// Maps each `BrowserBackend` variant onto the `crawlberg::BrowserBackend` variant of the same name.
impl From<BrowserBackend> for crawlberg::BrowserBackend {
    fn from(val: BrowserBackend) -> Self {
        match val {
            BrowserBackend::Chromiumoxide => crawlberg::BrowserBackend::Chromiumoxide,
            BrowserBackend::Native => crawlberg::BrowserBackend::Native,
        }
    }
}
impl From<crawlberg::BrowserBackend> for BrowserBackend {
fn from(val: crawlberg::BrowserBackend) -> Self {
match val {
crawlberg::BrowserBackend::Chromiumoxide => Self::Chromiumoxide,
crawlberg::BrowserBackend::Native => Self::Native,
}
}
}
impl From<AuthConfig> for crawlberg::AuthConfig {
fn from(val: AuthConfig) -> Self {
match val {
AuthConfig::Basic { username, password } => Self::Basic { username, password },
AuthConfig::Bearer { token } => Self::Bearer { token },
AuthConfig::Header { name, value } => Self::Header { name, value },
}
}
}
impl From<crawlberg::AuthConfig> for AuthConfig {
fn from(val: crawlberg::AuthConfig) -> Self {
match val {
crawlberg::AuthConfig::Basic { username, password } => Self::Basic {
username: username.to_string(),
password: password.to_string(),
},
crawlberg::AuthConfig::Bearer { token } => Self::Bearer {
token: token.to_string(),
},
crawlberg::AuthConfig::Header { name, value } => Self::Header {
name: name.to_string(),
value: value.to_string(),
},
}
}
}
/// Maps each `LinkType` variant onto the `crawlberg::LinkType` variant of the same name.
impl From<LinkType> for crawlberg::LinkType {
    fn from(val: LinkType) -> Self {
        match val {
            LinkType::Internal => crawlberg::LinkType::Internal,
            LinkType::External => crawlberg::LinkType::External,
            LinkType::Anchor => crawlberg::LinkType::Anchor,
            LinkType::Document => crawlberg::LinkType::Document,
        }
    }
}
impl From<crawlberg::LinkType> for LinkType {
fn from(val: crawlberg::LinkType) -> Self {
match val {
crawlberg::LinkType::Internal => Self::Internal,
crawlberg::LinkType::External => Self::External,
crawlberg::LinkType::Anchor => Self::Anchor,
crawlberg::LinkType::Document => Self::Document,
}
}
}
/// Maps each `ImageSource` variant onto the `crawlberg::ImageSource` variant of the same name.
impl From<ImageSource> for crawlberg::ImageSource {
    fn from(val: ImageSource) -> Self {
        match val {
            ImageSource::Img => crawlberg::ImageSource::Img,
            ImageSource::PictureSource => crawlberg::ImageSource::PictureSource,
            ImageSource::OgImage => crawlberg::ImageSource::OgImage,
            ImageSource::TwitterImage => crawlberg::ImageSource::TwitterImage,
        }
    }
}
impl From<crawlberg::ImageSource> for ImageSource {
fn from(val: crawlberg::ImageSource) -> Self {
match val {
crawlberg::ImageSource::Img => Self::Img,
crawlberg::ImageSource::PictureSource => Self::PictureSource,
crawlberg::ImageSource::OgImage => Self::OgImage,
crawlberg::ImageSource::TwitterImage => Self::TwitterImage,
}
}
}
/// Maps each `FeedType` variant onto the `crawlberg::FeedType` variant of the same name.
impl From<FeedType> for crawlberg::FeedType {
    fn from(val: FeedType) -> Self {
        match val {
            FeedType::Rss => crawlberg::FeedType::Rss,
            FeedType::Atom => crawlberg::FeedType::Atom,
            FeedType::JsonFeed => crawlberg::FeedType::JsonFeed,
        }
    }
}
impl From<crawlberg::FeedType> for FeedType {
fn from(val: crawlberg::FeedType) -> Self {
match val {
crawlberg::FeedType::Rss => Self::Rss,
crawlberg::FeedType::Atom => Self::Atom,
crawlberg::FeedType::JsonFeed => Self::JsonFeed,
}
}
}
/// Maps each `AssetCategory` variant onto the `crawlberg::AssetCategory` variant of the same name.
impl From<AssetCategory> for crawlberg::AssetCategory {
    fn from(val: AssetCategory) -> Self {
        match val {
            AssetCategory::Document => crawlberg::AssetCategory::Document,
            AssetCategory::Image => crawlberg::AssetCategory::Image,
            AssetCategory::Audio => crawlberg::AssetCategory::Audio,
            AssetCategory::Video => crawlberg::AssetCategory::Video,
            AssetCategory::Font => crawlberg::AssetCategory::Font,
            AssetCategory::Stylesheet => crawlberg::AssetCategory::Stylesheet,
            AssetCategory::Script => crawlberg::AssetCategory::Script,
            AssetCategory::Archive => crawlberg::AssetCategory::Archive,
            AssetCategory::Data => crawlberg::AssetCategory::Data,
            AssetCategory::Other => crawlberg::AssetCategory::Other,
        }
    }
}
impl From<crawlberg::AssetCategory> for AssetCategory {
fn from(val: crawlberg::AssetCategory) -> Self {
match val {
crawlberg::AssetCategory::Document => Self::Document,
crawlberg::AssetCategory::Image => Self::Image,
crawlberg::AssetCategory::Audio => Self::Audio,
crawlberg::AssetCategory::Video => Self::Video,
crawlberg::AssetCategory::Font => Self::Font,
crawlberg::AssetCategory::Stylesheet => Self::Stylesheet,
crawlberg::AssetCategory::Script => Self::Script,
crawlberg::AssetCategory::Archive => Self::Archive,
crawlberg::AssetCategory::Data => Self::Data,
crawlberg::AssetCategory::Other => Self::Other,
}
}
}
impl From<crawlberg::CrawlEvent> for CrawlEvent {
fn from(val: crawlberg::CrawlEvent) -> Self {
match val {
crawlberg::CrawlEvent::Page { result } => Self::Page {
result: (*result).into(),
},
crawlberg::CrawlEvent::Error { url, error } => Self::Error {
url: url.to_string(),
error: error.to_string(),
},
crawlberg::CrawlEvent::Complete { pages_crawled } => Self::Complete { pages_crawled },
}
}
}
impl From<PageAction> for crawlberg::PageAction {
fn from(val: PageAction) -> Self {
match val {
PageAction::Click { selector } => Self::Click { selector },
PageAction::TypeText { selector, text } => Self::TypeText { selector, text },
PageAction::Press { key } => Self::Press { key },
PageAction::Scroll {
direction,
selector,
amount,
} => Self::Scroll {
direction: direction.into(),
selector,
amount,
},
PageAction::Wait { milliseconds, selector } => Self::Wait { milliseconds, selector },
PageAction::Screenshot { full_page } => Self::Screenshot { full_page },
PageAction::ExecuteJs { script } => Self::ExecuteJs { script },
PageAction::Scrape => Self::Scrape,
}
}
}
impl From<crawlberg::PageAction> for PageAction {
fn from(val: crawlberg::PageAction) -> Self {
match val {
crawlberg::PageAction::Click { selector } => Self::Click {
selector: selector.to_string(),
},
crawlberg::PageAction::TypeText { selector, text } => Self::TypeText {
selector: selector.to_string(),
text: text.to_string(),
},
crawlberg::PageAction::Press { key } => Self::Press { key: key.to_string() },
crawlberg::PageAction::Scroll {
direction,
selector,
amount,
} => Self::Scroll {
direction: direction.into(),
selector: selector.map(|v| v.to_string()),
amount,
},
crawlberg::PageAction::Wait { milliseconds, selector } => Self::Wait {
milliseconds,
selector: selector.map(|v| v.to_string()),
},
crawlberg::PageAction::Screenshot { full_page } => Self::Screenshot { full_page },
crawlberg::PageAction::ExecuteJs { script } => Self::ExecuteJs {
script: script.to_string(),
},
crawlberg::PageAction::Scrape => Self::Scrape,
}
}
}
/// Maps each `ScrollDirection` variant onto the `crawlberg::ScrollDirection` variant of the same name.
impl From<ScrollDirection> for crawlberg::ScrollDirection {
    fn from(val: ScrollDirection) -> Self {
        match val {
            ScrollDirection::Up => crawlberg::ScrollDirection::Up,
            ScrollDirection::Down => crawlberg::ScrollDirection::Down,
        }
    }
}
impl From<crawlberg::ScrollDirection> for ScrollDirection {
fn from(val: crawlberg::ScrollDirection) -> Self {
match val {
crawlberg::ScrollDirection::Up => Self::Up,
crawlberg::ScrollDirection::Down => Self::Down,
}
}
}
/// Renders a `crawlberg::CrawlError` as a `String` (via `to_string()`) for use as a Rustler error value.
#[allow(dead_code)]
fn crawl_error_to_rustler_err(e: crawlberg::CrawlError) -> String {
    ToString::to_string(&e)
}
/// Renders a `crawlberg::SsrfError` as a `String` (via `to_string()`) for use as a Rustler error value.
#[allow(dead_code)]
fn ssrf_error_to_rustler_err(e: crawlberg::SsrfError) -> String {
    ToString::to_string(&e)
}
/// Decodes a JSON document into an [`ExtractionMeta`].
///
/// The text is deserialized as `crawlberg::ExtractionMeta` and then converted
/// with `ExtractionMeta::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn extraction_meta_from_json(json: String) -> Result<ExtractionMeta, String> {
    match serde_json::from_str::<crawlberg::ExtractionMeta>(&json) {
        Ok(parsed) => Ok(ExtractionMeta::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`ProxyConfig`].
///
/// The text is deserialized as `crawlberg::ProxyConfig` and then converted
/// with `ProxyConfig::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn proxy_config_from_json(json: String) -> Result<ProxyConfig, String> {
    match serde_json::from_str::<crawlberg::ProxyConfig>(&json) {
        Ok(parsed) => Ok(ProxyConfig::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`ContentConfig`].
///
/// The text is deserialized as `crawlberg::ContentConfig` and then converted
/// with `ContentConfig::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn content_config_from_json(json: String) -> Result<ContentConfig, String> {
    match serde_json::from_str::<crawlberg::ContentConfig>(&json) {
        Ok(parsed) => Ok(ContentConfig::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BrowserConfig`].
///
/// The text is deserialized as `crawlberg::BrowserConfig` and then converted
/// with `BrowserConfig::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn browser_config_from_json(json: String) -> Result<BrowserConfig, String> {
    match serde_json::from_str::<crawlberg::BrowserConfig>(&json) {
        Ok(parsed) => Ok(BrowserConfig::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CrawlConfig`].
///
/// The text is deserialized as `crawlberg::CrawlConfig` and then converted
/// with `CrawlConfig::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn crawl_config_from_json(json: String) -> Result<CrawlConfig, String> {
    match serde_json::from_str::<crawlberg::CrawlConfig>(&json) {
        Ok(parsed) => Ok(CrawlConfig::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BrowserExtras`].
///
/// The text is deserialized as `crawlberg::BrowserExtras` and then converted
/// with `BrowserExtras::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn browser_extras_from_json(json: String) -> Result<BrowserExtras, String> {
    match serde_json::from_str::<crawlberg::BrowserExtras>(&json) {
        Ok(parsed) => Ok(BrowserExtras::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`DownloadedDocument`].
///
/// The text is deserialized as `crawlberg::DownloadedDocument` and then
/// converted with `DownloadedDocument::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn downloaded_document_from_json(json: String) -> Result<DownloadedDocument, String> {
    match serde_json::from_str::<crawlberg::DownloadedDocument>(&json) {
        Ok(parsed) => Ok(DownloadedDocument::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into an [`InteractionResult`].
///
/// The text is deserialized as `crawlberg::InteractionResult` and then
/// converted with `InteractionResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn interaction_result_from_json(json: String) -> Result<InteractionResult, String> {
    match serde_json::from_str::<crawlberg::InteractionResult>(&json) {
        Ok(parsed) => Ok(InteractionResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into an [`ActionResult`].
///
/// The text is deserialized as `crawlberg::ActionResult` and then converted
/// with `ActionResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn action_result_from_json(json: String) -> Result<ActionResult, String> {
    match serde_json::from_str::<crawlberg::ActionResult>(&json) {
        Ok(parsed) => Ok(ActionResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`ScrapeResult`].
///
/// The text is deserialized as `crawlberg::ScrapeResult` and then converted
/// with `ScrapeResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn scrape_result_from_json(json: String) -> Result<ScrapeResult, String> {
    match serde_json::from_str::<crawlberg::ScrapeResult>(&json) {
        Ok(parsed) => Ok(ScrapeResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CrawlPageResult`].
///
/// The text is deserialized as `crawlberg::CrawlPageResult` and then converted
/// with `CrawlPageResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn crawl_page_result_from_json(json: String) -> Result<CrawlPageResult, String> {
    match serde_json::from_str::<crawlberg::CrawlPageResult>(&json) {
        Ok(parsed) => Ok(CrawlPageResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CrawlResult`].
///
/// The text is deserialized as `crawlberg::CrawlResult` and then converted
/// with `CrawlResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn crawl_result_from_json(json: String) -> Result<CrawlResult, String> {
    match serde_json::from_str::<crawlberg::CrawlResult>(&json) {
        Ok(parsed) => Ok(CrawlResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`SitemapUrl`].
///
/// The text is deserialized as `crawlberg::SitemapUrl` and then converted
/// with `SitemapUrl::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn sitemap_url_from_json(json: String) -> Result<SitemapUrl, String> {
    match serde_json::from_str::<crawlberg::SitemapUrl>(&json) {
        Ok(parsed) => Ok(SitemapUrl::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`MapResult`].
///
/// The text is deserialized as `crawlberg::MapResult` and then converted
/// with `MapResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn map_result_from_json(json: String) -> Result<MapResult, String> {
    match serde_json::from_str::<crawlberg::MapResult>(&json) {
        Ok(parsed) => Ok(MapResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`MarkdownResult`].
///
/// The text is deserialized as `crawlberg::MarkdownResult` and then converted
/// with `MarkdownResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn markdown_result_from_json(json: String) -> Result<MarkdownResult, String> {
    match serde_json::from_str::<crawlberg::MarkdownResult>(&json) {
        Ok(parsed) => Ok(MarkdownResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`LinkInfo`].
///
/// The text is deserialized as `crawlberg::LinkInfo` and then converted
/// with `LinkInfo::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn link_info_from_json(json: String) -> Result<LinkInfo, String> {
    match serde_json::from_str::<crawlberg::LinkInfo>(&json) {
        Ok(parsed) => Ok(LinkInfo::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into an [`ImageInfo`].
///
/// The text is deserialized as `crawlberg::ImageInfo` and then converted
/// with `ImageInfo::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn image_info_from_json(json: String) -> Result<ImageInfo, String> {
    match serde_json::from_str::<crawlberg::ImageInfo>(&json) {
        Ok(parsed) => Ok(ImageInfo::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`FeedInfo`].
///
/// The text is deserialized as `crawlberg::FeedInfo` and then converted
/// with `FeedInfo::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn feed_info_from_json(json: String) -> Result<FeedInfo, String> {
    match serde_json::from_str::<crawlberg::FeedInfo>(&json) {
        Ok(parsed) => Ok(FeedInfo::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`JsonLdEntry`].
///
/// The text is deserialized as `crawlberg::JsonLdEntry` and then converted
/// with `JsonLdEntry::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn json_ld_entry_from_json(json: String) -> Result<JsonLdEntry, String> {
    match serde_json::from_str::<crawlberg::JsonLdEntry>(&json) {
        Ok(parsed) => Ok(JsonLdEntry::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CookieInfo`].
///
/// The text is deserialized as `crawlberg::CookieInfo` and then converted
/// with `CookieInfo::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn cookie_info_from_json(json: String) -> Result<CookieInfo, String> {
    match serde_json::from_str::<crawlberg::CookieInfo>(&json) {
        Ok(parsed) => Ok(CookieInfo::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`DownloadedAsset`].
///
/// The text is deserialized as `crawlberg::DownloadedAsset` and then converted
/// with `DownloadedAsset::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn downloaded_asset_from_json(json: String) -> Result<DownloadedAsset, String> {
    match serde_json::from_str::<crawlberg::DownloadedAsset>(&json) {
        Ok(parsed) => Ok(DownloadedAsset::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into an [`ArticleMetadata`].
///
/// The text is deserialized as `crawlberg::ArticleMetadata` and then converted
/// with `ArticleMetadata::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn article_metadata_from_json(json: String) -> Result<ArticleMetadata, String> {
    match serde_json::from_str::<crawlberg::ArticleMetadata>(&json) {
        Ok(parsed) => Ok(ArticleMetadata::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`HreflangEntry`].
///
/// The text is deserialized as `crawlberg::HreflangEntry` and then converted
/// with `HreflangEntry::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn hreflang_entry_from_json(json: String) -> Result<HreflangEntry, String> {
    match serde_json::from_str::<crawlberg::HreflangEntry>(&json) {
        Ok(parsed) => Ok(HreflangEntry::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`FaviconInfo`].
///
/// The text is deserialized as `crawlberg::FaviconInfo` and then converted
/// with `FaviconInfo::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn favicon_info_from_json(json: String) -> Result<FaviconInfo, String> {
    match serde_json::from_str::<crawlberg::FaviconInfo>(&json) {
        Ok(parsed) => Ok(FaviconInfo::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`HeadingInfo`].
///
/// The text is deserialized as `crawlberg::HeadingInfo` and then converted
/// with `HeadingInfo::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn heading_info_from_json(json: String) -> Result<HeadingInfo, String> {
    match serde_json::from_str::<crawlberg::HeadingInfo>(&json) {
        Ok(parsed) => Ok(HeadingInfo::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`ResponseMeta`].
///
/// The text is deserialized as `crawlberg::ResponseMeta` and then converted
/// with `ResponseMeta::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn response_meta_from_json(json: String) -> Result<ResponseMeta, String> {
    match serde_json::from_str::<crawlberg::ResponseMeta>(&json) {
        Ok(parsed) => Ok(ResponseMeta::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`PageMetadata`].
///
/// The text is deserialized as `crawlberg::PageMetadata` and then converted
/// with `PageMetadata::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn page_metadata_from_json(json: String) -> Result<PageMetadata, String> {
    match serde_json::from_str::<crawlberg::PageMetadata>(&json) {
        Ok(parsed) => Ok(PageMetadata::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CrawlStreamRequest`].
///
/// The text is deserialized as `crawlberg::CrawlStreamRequest` and then
/// converted with `CrawlStreamRequest::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn crawl_stream_request_from_json(json: String) -> Result<CrawlStreamRequest, String> {
    match serde_json::from_str::<crawlberg::CrawlStreamRequest>(&json) {
        Ok(parsed) => Ok(CrawlStreamRequest::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BatchCrawlStreamRequest`].
///
/// The text is deserialized as `crawlberg::BatchCrawlStreamRequest` and then
/// converted with `BatchCrawlStreamRequest::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn batch_crawl_stream_request_from_json(json: String) -> Result<BatchCrawlStreamRequest, String> {
    match serde_json::from_str::<crawlberg::BatchCrawlStreamRequest>(&json) {
        Ok(parsed) => Ok(BatchCrawlStreamRequest::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CitationResult`].
///
/// The text is deserialized as `crawlberg::CitationResult` and then converted
/// with `CitationResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn citation_result_from_json(json: String) -> Result<CitationResult, String> {
    match serde_json::from_str::<crawlberg::CitationResult>(&json) {
        Ok(parsed) => Ok(CitationResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`CitationReference`].
///
/// The text is deserialized as `crawlberg::CitationReference` and then
/// converted with `CitationReference::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn citation_reference_from_json(json: String) -> Result<CitationReference, String> {
    match serde_json::from_str::<crawlberg::CitationReference>(&json) {
        Ok(parsed) => Ok(CitationReference::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BatchScrapeResult`].
///
/// The text is deserialized as `crawlberg::BatchScrapeResult` and then
/// converted with `BatchScrapeResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn batch_scrape_result_from_json(json: String) -> Result<BatchScrapeResult, String> {
    match serde_json::from_str::<crawlberg::BatchScrapeResult>(&json) {
        Ok(parsed) => Ok(BatchScrapeResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BatchCrawlResult`].
///
/// The text is deserialized as `crawlberg::BatchCrawlResult` and then
/// converted with `BatchCrawlResult::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn batch_crawl_result_from_json(json: String) -> Result<BatchCrawlResult, String> {
    match serde_json::from_str::<crawlberg::BatchCrawlResult>(&json) {
        Ok(parsed) => Ok(BatchCrawlResult::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BatchScrapeResults`].
///
/// The text is deserialized as `crawlberg::BatchScrapeResults` and then
/// converted with `BatchScrapeResults::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn batch_scrape_results_from_json(json: String) -> Result<BatchScrapeResults, String> {
    match serde_json::from_str::<crawlberg::BatchScrapeResults>(&json) {
        Ok(parsed) => Ok(BatchScrapeResults::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into a [`BatchCrawlResults`].
///
/// The text is deserialized as `crawlberg::BatchCrawlResults` and then
/// converted with `BatchCrawlResults::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn batch_crawl_results_from_json(json: String) -> Result<BatchCrawlResults, String> {
    match serde_json::from_str::<crawlberg::BatchCrawlResults>(&json) {
        Ok(parsed) => Ok(BatchCrawlResults::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Decodes a JSON document into an [`SsrfPolicy`].
///
/// The text is deserialized as `crawlberg::SsrfPolicy` and then converted
/// with `SsrfPolicy::from`.
///
/// # Errors
///
/// Returns the `serde_json` error rendered as a `String` when `json` cannot be
/// deserialized.
#[rustler::nif]
pub fn ssrf_policy_from_json(json: String) -> Result<SsrfPolicy, String> {
    match serde_json::from_str::<crawlberg::SsrfPolicy>(&json) {
        Ok(parsed) => Ok(SsrfPolicy::from(parsed)),
        Err(err) => Err(err.to_string()),
    }
}
/// Load callback handed to `rustler::init!` (as `load = on_load`).
///
/// Registers the three resource types used by this module with `env`:
/// `CrawlEngineHandle`, `CrawlEngineHandleCrawlStreamHandle` and
/// `CrawlEngineHandleBatchCrawlStreamHandle`. The `_info` term is ignored.
///
/// Returns `true` once every registration has succeeded.
///
/// # Panics
///
/// Panics with a "Failed to register resource type ..." message if any of the
/// registrations fails.
fn on_load(env: rustler::Env, _info: rustler::Term) -> bool {
// Each registration is checked with `expect`, so a failure aborts loading
// with a message naming the offending resource type.
env.register::<CrawlEngineHandle>()
.expect("Failed to register resource type CrawlEngineHandle");
env.register::<CrawlEngineHandleCrawlStreamHandle>()
.expect("Failed to register resource type CrawlEngineHandleCrawlStreamHandle");
env.register::<CrawlEngineHandleBatchCrawlStreamHandle>()
.expect("Failed to register resource type CrawlEngineHandleBatchCrawlStreamHandle");
true
}
// Initialise the NIF library under the module name `Elixir.Crawlberg.Native`,
// using `on_load` as its load callback.
rustler::init!("Elixir.Crawlberg.Native", load = on_load);