use crate::config::{LiteParseConfig, parse_target_pages};
#[cfg(not(target_arch = "wasm32"))]
use crate::conversion;
use crate::error::LiteParseError;
use crate::extract;
use crate::ocr::OcrEngine;
#[cfg(not(target_arch = "wasm32"))]
use crate::ocr::http_simple::HttpOcrEngine;
#[cfg(feature = "tesseract")]
use crate::ocr::tesseract::TesseractOcrEngine;
use crate::ocr_merge;
use crate::output::markdown;
use crate::projection;
#[cfg(not(target_arch = "wasm32"))]
use crate::render;
use crate::types::{ExtractedImage, OutlineTarget, Page, ParsedPage, PdfInput};
use pdfium::Library;
/// Output of a parse run, as returned by `LiteParse::parse_input` and
/// `LiteParse::parse_from_pages`.
pub struct ParseResult {
/// Per-page results produced by `projection::project_pages_to_grid`.
pub pages: Vec<ParsedPage>,
/// Whole-document text: the output of `markdown::format_markdown` when the configured
/// output format is Markdown, otherwise every page's `text` joined with `"\n\n"`.
pub text: String,
/// Document outline: extracted via `extract::extract_outline` in `parse_input`, or passed
/// through unchanged from the caller in `parse_from_pages`.
pub outline: Vec<OutlineTarget>,
/// Images returned by `extract::extract_pages_and_images`; always empty when the result
/// comes from `parse_from_pages`.
pub images: Vec<ExtractedImage>,
}
/// One rendered page produced by `LiteParse::screenshot_input`.
///
/// Every field is copied from the corresponding field of a page returned by
/// `render::render_pages_to_png`.
#[derive(Debug, Clone)]
pub struct ScreenshotResult {
/// Page number reported by the renderer (NOTE(review): presumably 1-based — confirm
/// against `render::render_pages_to_png`).
pub page_num: u32,
/// Width of the rendered image (presumably in pixels — confirm against the renderer).
pub width: u32,
/// Height of the rendered image (presumably in pixels — confirm against the renderer).
pub height: u32,
/// PNG bytes of the rendered page (taken from the renderer's `png_bytes`).
pub image_bytes: Vec<u8>,
}
/// Name of the environment variable read by `default_glyph_resolver`; its value is handed
/// to `crate::FontDbResolver::new`. Only defined on non-wasm32 targets.
#[cfg(not(target_arch = "wasm32"))]
const FONT_DB_DIR_ENV: &str = "LITEPARSE_FONT_DB_DIR";
/// Builds the glyph resolver used by `LiteParse::new` on native targets.
///
/// Returns a `crate::FontDbResolver` built from the value of the `FONT_DB_DIR_ENV`
/// environment variable, or `None` when the variable is unset or empty.
#[cfg(not(target_arch = "wasm32"))]
fn default_glyph_resolver() -> Option<std::sync::Arc<dyn crate::GlyphResolver>> {
    match std::env::var_os(FONT_DB_DIR_ENV) {
        // An empty value is treated exactly like an unset variable.
        Some(font_dir) if !font_dir.is_empty() => {
            let resolver: std::sync::Arc<dyn crate::GlyphResolver> =
                std::sync::Arc::new(crate::FontDbResolver::new(font_dir));
            Some(resolver)
        }
        _ => None,
    }
}
/// wasm32 variant of `default_glyph_resolver`: no environment lookup is performed and no
/// default resolver is provided, so this always returns `None`.
#[cfg(target_arch = "wasm32")]
fn default_glyph_resolver() -> Option<std::sync::Arc<dyn crate::GlyphResolver>> {
None
}
/// Parser entry point: holds the configuration plus optional pluggable OCR and glyph
/// resolution components.
pub struct LiteParse {
/// Settings consulted by every operation (password, page selection, OCR, output format, ...).
config: LiteParseConfig,
/// When set (via `with_ocr_engine`), `parse_input` uses this engine instead of choosing a
/// built-in one.
ocr_engine_override: Option<std::sync::Arc<dyn OcrEngine>>,
/// Optional resolver passed to `extract::extract_pages_and_images`; initialised from
/// `default_glyph_resolver()` and replaceable via `with_glyph_resolver`.
glyph_resolver: Option<std::sync::Arc<dyn crate::GlyphResolver>>,
}
impl LiteParse {
/// Creates a parser from `config` with no OCR engine override and the glyph resolver
/// returned by `default_glyph_resolver()`.
pub fn new(config: LiteParseConfig) -> Self {
    let glyph_resolver = default_glyph_resolver();
    Self {
        config,
        glyph_resolver,
        ocr_engine_override: None,
    }
}
pub fn with_ocr_engine(mut self, engine: std::sync::Arc<dyn OcrEngine>) -> Self {
self.ocr_engine_override = Some(engine);
self
}
pub fn with_glyph_resolver(
mut self,
resolver: std::sync::Arc<dyn crate::GlyphResolver>,
) -> Self {
self.glyph_resolver = Some(resolver);
self
}
/// Parses the configured `target_pages` specification, if any.
///
/// Returns `Ok(None)` when no specification is configured, `Ok(Some(pages))` when
/// `parse_target_pages` succeeds, and an error prefixed with `invalid --target-pages: `
/// when it fails.
fn resolve_target_pages(&self) -> Result<Option<Vec<u32>>, LiteParseError> {
    let Some(spec) = self.config.target_pages.as_ref() else {
        return Ok(None);
    };
    match parse_target_pages(spec) {
        Ok(page_list) => Ok(Some(page_list)),
        Err(e) => Err(format!("invalid --target-pages: {}", e).into()),
    }
}
/// Computes per-page complexity statistics for `input`.
///
/// On native targets the input is first passed through `conversion::resolve_pdf_input`; on
/// wasm32 it is used as-is. Pages are then extracted honouring the configured
/// `target_pages` and `max_pages`, with both boolean extraction options disabled, and each
/// extracted page is handed to `ocr_merge::calculate_page_complexity` together with its
/// pdfium page object.
///
/// Stage timings are written to stderr unless `config.quiet` is set.
///
/// # Errors
///
/// Fails if input resolution, `target_pages` parsing, document loading, page extraction,
/// a page lookup, or a complexity calculation fails.
pub async fn is_complex(
    &self,
    input: PdfInput,
) -> Result<Vec<ocr_merge::PageComplexityStats>, LiteParseError> {
    let log = |msg: &str| {
        if !self.config.quiet {
            eprintln!("{}", msg);
        }
    };
    let t0 = web_time::Instant::now();

    // `_guard` stays bound until the end of the function (presumably it keeps any
    // temporary converted file alive — confirm in `conversion::resolve_pdf_input`).
    #[cfg(not(target_arch = "wasm32"))]
    let (validated_input, _guard) =
        conversion::resolve_pdf_input(input, self.config.password.as_deref(), false).await?;
    #[cfg(target_arch = "wasm32")]
    let validated_input = input;

    let target_pages = self.resolve_target_pages()?;
    let password = self.config.password.as_deref();

    let lib = Library::init();
    let document = extract::load_document_from_input(&lib, &validated_input, password)?;
    let (pages, _) = extract::extract_pages_and_images(
        &document,
        target_pages.as_deref(),
        self.config.max_pages,
        false,
        false,
        self.glyph_resolver.as_deref(),
    )?;
    let t_extract = web_time::Instant::now();
    log(&format!(
        "[liteparse] extract: {:.1}ms ({} pages)",
        t_extract.duration_since(t0).as_secs_f64() * 1000.0,
        pages.len()
    ));

    let page_complexities = pages
        .iter()
        .map(|page| {
            // `page_number` is treated as 1-based here; `document.page` takes the
            // 0-based index.
            let page_obj = document.page((page.page_number - 1) as i32)?;
            ocr_merge::calculate_page_complexity(page, &page_obj)
        })
        .collect::<Result<Vec<_>, _>>()?;
    // Take the timestamp AFTER the computation so the logged duration covers the
    // complexity stage itself (same "time since previous stage" pattern as `parse_input`).
    let t_complexity = web_time::Instant::now();
    log(&format!(
        "[liteparse] complexity: {:.1}ms",
        t_complexity.duration_since(t_extract).as_secs_f64() * 1000.0
    ));
    Ok(page_complexities)
}
/// Parses the document at the filesystem path `input`.
///
/// Convenience wrapper that wraps the path in `PdfInput::Path` and delegates to
/// `parse_input`. Not available on wasm32.
#[cfg(not(target_arch = "wasm32"))]
pub async fn parse(&self, input: &str) -> Result<ParseResult, LiteParseError> {
    let source = PdfInput::Path(input.to_owned());
    self.parse_input(source).await
}
/// Runs the full parse pipeline on `input` and returns the assembled `ParseResult`.
///
/// Stages, in order:
/// 1. Resolve the input (native targets call `conversion::resolve_pdf_input`; wasm32 uses
///    the input as-is) and parse the configured `target_pages`.
/// 2. Pick an OCR engine when `config.ocr_enabled` is set.
/// 3. Load the document, extract the outline, pages and images, and — when OCR is
///    enabled — render the pages for OCR.
/// 4. Run OCR and merge its results into the pages (only when an engine was selected).
/// 5. Project the pages with `projection::project_pages_to_grid`.
/// 6. Build the full text: Markdown via `markdown::format_markdown`, or the page texts
///    joined with `"\n\n"`.
///
/// Stage timings are written to stderr unless `config.quiet` is set.
///
/// # Errors
///
/// Fails if any stage fails, and also when OCR is enabled but no engine can be selected:
/// on native targets with no override, no `ocr_server_url` and the `tesseract` feature
/// disabled; on wasm32 with no override.
pub async fn parse_input(&self, input: PdfInput) -> Result<ParseResult, LiteParseError> {
// Diagnostic logger: prints to stderr unless `quiet` is configured.
let log = |msg: &str| {
if !self.config.quiet {
eprintln!("{}", msg);
}
};
let t0 = web_time::Instant::now();
// `_guard` stays bound until the end of the function (presumably it keeps any temporary
// converted file alive — confirm in `conversion::resolve_pdf_input`).
#[cfg(not(target_arch = "wasm32"))]
let (validated_input, _guard) =
conversion::resolve_pdf_input(input, self.config.password.as_deref(), false).await?;
#[cfg(target_arch = "wasm32")]
let validated_input = input;
let target_pages = self.resolve_target_pages()?;
let password = self.config.password.as_deref();
// Image rendering during extraction is requested only for `ImageMode::Embed`.
let render_images = matches!(self.config.image_mode, crate::config::ImageMode::Embed);
// OCR engine selection. Precedence: explicit override, then (native only) the HTTP engine
// when `ocr_server_url` is set, then Tesseract when the `tesseract` feature is compiled in.
// If none applies the function returns an error.
let ocr_engine: Option<std::sync::Arc<dyn OcrEngine>> = if self.config.ocr_enabled {
Some(if let Some(e) = self.ocr_engine_override.clone() {
e
} else {
#[cfg(not(target_arch = "wasm32"))]
{
if let Some(ref url) = self.config.ocr_server_url {
std::sync::Arc::new(HttpOcrEngine::with_headers(
url.clone(),
self.config.ocr_server_headers.clone(),
))
} else {
#[cfg(feature = "tesseract")]
{
std::sync::Arc::new(TesseractOcrEngine::new(
self.config.tessdata_path.clone(),
))
}
#[cfg(not(feature = "tesseract"))]
{
return Err("OCR enabled but no --ocr-server-url provided and tesseract feature is disabled".into());
}
}
}
#[cfg(target_arch = "wasm32")]
{
return Err(
"OCR enabled but no `ocrEngine` callback was provided (WASM builds have no built-in OCR engine)".into(),
);
}
})
} else {
None
};
// Render in grayscale only when an engine was selected and it reports that preference.
let ocr_grayscale = ocr_engine.as_ref().is_some_and(|e| e.prefers_grayscale());
// `lib` and `document` are confined to this block, so both are dropped before the OCR
// `.await` below (NOTE(review): presumably so the pdfium handles are not held across an
// await point — confirm).
let (pages, ocr_rendered, outline, images) = {
let lib = Library::init();
let document = extract::load_document_from_input(&lib, &validated_input, password)?;
let outline = extract::extract_outline(&document);
// The fifth argument is `extract_links` gated on Markdown output.
let (pages, images) = extract::extract_pages_and_images(
&document,
target_pages.as_deref(),
self.config.max_pages,
render_images,
self.config.extract_links
&& self.config.output_format == crate::config::OutputFormat::Markdown,
self.glyph_resolver.as_deref(),
)?;
let t_extract = web_time::Instant::now();
log(&format!(
"[liteparse] extract: {:.1}ms ({} pages)",
t_extract.duration_since(t0).as_secs_f64() * 1000.0,
pages.len()
));
// Pages are rendered for OCR only when OCR is enabled; otherwise an empty list is used.
let rendered = if self.config.ocr_enabled {
let r = ocr_merge::render_pages_for_ocr(
&document,
&pages,
self.config.dpi,
ocr_grayscale,
)?;
log(&format!(
"[liteparse] ocr render: {:.1}ms ({} pages)",
web_time::Instant::now()
.duration_since(t_extract)
.as_secs_f64()
* 1000.0,
r.len()
));
r
} else {
Vec::new()
};
(pages, rendered, outline, images)
};
// Rebind mutably so OCR results can be merged into the pages in place.
let mut pages = pages;
let t1 = web_time::Instant::now();
if let Some(engine) = ocr_engine {
ocr_merge::ocr_and_merge_rendered(
&mut pages,
ocr_rendered,
self.config.dpi,
engine,
&self.config.ocr_language,
self.config.num_workers,
)
.await?;
}
// This line is logged even when OCR is disabled (the measured span is then just the
// skipped `if let`).
let t_ocr = web_time::Instant::now();
log(&format!(
"[liteparse] ocr: {:.1}ms",
t_ocr.duration_since(t1).as_secs_f64() * 1000.0
));
let parsed_pages = projection::project_pages_to_grid(pages);
let t2 = web_time::Instant::now();
log(&format!(
"[liteparse] project: {:.1}ms",
t2.duration_since(t_ocr).as_secs_f64() * 1000.0
));
// Full text: Markdown rendering, or the page texts joined with "\n\n".
let full_text = if self.config.output_format == crate::config::OutputFormat::Markdown {
let md = markdown::format_markdown(&parsed_pages, &outline, self.config.image_mode);
let t3 = web_time::Instant::now();
log(&format!(
"[liteparse] markdown: {:.1}ms",
t3.duration_since(t2).as_secs_f64() * 1000.0
));
md
} else {
parsed_pages
.iter()
.map(|p| p.text.as_str())
.collect::<Vec<_>>()
.join("\n\n")
};
let total = web_time::Instant::now().duration_since(t0).as_secs_f64() * 1000.0;
log(&format!("[liteparse] total: {:.1}ms", total));
Ok(ParseResult {
pages: parsed_pages,
text: full_text,
outline,
images,
})
}
/// Builds a `ParseResult` from already-extracted `pages` and a caller-supplied `outline`.
///
/// The pages are projected with `projection::project_pages_to_grid`. The text is then
/// either the Markdown rendering (when the configured output format is Markdown) or the
/// page texts joined with `"\n\n"`. The returned `images` list is always empty.
pub fn parse_from_pages(&self, pages: Vec<Page>, outline: Vec<OutlineTarget>) -> ParseResult {
    let projected = projection::project_pages_to_grid(pages);
    let wants_markdown = self.config.output_format == crate::config::OutputFormat::Markdown;

    let text = if wants_markdown {
        markdown::format_markdown(&projected, &outline, self.config.image_mode)
    } else {
        // Same result as collecting the page texts and joining them with "\n\n".
        let mut joined = String::new();
        for (idx, page) in projected.iter().enumerate() {
            if idx > 0 {
                joined.push_str("\n\n");
            }
            joined.push_str(page.text.as_str());
        }
        joined
    };

    ParseResult {
        pages: projected,
        text,
        outline,
        images: Vec::new(),
    }
}
/// Renders pages of the document at the filesystem path `input` to PNG.
///
/// Convenience wrapper that wraps the path in `PdfInput::Path` and delegates to
/// `screenshot_input`, forwarding `page_numbers` unchanged. Not available on wasm32.
#[cfg(not(target_arch = "wasm32"))]
pub async fn screenshot(
    &self,
    input: &str,
    page_numbers: Option<Vec<u32>>,
) -> Result<Vec<ScreenshotResult>, LiteParseError> {
    let source = PdfInput::Path(input.to_owned());
    self.screenshot_input(source, page_numbers).await
}
/// Renders pages of `input` to PNG images at the configured DPI.
///
/// The input is resolved through `conversion::resolve_pdf_input`, then rendered with
/// `render::render_pages_to_png`. `page_numbers` is forwarded to the renderer as an
/// optional slice. Each rendered page is repackaged as a `ScreenshotResult`.
///
/// When the resolved input is a path for which `conversion::is_pdf` returns false, a
/// note is written to stderr unless `config.quiet` is set.
///
/// # Errors
///
/// Fails if input resolution or rendering fails.
#[cfg(not(target_arch = "wasm32"))]
pub async fn screenshot_input(
    &self,
    input: PdfInput,
    page_numbers: Option<Vec<u32>>,
) -> Result<Vec<ScreenshotResult>, LiteParseError> {
    // `_guard` stays bound until the end of the function.
    let (validated_input, _guard) =
        conversion::resolve_pdf_input(input, self.config.password.as_deref(), true).await?;

    match &validated_input {
        PdfInput::Path(path) if !conversion::is_pdf(path) => {
            if !self.config.quiet {
                eprintln!("[liteparse] converted input to PDF for screenshot rendering");
            }
        }
        _ => {}
    }

    let rendered_pages = render::render_pages_to_png(
        &validated_input,
        page_numbers.as_deref(),
        self.config.dpi,
        self.config.password.as_deref(),
    )?;

    let mut screenshots = Vec::new();
    for rendered in rendered_pages {
        screenshots.push(ScreenshotResult {
            page_num: rendered.page_num,
            width: rendered.width,
            height: rendered.height,
            image_bytes: rendered.png_bytes,
        });
    }
    Ok(screenshots)
}
/// Returns a shared reference to the configuration this parser was created with.
pub fn config(&self) -> &LiteParseConfig {
&self.config
}
}
// Unit tests for the `LiteParse` constructor and accessor.
#[cfg(test)]
mod tests {
use super::*;
// Checks that `LiteParse::new` stores the given config and that `config()` exposes it.
#[test]
#[allow(clippy::field_reassign_with_default)]
fn test_new_stores_config() {
// Start from the defaults and override two fields so the assertions below can tell the
// stored config apart from a fresh default.
let mut cfg = LiteParseConfig::default();
cfg.ocr_enabled = false;
cfg.max_pages = 7;
let lp = LiteParse::new(cfg);
assert!(!lp.config().ocr_enabled);
assert_eq!(lp.config().max_pages, 7);
}
}