#![warn(clippy::all)]
use std::path::PathBuf;
use std::sync::Arc;
use rayon::prelude::*;
use spdf_convert::{ConversionResult, convert_path_to_pdf};
use spdf_ocr::{HttpOcrEngine, OcrEngine, OcrOptions, OcrResult};
use spdf_output::{format_text, to_json};
use spdf_pdf::{ExtractOptions, PageData, PdfDocumentHandle, PdfEngine, PdfiumEngine};
use spdf_processing::bbox::build_bounding_boxes;
use spdf_processing::text_utils::clean_ocr_table_artifacts;
use spdf_projection::{PageInput, project_pages_to_grid};
use spdf_types::{
Language, ParseConfig, ParseInput, ParseResult, ParsedPage, ScreenshotResult, SpdfError,
SpdfResult, TextItem,
};
use tracing::{debug, info, warn};
pub use spdf_types::OutputFormat;
/// High-level PDF parser combining Pdfium-based text extraction, optional
/// OCR for rasterised pages, and grid projection into a final `ParseResult`.
pub struct SpdfParser {
    config: ParseConfig,
    // Shared (`Arc`) so parallel workers can render/extract concurrently.
    pdf_engine: Arc<PdfiumEngine>,
    // Present only when OCR is enabled and a backend could be resolved;
    // see `build_ocr_engine`.
    ocr_engine: Option<Arc<dyn OcrEngine>>,
}
impl SpdfParser {
    /// Builds a parser from `config`, constructing a fresh Pdfium engine and
    /// resolving an OCR backend via `build_ocr_engine` (may be `None`).
    pub fn new(config: ParseConfig) -> Self {
        let ocr_engine = build_ocr_engine(&config);
        Self {
            config,
            pdf_engine: Arc::new(PdfiumEngine::new()),
            ocr_engine,
        }
    }

    /// Replaces the OCR engine, e.g. to inject a mock or custom backend.
    pub fn with_ocr_engine(mut self, engine: Arc<dyn OcrEngine>) -> Self {
        self.ocr_engine = Some(engine);
        self
    }

    /// Entry point for fluent configuration: `SpdfParser::builder()…build()`.
    pub fn builder() -> ParseConfigBuilder {
        ParseConfigBuilder::default()
    }

    /// Read-only access to the active configuration.
    pub fn config(&self) -> &ParseConfig {
        &self.config
    }

    /// Parses `input` (bytes or a path) into a complete `ParseResult`.
    /// Thin wrapper that normalises the input type before `parse_inner`.
    pub fn parse(&self, input: impl Into<ParseInput>) -> SpdfResult<ParseResult> {
        self.parse_inner(input.into())
    }

    /// Core pipeline: materialise input → load PDF → extract pages in
    /// parallel → optionally merge OCR text → project onto a text grid →
    /// optionally attach bounding boxes / JSON.
    fn parse_inner(&self, input: ParseInput) -> SpdfResult<ParseResult> {
        // Optional wall-clock budget. It is only polled *between* stages,
        // so a single long-running stage can still overrun the timeout.
        let deadline = self
            .config
            .timeout_secs
            .map(|s| std::time::Instant::now() + std::time::Duration::from_secs(s));
        let check_deadline = |stage: &str| -> SpdfResult<()> {
            if let Some(d) = deadline {
                if std::time::Instant::now() >= d {
                    return Err(SpdfError::InvalidInput(format!(
                        "spdf: timeout exceeded during {stage}"
                    )));
                }
            }
            Ok(())
        };
        // Size cap is enforced for in-memory input only.
        // NOTE(review): path-based input bypasses max_input_bytes — confirm
        // whether that is intentional.
        if let (ParseInput::Bytes(b), Some(cap)) = (&input, self.config.max_input_bytes) {
            if b.len() as u64 > cap {
                return Err(SpdfError::InvalidInput(format!(
                    "spdf: input {} bytes exceeds max_input_bytes {cap}",
                    b.len()
                )));
            }
        }
        let materialised = self.materialise(input)?;
        let bytes = match materialised {
            Materialised::Pdf { bytes, .. } => bytes,
            // Input that converted to plain text short-circuits the pipeline.
            Materialised::PlainText(content) => return Ok(plain_text_result(content)),
        };
        check_deadline("load")?;
        let doc = self
            .pdf_engine
            .load_bytes(&bytes, self.config.password.as_deref())?;
        // Clamp to max_pages before resolving the user's page selection.
        let total_pages = doc.num_pages().min(self.config.max_pages);
        info!(pages = total_pages, "spdf: parsing");
        let page_numbers = select_pages(total_pages, self.config.target_pages.as_deref())?;
        debug!(selected = page_numbers.len(), "spdf: page set selected");
        // Images are only extracted to decide which pages need OCR.
        let opts = ExtractOptions {
            extract_images: self.config.ocr_enabled,
        };
        let pdf_engine = Arc::clone(&self.pdf_engine);
        // Extract pages in parallel; the fallible collect short-circuits on
        // the first extraction error.
        let mut page_datas: Vec<PageData> = page_numbers
            .par_iter()
            .map(|&page_num| pdf_engine.extract_page(&doc, page_num, opts))
            .collect::<SpdfResult<Vec<_>>>()?;
        check_deadline("extract")?;
        if self.config.ocr_enabled {
            if let Some(ocr) = self.ocr_engine.as_ref() {
                self.run_ocr(&doc, &mut page_datas, ocr.as_ref())?;
            } else {
                // OCR requested but no engine available: warn once per
                // process and continue with native text only.
                warn_no_ocr_engine();
            }
        }
        check_deadline("ocr")?;
        let pages: Vec<PageInput> = page_datas
            .into_iter()
            .map(|p| PageInput {
                page_num: p.page_num,
                width: p.width,
                height: p.height,
                text_items: p.text_items,
            })
            .collect();
        // Layout step: position text items onto a character grid per page.
        let mut processed: Vec<ParsedPage> = project_pages_to_grid(pages, &self.config);
        if self.config.precise_bounding_box {
            for page in processed.iter_mut() {
                page.bounding_boxes = Some(build_bounding_boxes(&page.text_items));
            }
        }
        // Whole-document text: page texts joined by a blank line.
        let full_text = processed
            .iter()
            .map(|p| p.text.as_str())
            .collect::<Vec<_>>()
            .join("\n\n");
        let mut result = ParseResult {
            pages: processed,
            text: full_text,
            json: None,
        };
        // Pre-compute the JSON payload only when the caller asked for JSON.
        if matches!(self.config.output_format, OutputFormat::Json) {
            result.json = Some(to_json(&result));
        }
        self.pdf_engine.close(doc)?;
        Ok(result)
    }

    /// Runs full-page OCR on pages that look rasterised and merges the
    /// recognised text into each page's `text_items`.
    ///
    /// A page is OCR'd when its native text is under 100 bytes or it embeds
    /// at least one image. Render and OCR failures are logged and degrade to
    /// "no results" for that page (best-effort); they are never fatal.
    fn run_ocr(
        &self,
        doc: &<PdfiumEngine as PdfEngine>::Doc,
        pages: &mut [PageData],
        ocr: &dyn OcrEngine,
    ) -> SpdfResult<()> {
        let languages: Vec<String> = match &self.config.ocr_language {
            Language::Single(s) => vec![s.clone()],
            Language::Multiple(v) => v.clone(),
        };
        let options = OcrOptions {
            languages,
            correct_rotation: true,
            dpi: Some(self.config.dpi),
        };
        // OCR boxes arrive in pixels at the render DPI; 72/dpi maps them
        // back to PDF point space (72 points per inch).
        let scale_factor = 72.0 / self.config.dpi as f64;
        // Select (slice index, 1-based page number) pairs needing OCR.
        let mut todo: Vec<(usize, u32)> = Vec::new();
        for (idx, page) in pages.iter().enumerate() {
            // Byte length (not char count) of all native text on the page.
            let text_length: usize = page.text_items.iter().map(|t| t.str.len()).sum();
            let needs_full_ocr = text_length < 100 || !page.images.is_empty();
            if needs_full_ocr {
                todo.push((idx, page.page_num));
            }
        }
        if todo.is_empty() {
            return Ok(());
        }
        // Dedicated pool so OCR parallelism is bounded by num_workers rather
        // than rayon's global pool size.
        let num_workers = self.config.num_workers.max(1);
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(num_workers)
            .thread_name(|i| format!("spdf-ocr-{i}"))
            .build()
            .map_err(|e| SpdfError::Ocr(format!("ocr thread pool: {e}")))?;
        let engine = self.pdf_engine.clone();
        let dpi = self.config.dpi;
        let results: Vec<(usize, Vec<OcrResult>)> = pool.install(|| {
            todo.par_iter()
                .map(|&(idx, page_num)| {
                    // Render the page to PNG; a failed render yields an empty
                    // result set for that page.
                    let image = match engine.render_page_png(doc, page_num, dpi) {
                        Ok(b) => b,
                        Err(e) => {
                            warn!(page = page_num, error = %e, "spdf: render for OCR failed");
                            return (idx, Vec::new());
                        }
                    };
                    match ocr.recognize(&image, &options) {
                        Ok(r) => (idx, r),
                        Err(e) => {
                            warn!(page = page_num, error = %e, "spdf: OCR failed");
                            (idx, Vec::new())
                        }
                    }
                })
                .collect()
        });
        // Merge pass: filter noise and append surviving OCR items.
        for (idx, ocr_results) in results {
            let page = &mut pages[idx];
            // Only compare against items that existed before this merge, so
            // freshly appended OCR items never suppress each other.
            let existing_len = page.text_items.len();
            let mut appended = 0usize;
            for r in ocr_results {
                // Low-confidence recognitions are dropped outright.
                if r.confidence <= 0.3 {
                    continue;
                }
                let [x1, y1, x2, y2] = r.bbox;
                let px = x1 * scale_factor;
                let py = y1 * scale_factor;
                let pw = (x2 - x1) * scale_factor;
                let ph = (y2 - y1) * scale_factor;
                // Degenerate boxes carry no usable geometry.
                if pw <= 0.0 || ph <= 0.0 {
                    continue;
                }
                // OCR text overlapping native text loses — the extractor's
                // version is authoritative.
                if overlaps_existing_text(&page.text_items[..existing_len], px, py, pw, ph) {
                    continue;
                }
                let cleaned = clean_ocr_table_artifacts(&r.text);
                let cleaned = strip_ocr_pipe_artifacts(&cleaned);
                if cleaned.is_empty() || is_ocr_punctuation_noise(&cleaned) {
                    continue;
                }
                let mut item = TextItem::new(cleaned, px, py, pw, ph);
                // Mark provenance; box height doubles as a rough font size.
                item.font_name = Some("OCR".into());
                item.font_size = Some(ph);
                // Confidence rounded to three decimal places.
                item.confidence = Some((r.confidence * 1000.0).round() / 1000.0);
                page.text_items.push(item);
                appended += 1;
            }
            debug!(page = page.page_num, appended, "spdf: OCR merged");
        }
        Ok(())
    }

    /// Lazily parses pages one at a time, yielding each `ParsedPage` as it
    /// is extracted and projected. OCR is never applied on this path, even
    /// when enabled (images are extracted but not recognised).
    ///
    /// NOTE(review): unlike `parse_inner`, the document handle is never
    /// explicitly closed here; presumably it is released on drop — confirm.
    pub fn stream<I: Into<ParseInput>>(
        &self,
        input: I,
    ) -> SpdfResult<Box<dyn Iterator<Item = SpdfResult<ParsedPage>> + '_>> {
        let bytes = match self.materialise(input.into())? {
            Materialised::Pdf { bytes, .. } => bytes,
            Materialised::PlainText(content) => {
                // Plain text becomes a single synthetic page.
                let page = plain_text_result(content).pages.remove(0);
                return Ok(Box::new(std::iter::once(Ok(page))));
            }
        };
        let doc = self
            .pdf_engine
            .load_bytes(&bytes, self.config.password.as_deref())?;
        let total = doc.num_pages().min(self.config.max_pages);
        let page_numbers = select_pages(total, self.config.target_pages.as_deref())?;
        let opts = ExtractOptions {
            extract_images: self.config.ocr_enabled,
        };
        let engine = Arc::clone(&self.pdf_engine);
        let precise_bbox = self.config.precise_bounding_box;
        let debug_on = self.config.debug.as_ref().is_some_and(|d| d.enabled);
        // Owned copy of the config for the `move` closure below.
        let cfg = self.config.clone();
        let iter = page_numbers.into_iter().map(move |page_num| {
            let pd = engine.extract_page(&doc, page_num, opts)?;
            // Project this single page through the same grid pipeline used
            // by `parse_inner`.
            let pages = spdf_projection::project_pages_to_grid(
                vec![spdf_projection::PageInput {
                    page_num: pd.page_num,
                    width: pd.width,
                    height: pd.height,
                    text_items: pd.text_items,
                }],
                &cfg,
            );
            // Exactly one page went in, so exactly one comes out.
            let mut page = pages.into_iter().next().unwrap();
            if precise_bbox {
                page.bounding_boxes = Some(spdf_processing::bbox::build_bounding_boxes(
                    &page.text_items,
                ));
            }
            if debug_on {
                debug!(page = page.page_num, "spdf: streamed");
            }
            Ok(page)
        });
        Ok(Box::new(iter))
    }

    /// Renders pages to PNG screenshots at the configured DPI.
    ///
    /// `page_numbers` selects 1-based pages; `None` means every page. Note
    /// that `max_pages` / `target_pages` from the config are not applied.
    ///
    /// NOTE(review): `width`/`height` in the results are hardcoded to 0
    /// rather than taken from the rendered image — confirm callers do not
    /// rely on them.
    pub fn screenshot(
        &self,
        input: impl Into<ParseInput>,
        page_numbers: Option<Vec<u32>>,
    ) -> SpdfResult<Vec<ScreenshotResult>> {
        // Keep the temp-dir guard alive while holding bytes read from it.
        let (bytes, _temp) = match self.materialise(input.into())? {
            Materialised::Pdf { bytes, tempdir } => (bytes, tempdir),
            Materialised::PlainText(_) => {
                return Err(SpdfError::UnsupportedFormat(
                    "cannot screenshot plain-text input".into(),
                ));
            }
        };
        let doc = self
            .pdf_engine
            .load_bytes(&bytes, self.config.password.as_deref())?;
        let total = doc.num_pages();
        let targets = page_numbers.unwrap_or_else(|| (1..=total).collect());
        let mut out = Vec::with_capacity(targets.len());
        for page_num in targets {
            let png = self
                .pdf_engine
                .render_page_png(&doc, page_num, self.config.dpi)?;
            out.push(ScreenshotResult {
                page_num,
                width: 0,
                height: 0,
                image_buffer: png,
                image_path: None,
            });
        }
        self.pdf_engine.close(doc)?;
        Ok(out)
    }

    /// Formats a `ParseResult` per the configured output format: plain text,
    /// or pretty-printed JSON (reusing `result.json` when already computed).
    pub fn format(&self, result: &ParseResult) -> String {
        match self.config.output_format {
            OutputFormat::Text => format_text(result),
            OutputFormat::Json => {
                let json = result.json.clone().unwrap_or_else(|| to_json(result));
                serde_json::to_string_pretty(&json).unwrap_or_default()
            }
        }
    }

    /// Normalises input to either raw PDF bytes or extracted plain text.
    ///
    /// Byte input is assumed to already be a PDF; path input goes through
    /// the converter, which may emit a PDF (possibly inside a temp dir that
    /// must stay alive while its file is read) or plain text.
    fn materialise(&self, input: ParseInput) -> SpdfResult<Materialised> {
        match input {
            ParseInput::Bytes(b) => Ok(Materialised::Pdf {
                bytes: b,
                tempdir: None,
            }),
            ParseInput::Path(p) => {
                match convert_path_to_pdf(&p, self.config.password.as_deref())? {
                    ConversionResult::Pdf {
                        pdf_path, _tempdir, ..
                    } => Ok(Materialised::Pdf {
                        bytes: std::fs::read(pdf_path)?,
                        tempdir: _tempdir,
                    }),
                    ConversionResult::PlainText { content } => Ok(Materialised::PlainText(content)),
                }
            }
        }
    }
}
/// Outcome of normalising arbitrary `ParseInput` into something parseable.
enum Materialised {
    /// Raw PDF bytes. `tempdir` holds the temp directory created by a file
    /// conversion (if any) so its cleanup is deferred until this value drops.
    Pdf {
        bytes: Vec<u8>,
        #[allow(dead_code)]
        tempdir: Option<tempfile::TempDir>,
    },
    /// Input that converted to plain text rather than a PDF.
    PlainText(String),
}
/// Wraps already-plain text in a `ParseResult` shaped like a one-page parse:
/// a single synthetic page (zeroed geometry) carrying the whole text, with
/// the JSON payload always pre-computed.
fn plain_text_result(content: String) -> ParseResult {
    // One text item spanning the page; geometry is unknown, hence zeros.
    let text_items = vec![TextItem::new(&content, 0.0, 0.0, 0.0, 0.0)];
    let pages = vec![ParsedPage {
        page_num: 1,
        width: 0.0,
        height: 0.0,
        text: content.clone(),
        text_items,
        bounding_boxes: None,
    }];
    let mut out = ParseResult {
        pages,
        text: content,
        json: None,
    };
    out.json = Some(to_json(&out));
    out
}
/// Resolves a page-selection spec like `"1-3,5,10-11"` against a document of
/// `total_pages` pages. `None` selects every page. Out-of-range pages are
/// silently dropped; the result is sorted ascending with duplicates removed.
/// Non-numeric tokens yield `SpdfError::InvalidConfig`.
fn select_pages(total_pages: u32, target: Option<&str>) -> SpdfResult<Vec<u32>> {
    let spec = match target {
        Some(s) => s,
        None => return Ok((1..=total_pages).collect()),
    };
    let valid = 1..=total_pages;
    let mut selected = Vec::new();
    for token in spec.split(',') {
        let token = token.trim();
        if token.is_empty() {
            continue;
        }
        match token.split_once('-') {
            // Inclusive range, e.g. "10-11".
            Some((lo, hi)) => {
                let parse = |s: &str| {
                    s.trim()
                        .parse::<u32>()
                        .map_err(|_| SpdfError::InvalidConfig(format!("bad range: {token}")))
                };
                let (lo, hi) = (parse(lo)?, parse(hi)?);
                selected.extend((lo..=hi).filter(|p| valid.contains(p)));
            }
            // Single page number.
            None => {
                let p: u32 = token
                    .parse()
                    .map_err(|_| SpdfError::InvalidConfig(format!("bad page: {token}")))?;
                if valid.contains(&p) {
                    selected.push(p);
                }
            }
        }
    }
    selected.sort_unstable();
    selected.dedup();
    Ok(selected)
}
/// Fluent builder for `ParseConfig`; finish with `config()` for the raw
/// configuration or `build()` for a ready-to-use `SpdfParser`.
#[derive(Debug, Default)]
pub struct ParseConfigBuilder {
    config: ParseConfig,
}
impl ParseConfigBuilder {
    /// Applies `f` to the wrapped config and hands the builder back; every
    /// public setter below is a one-line delegation to this helper.
    fn tweak(mut self, f: impl FnOnce(&mut ParseConfig)) -> Self {
        f(&mut self.config);
        self
    }
    /// Enables or disables OCR.
    pub fn ocr_enabled(self, on: bool) -> Self {
        self.tweak(|c| c.ocr_enabled = on)
    }
    /// Uses an HTTP OCR server at `url` instead of a local engine.
    pub fn ocr_server_url(self, url: impl Into<String>) -> Self {
        self.tweak(|c| c.ocr_server_url = Some(url.into()))
    }
    /// Render resolution in dots per inch.
    pub fn dpi(self, dpi: u32) -> Self {
        self.tweak(|c| c.dpi = dpi)
    }
    /// Output format (text or JSON).
    pub fn output_format(self, fmt: OutputFormat) -> Self {
        self.tweak(|c| c.output_format = fmt)
    }
    /// Upper bound on the number of pages parsed.
    pub fn max_pages(self, max: u32) -> Self {
        self.tweak(|c| c.max_pages = max)
    }
    /// Page-selection spec, e.g. `"1-3,5"`.
    pub fn target_pages(self, spec: impl Into<String>) -> Self {
        self.tweak(|c| c.target_pages = Some(spec.into()))
    }
    /// Worker-thread count for parallel stages.
    pub fn num_workers(self, n: usize) -> Self {
        self.tweak(|c| c.num_workers = n)
    }
    /// Password for encrypted PDFs.
    pub fn password(self, pw: impl Into<String>) -> Self {
        self.tweak(|c| c.password = Some(pw.into()))
    }
    /// Toggles per-page bounding-box computation.
    pub fn precise_bounding_box(self, on: bool) -> Self {
        self.tweak(|c| c.precise_bounding_box = on)
    }
    /// Wall-clock parse timeout in seconds.
    pub fn timeout_secs(self, secs: u64) -> Self {
        self.tweak(|c| c.timeout_secs = Some(secs))
    }
    /// Size cap for in-memory input.
    pub fn max_input_bytes(self, bytes: u64) -> Self {
        self.tweak(|c| c.max_input_bytes = Some(bytes))
    }
    /// Finishes the builder, yielding the bare configuration.
    pub fn config(self) -> ParseConfig {
        self.config
    }
    /// Finishes the builder, constructing a ready-to-use parser.
    pub fn build(self) -> SpdfParser {
        SpdfParser::new(self.config)
    }
}
/// Default location for a page screenshot: `<output_dir>/page-<N>.png`.
pub fn default_screenshot_path(output_dir: &std::path::Path, page_num: u32) -> PathBuf {
    let mut path = output_dir.to_path_buf();
    path.push(format!("page-{page_num}.png"));
    path
}
/// Resolves an OCR backend from `config`.
///
/// Priority: no engine when OCR is disabled; an HTTP engine when a server
/// URL is configured; otherwise the Tesseract engine when compiled with the
/// `tesseract` feature; else `None` (callers then warn via
/// `warn_no_ocr_engine`).
fn build_ocr_engine(config: &ParseConfig) -> Option<Arc<dyn OcrEngine>> {
    if !config.ocr_enabled {
        return None;
    }
    if let Some(url) = config.ocr_server_url.as_deref() {
        return Some(Arc::new(HttpOcrEngine::new(url)));
    }
    #[cfg(feature = "tesseract")]
    {
        return Some(Arc::new(spdf_ocr::TesseractEngine::new(
            config.tessdata_path.clone(),
        )));
    }
    #[cfg(not(feature = "tesseract"))]
    {
        // NOTE(review): `config` is already read above on every path, so this
        // suppression binding looks redundant — confirm and consider removing.
        let _ = config;
        None
    }
}
fn warn_no_ocr_engine() {
use std::sync::Once;
static ONCE: Once = Once::new();
ONCE.call_once(|| {
let tesseract_built = cfg!(feature = "tesseract");
let msg = if tesseract_built {
"spdf: OCR requested but no engine configured. This build supports \
Tesseract; install libtesseract + language data (e.g. \
`apt install tesseract-ocr tesseract-ocr-eng`) or pass \
--ocr-server-url to use an HTTP OCR server. Any rasterized text \
in the PDF will be missing from the output."
} else {
"spdf: OCR requested but no engine configured. Either pass \
--ocr-server-url <URL> to use an HTTP OCR server, or rebuild \
spdf with the `tesseract` feature (`cargo build --release \
-p spdf-cli --features tesseract`, requires libtesseract and \
libleptonica). Rasterized text will be missing from the output."
};
warn!("{msg}");
});
}
/// Axis-aligned overlap test between a candidate box (`x`, `y`, `w`, `h`)
/// and any existing text item, with a 2pt tolerance so near-touching boxes
/// also count as overlapping. Falls back to an item's `w`/`h` fields when
/// its `width`/`height` are non-positive.
fn overlaps_existing_text(items: &[TextItem], x: f64, y: f64, w: f64, h: f64) -> bool {
    const TOL: f64 = 2.0;
    let right = x + w;
    let bottom = y + h;
    items.iter().any(|it| {
        let eff_w = if it.width > 0.0 { it.width } else { it.w };
        let eff_h = if it.height > 0.0 { it.height } else { it.h };
        let hit_x = x < it.x + eff_w + TOL && right > it.x - TOL;
        let hit_y = y < it.y + eff_h + TOL && bottom > it.y - TOL;
        hit_x && hit_y
    })
}
/// Returns `true` when OCR output carries no useful content: the trimmed
/// text is empty or consists solely of non-alphanumeric characters
/// (punctuation, symbols, stray marks).
fn is_ocr_punctuation_noise(text: &str) -> bool {
    // An empty/whitespace-only string has no alphanumeric chars, so the
    // single `any` check already returns `true` for it — no separate
    // `is_empty` branch needed.
    !text.trim().chars().any(char::is_alphanumeric)
}
/// Removes table-border pipe characters OCR tends to leave at the edges of a
/// line: trims whitespace, strips leading/trailing `|`, then trims again so
/// the spaces that hugged the pipes disappear too. Interior pipes are kept.
fn strip_ocr_pipe_artifacts(text: &str) -> String {
    let without_ws = text.trim();
    let without_pipes = without_ws.trim_matches('|');
    without_pipes.trim().to_owned()
}
#[cfg(test)]
mod tests {
    use super::*;

    // With no spec, every page 1..=total is selected.
    #[test]
    fn select_pages_defaults_to_all() {
        assert_eq!(select_pages(3, None).unwrap(), vec![1, 2, 3]);
    }

    // Ranges and single pages can be mixed; output is sorted and deduped.
    #[test]
    fn select_pages_parses_mixed_spec() {
        let out = select_pages(20, Some("1-3,5,10-11")).unwrap();
        assert_eq!(out, vec![1, 2, 3, 5, 10, 11]);
    }

    // Non-numeric range endpoints surface as InvalidConfig.
    #[test]
    fn select_pages_rejects_bad_spec() {
        let err = select_pages(10, Some("1-abc")).unwrap_err();
        match err {
            SpdfError::InvalidConfig(msg) => assert!(msg.contains("bad range")),
            _ => panic!("expected InvalidConfig"),
        }
    }

    // Identical box collides, a far-away box does not, and a small box
    // nested inside an existing item (within tolerance) collides.
    #[test]
    fn overlap_detects_collision_with_existing_text() {
        let items = vec![TextItem::new("hi", 10.0, 20.0, 40.0, 12.0)];
        assert!(overlaps_existing_text(&items, 10.0, 20.0, 40.0, 12.0));
        assert!(!overlaps_existing_text(&items, 200.0, 200.0, 40.0, 12.0));
        assert!(overlaps_existing_text(&items, 11.0, 21.0, 1.0, 1.0));
    }
}