1#![warn(clippy::all)]
5
6use std::path::PathBuf;
7use std::sync::Arc;
8
9use rayon::prelude::*;
10use spdf_convert::{ConversionResult, convert_path_to_pdf};
11use spdf_ocr::{HttpOcrEngine, OcrEngine, OcrOptions, OcrResult};
12use spdf_output::{format_text, to_json};
13use spdf_pdf::{ExtractOptions, PageData, PdfDocumentHandle, PdfEngine, PdfiumEngine};
14use spdf_processing::bbox::build_bounding_boxes;
15use spdf_processing::text_utils::clean_ocr_table_artifacts;
16use spdf_projection::{PageInput, project_pages_to_grid};
17use spdf_types::{
18 Language, ParseConfig, ParseInput, ParseResult, ParsedPage, ScreenshotResult, SpdfError,
19 SpdfResult, TextItem,
20};
21use tracing::{debug, info, warn};
22
23pub use spdf_types::OutputFormat;
24
/// Entry point for parsing documents: owns the parse configuration, the PDF
/// engine, and an optional OCR engine.
pub struct SpdfParser {
    /// Settings applied to every call made through this parser.
    config: ParseConfig,
    /// PDF engine used to load documents, extract pages and render page images.
    pdf_engine: Arc<PdfiumEngine>,
    /// OCR backend; `None` when no engine has been configured.
    ocr_engine: Option<Arc<dyn OcrEngine>>,
}
31
32impl SpdfParser {
33 pub fn new(config: ParseConfig) -> Self {
36 let ocr_engine = build_ocr_engine(&config);
37 Self {
38 config,
39 pdf_engine: Arc::new(PdfiumEngine::new()),
40 ocr_engine,
41 }
42 }
43
44 pub fn with_ocr_engine(mut self, engine: Arc<dyn OcrEngine>) -> Self {
46 self.ocr_engine = Some(engine);
47 self
48 }
49
    /// Returns a [`ParseConfigBuilder`] initialised with default settings.
    pub fn builder() -> ParseConfigBuilder {
        ParseConfigBuilder::default()
    }
54
    /// Borrows the configuration this parser was created with.
    pub fn config(&self) -> &ParseConfig {
        &self.config
    }
58
    /// Parses `input` and returns the parsed pages together with the combined text.
    ///
    /// Accepts anything convertible into [`ParseInput`]; the work is done by
    /// `parse_inner`.
    ///
    /// # Errors
    /// Returns whatever error `parse_inner` reports.
    pub fn parse(&self, input: impl Into<ParseInput>) -> SpdfResult<ParseResult> {
        self.parse_inner(input.into())
    }
63
64 fn parse_inner(&self, input: ParseInput) -> SpdfResult<ParseResult> {
65 let deadline = self
66 .config
67 .timeout_secs
68 .map(|s| std::time::Instant::now() + std::time::Duration::from_secs(s));
69 let check_deadline = |stage: &str| -> SpdfResult<()> {
70 if let Some(d) = deadline {
71 if std::time::Instant::now() >= d {
72 return Err(SpdfError::InvalidInput(format!(
73 "spdf: timeout exceeded during {stage}"
74 )));
75 }
76 }
77 Ok(())
78 };
79 if let (ParseInput::Bytes(b), Some(cap)) = (&input, self.config.max_input_bytes) {
81 if b.len() as u64 > cap {
82 return Err(SpdfError::InvalidInput(format!(
83 "spdf: input {} bytes exceeds max_input_bytes {cap}",
84 b.len()
85 )));
86 }
87 }
88 let materialised = self.materialise(input)?;
89 let bytes = match materialised {
90 Materialised::Pdf { bytes, .. } => bytes,
91 Materialised::PlainText(content) => return Ok(plain_text_result(content)),
92 };
93 check_deadline("load")?;
94
95 let doc = self
96 .pdf_engine
97 .load_bytes(&bytes, self.config.password.as_deref())?;
98 let total_pages = doc.num_pages().min(self.config.max_pages);
99 info!(pages = total_pages, "spdf: parsing");
100
101 let page_numbers = select_pages(total_pages, self.config.target_pages.as_deref())?;
102 debug!(selected = page_numbers.len(), "spdf: page set selected");
103
104 let opts = ExtractOptions {
105 extract_images: self.config.ocr_enabled,
106 };
107
108 let pdf_engine = Arc::clone(&self.pdf_engine);
109 let mut page_datas: Vec<PageData> = page_numbers
110 .par_iter()
111 .map(|&page_num| pdf_engine.extract_page(&doc, page_num, opts))
112 .collect::<SpdfResult<Vec<_>>>()?;
113 check_deadline("extract")?;
114
115 if self.config.ocr_enabled {
119 if let Some(ocr) = self.ocr_engine.as_ref() {
120 self.run_ocr(&doc, &mut page_datas, ocr.as_ref())?;
121 } else {
122 warn_no_ocr_engine();
123 }
124 }
125 check_deadline("ocr")?;
126
127 let pages: Vec<PageInput> = page_datas
128 .into_iter()
129 .map(|p| PageInput {
130 page_num: p.page_num,
131 width: p.width,
132 height: p.height,
133 text_items: p.text_items,
134 })
135 .collect();
136
137 let mut processed: Vec<ParsedPage> = project_pages_to_grid(pages, &self.config);
138
139 if self.config.precise_bounding_box {
140 for page in processed.iter_mut() {
141 page.bounding_boxes = Some(build_bounding_boxes(&page.text_items));
142 }
143 }
144
145 let full_text = processed
146 .iter()
147 .map(|p| p.text.as_str())
148 .collect::<Vec<_>>()
149 .join("\n\n");
150
151 let mut result = ParseResult {
152 pages: processed,
153 text: full_text,
154 json: None,
155 };
156
157 if matches!(self.config.output_format, OutputFormat::Json) {
158 result.json = Some(to_json(&result));
159 }
160
161 self.pdf_engine.close(doc)?;
162 Ok(result)
163 }
164
165 fn run_ocr(
175 &self,
176 doc: &<PdfiumEngine as PdfEngine>::Doc,
177 pages: &mut [PageData],
178 ocr: &dyn OcrEngine,
179 ) -> SpdfResult<()> {
180 let languages: Vec<String> = match &self.config.ocr_language {
181 Language::Single(s) => vec![s.clone()],
182 Language::Multiple(v) => v.clone(),
183 };
184 let options = OcrOptions {
185 languages,
186 correct_rotation: true,
187 dpi: Some(self.config.dpi),
188 };
189 let scale_factor = 72.0 / self.config.dpi as f64;
192
193 let mut todo: Vec<(usize, u32)> = Vec::new();
197 for (idx, page) in pages.iter().enumerate() {
198 let text_length: usize = page.text_items.iter().map(|t| t.str.len()).sum();
199 let needs_full_ocr = text_length < 100 || !page.images.is_empty();
200 if needs_full_ocr {
201 todo.push((idx, page.page_num));
202 }
203 }
204 if todo.is_empty() {
205 return Ok(());
206 }
207
208 let num_workers = self.config.num_workers.max(1);
212 let pool = rayon::ThreadPoolBuilder::new()
213 .num_threads(num_workers)
214 .thread_name(|i| format!("spdf-ocr-{i}"))
215 .build()
216 .map_err(|e| SpdfError::Ocr(format!("ocr thread pool: {e}")))?;
217
218 let engine = self.pdf_engine.clone();
219 let dpi = self.config.dpi;
220 let results: Vec<(usize, Vec<OcrResult>)> = pool.install(|| {
221 todo.par_iter()
222 .map(|&(idx, page_num)| {
223 let image = match engine.render_page_png(doc, page_num, dpi) {
224 Ok(b) => b,
225 Err(e) => {
226 warn!(page = page_num, error = %e, "spdf: render for OCR failed");
227 return (idx, Vec::new());
228 }
229 };
230 match ocr.recognize(&image, &options) {
231 Ok(r) => (idx, r),
232 Err(e) => {
233 warn!(page = page_num, error = %e, "spdf: OCR failed");
234 (idx, Vec::new())
235 }
236 }
237 })
238 .collect()
239 });
240
241 for (idx, ocr_results) in results {
245 let page = &mut pages[idx];
246 let existing_len = page.text_items.len();
254 let mut appended = 0usize;
255 for r in ocr_results {
256 if r.confidence <= 0.3 {
257 continue;
258 }
259 let [x1, y1, x2, y2] = r.bbox;
260 let px = x1 * scale_factor;
261 let py = y1 * scale_factor;
262 let pw = (x2 - x1) * scale_factor;
263 let ph = (y2 - y1) * scale_factor;
264 if pw <= 0.0 || ph <= 0.0 {
265 continue;
266 }
267 if overlaps_existing_text(&page.text_items[..existing_len], px, py, pw, ph) {
268 continue;
269 }
270 let cleaned = clean_ocr_table_artifacts(&r.text);
271 let cleaned = strip_ocr_pipe_artifacts(&cleaned);
272 if cleaned.is_empty() || is_ocr_punctuation_noise(&cleaned) {
273 continue;
274 }
275 let mut item = TextItem::new(cleaned, px, py, pw, ph);
276 item.font_name = Some("OCR".into());
277 item.font_size = Some(ph);
278 item.confidence = Some((r.confidence * 1000.0).round() / 1000.0);
279 page.text_items.push(item);
280 appended += 1;
281 }
282 debug!(page = page.page_num, appended, "spdf: OCR merged");
283 }
284 Ok(())
285 }
286
287 pub fn stream<I: Into<ParseInput>>(
291 &self,
292 input: I,
293 ) -> SpdfResult<Box<dyn Iterator<Item = SpdfResult<ParsedPage>> + '_>> {
294 let bytes = match self.materialise(input.into())? {
295 Materialised::Pdf { bytes, .. } => bytes,
296 Materialised::PlainText(content) => {
297 let page = plain_text_result(content).pages.remove(0);
298 return Ok(Box::new(std::iter::once(Ok(page))));
299 }
300 };
301 let doc = self
302 .pdf_engine
303 .load_bytes(&bytes, self.config.password.as_deref())?;
304 let total = doc.num_pages().min(self.config.max_pages);
305 let page_numbers = select_pages(total, self.config.target_pages.as_deref())?;
306 let opts = ExtractOptions {
307 extract_images: self.config.ocr_enabled,
308 };
309 let engine = Arc::clone(&self.pdf_engine);
310 let precise_bbox = self.config.precise_bounding_box;
311 let debug_on = self.config.debug.as_ref().is_some_and(|d| d.enabled);
312 let cfg = self.config.clone();
313 let iter = page_numbers.into_iter().map(move |page_num| {
314 let pd = engine.extract_page(&doc, page_num, opts)?;
315 let pages = spdf_projection::project_pages_to_grid(
316 vec![spdf_projection::PageInput {
317 page_num: pd.page_num,
318 width: pd.width,
319 height: pd.height,
320 text_items: pd.text_items,
321 }],
322 &cfg,
323 );
324 let mut page = pages.into_iter().next().unwrap();
325 if precise_bbox {
326 page.bounding_boxes = Some(spdf_processing::bbox::build_bounding_boxes(
327 &page.text_items,
328 ));
329 }
330 if debug_on {
331 debug!(page = page.page_num, "spdf: streamed");
332 }
333 Ok(page)
334 });
335 Ok(Box::new(iter))
336 }
337
338 pub fn screenshot(
340 &self,
341 input: impl Into<ParseInput>,
342 page_numbers: Option<Vec<u32>>,
343 ) -> SpdfResult<Vec<ScreenshotResult>> {
344 let (bytes, _temp) = match self.materialise(input.into())? {
345 Materialised::Pdf { bytes, tempdir } => (bytes, tempdir),
346 Materialised::PlainText(_) => {
347 return Err(SpdfError::UnsupportedFormat(
348 "cannot screenshot plain-text input".into(),
349 ));
350 }
351 };
352 let doc = self
353 .pdf_engine
354 .load_bytes(&bytes, self.config.password.as_deref())?;
355 let total = doc.num_pages();
356 let targets = page_numbers.unwrap_or_else(|| (1..=total).collect());
357
358 let mut out = Vec::with_capacity(targets.len());
359 for page_num in targets {
360 let png = self
361 .pdf_engine
362 .render_page_png(&doc, page_num, self.config.dpi)?;
363 out.push(ScreenshotResult {
365 page_num,
366 width: 0,
367 height: 0,
368 image_buffer: png,
369 image_path: None,
370 });
371 }
372 self.pdf_engine.close(doc)?;
373 Ok(out)
374 }
375
376 pub fn format(&self, result: &ParseResult) -> String {
378 match self.config.output_format {
379 OutputFormat::Text => format_text(result),
380 OutputFormat::Json => {
381 let json = result.json.clone().unwrap_or_else(|| to_json(result));
382 serde_json::to_string_pretty(&json).unwrap_or_default()
383 }
384 }
385 }
386
387 fn materialise(&self, input: ParseInput) -> SpdfResult<Materialised> {
389 match input {
390 ParseInput::Bytes(b) => Ok(Materialised::Pdf {
391 bytes: b,
392 tempdir: None,
393 }),
394 ParseInput::Path(p) => {
395 match convert_path_to_pdf(&p, self.config.password.as_deref())? {
396 ConversionResult::Pdf {
397 pdf_path, _tempdir, ..
398 } => Ok(Materialised::Pdf {
399 bytes: std::fs::read(pdf_path)?,
400 tempdir: _tempdir,
401 }),
402 ConversionResult::PlainText { content } => Ok(Materialised::PlainText(content)),
403 }
404 }
405 }
406 }
407}
408
/// Result of turning a [`ParseInput`] into something the parser can consume.
enum Materialised {
    /// PDF content held fully in memory.
    Pdf {
        /// Raw bytes of the PDF.
        bytes: Vec<u8>,
        /// Temporary directory returned by the converter, if any; `None` for
        /// in-memory inputs. Most call sites ignore it, hence the lint allowance.
        // NOTE(review): presumably carried so converted files outlive the read — confirm.
        #[allow(dead_code)]
        tempdir: Option<tempfile::TempDir>,
    },
    /// Input that was plain text and needs no PDF processing.
    PlainText(String),
}
418
419fn plain_text_result(content: String) -> ParseResult {
423 let page = ParsedPage {
424 page_num: 1,
425 width: 0.0,
426 height: 0.0,
427 text: content.clone(),
428 text_items: vec![TextItem::new(&content, 0.0, 0.0, 0.0, 0.0)],
429 bounding_boxes: None,
430 };
431 let mut result = ParseResult {
432 pages: vec![page],
433 text: content,
434 json: None,
435 };
436 result.json = Some(to_json(&result));
437 result
438}
439
440fn select_pages(total_pages: u32, target: Option<&str>) -> SpdfResult<Vec<u32>> {
442 let Some(spec) = target else {
443 return Ok((1..=total_pages).collect());
444 };
445 let mut out = Vec::new();
446 for chunk in spec.split(',').map(str::trim).filter(|s| !s.is_empty()) {
447 if let Some((lo, hi)) = chunk.split_once('-') {
448 let lo: u32 = lo
449 .trim()
450 .parse()
451 .map_err(|_| SpdfError::InvalidConfig(format!("bad range: {chunk}")))?;
452 let hi: u32 = hi
453 .trim()
454 .parse()
455 .map_err(|_| SpdfError::InvalidConfig(format!("bad range: {chunk}")))?;
456 for p in lo..=hi {
457 if p >= 1 && p <= total_pages {
458 out.push(p);
459 }
460 }
461 } else {
462 let p: u32 = chunk
463 .parse()
464 .map_err(|_| SpdfError::InvalidConfig(format!("bad page: {chunk}")))?;
465 if p >= 1 && p <= total_pages {
466 out.push(p);
467 }
468 }
469 }
470 out.sort_unstable();
471 out.dedup();
472 Ok(out)
473}
474
475#[derive(Debug, Default)]
477pub struct ParseConfigBuilder {
478 config: ParseConfig,
479}
480
481impl ParseConfigBuilder {
482 pub fn ocr_enabled(mut self, on: bool) -> Self {
483 self.config.ocr_enabled = on;
484 self
485 }
486 pub fn ocr_server_url(mut self, url: impl Into<String>) -> Self {
487 self.config.ocr_server_url = Some(url.into());
488 self
489 }
490 pub fn dpi(mut self, dpi: u32) -> Self {
491 self.config.dpi = dpi;
492 self
493 }
494 pub fn output_format(mut self, fmt: OutputFormat) -> Self {
495 self.config.output_format = fmt;
496 self
497 }
498 pub fn max_pages(mut self, max: u32) -> Self {
499 self.config.max_pages = max;
500 self
501 }
502 pub fn target_pages(mut self, spec: impl Into<String>) -> Self {
503 self.config.target_pages = Some(spec.into());
504 self
505 }
506 pub fn num_workers(mut self, n: usize) -> Self {
507 self.config.num_workers = n;
508 self
509 }
510 pub fn password(mut self, pw: impl Into<String>) -> Self {
511 self.config.password = Some(pw.into());
512 self
513 }
514 pub fn precise_bounding_box(mut self, on: bool) -> Self {
515 self.config.precise_bounding_box = on;
516 self
517 }
518 pub fn timeout_secs(mut self, secs: u64) -> Self {
520 self.config.timeout_secs = Some(secs);
521 self
522 }
523 pub fn max_input_bytes(mut self, bytes: u64) -> Self {
525 self.config.max_input_bytes = Some(bytes);
526 self
527 }
528 pub fn config(self) -> ParseConfig {
529 self.config
530 }
531 pub fn build(self) -> SpdfParser {
532 SpdfParser::new(self.config)
533 }
534}
535
/// Returns the default screenshot location for `page_num`:
/// `<output_dir>/page-<page_num>.png`.
pub fn default_screenshot_path(output_dir: &std::path::Path, page_num: u32) -> PathBuf {
    let file_name = format!("page-{page_num}.png");
    let mut path = output_dir.to_path_buf();
    path.push(file_name);
    path
}
541
542fn build_ocr_engine(config: &ParseConfig) -> Option<Arc<dyn OcrEngine>> {
545 if !config.ocr_enabled {
546 return None;
547 }
548 if let Some(url) = config.ocr_server_url.as_deref() {
549 return Some(Arc::new(HttpOcrEngine::new(url)));
550 }
551 #[cfg(feature = "tesseract")]
552 {
553 return Some(Arc::new(spdf_ocr::TesseractEngine::new(
554 config.tessdata_path.clone(),
555 )));
556 }
557 #[cfg(not(feature = "tesseract"))]
558 {
559 let _ = config;
560 None
561 }
562}
563
564fn warn_no_ocr_engine() {
567 use std::sync::Once;
568 static ONCE: Once = Once::new();
569 ONCE.call_once(|| {
570 let tesseract_built = cfg!(feature = "tesseract");
571 let msg = if tesseract_built {
572 "spdf: OCR requested but no engine configured. This build supports \
573 Tesseract; install libtesseract + language data (e.g. \
574 `apt install tesseract-ocr tesseract-ocr-eng`) or pass \
575 --ocr-server-url to use an HTTP OCR server. Any rasterized text \
576 in the PDF will be missing from the output."
577 } else {
578 "spdf: OCR requested but no engine configured. Either pass \
579 --ocr-server-url <URL> to use an HTTP OCR server, or rebuild \
580 spdf with the `tesseract` feature (`cargo build --release \
581 -p spdf-cli --features tesseract`, requires libtesseract and \
582 libleptonica). Rasterized text will be missing from the output."
583 };
584 warn!("{msg}");
585 });
586}
587
588fn overlaps_existing_text(items: &[TextItem], x: f64, y: f64, w: f64, h: f64) -> bool {
591 const TOL: f64 = 2.0;
592 let right = x + w;
593 let bottom = y + h;
594 for it in items {
595 let iw = if it.width > 0.0 { it.width } else { it.w };
596 let ih = if it.height > 0.0 { it.height } else { it.h };
597 let ir = it.x + iw;
598 let ib = it.y + ih;
599 let overlap_x = x < ir + TOL && right > it.x - TOL;
600 let overlap_y = y < ib + TOL && bottom > it.y - TOL;
601 if overlap_x && overlap_y {
602 return true;
603 }
604 }
605 false
606}
607
/// Returns `true` when `text` contains no alphanumeric character at all, i.e. it is
/// empty or made up only of whitespace, punctuation and symbols.
fn is_ocr_punctuation_noise(text: &str) -> bool {
    // Whitespace is never alphanumeric, so empty and blank input fall out of the
    // same check.
    !text.chars().any(char::is_alphanumeric)
}
620
/// Removes pipe characters and whitespace from both ends of `text`.
///
/// Any mix of the two is stripped, so an edge such as `"| | Total"` becomes
/// `"Total"` rather than keeping a pipe. Pipes inside the text are left alone.
fn strip_ocr_pipe_artifacts(text: &str) -> String {
    text.trim_matches(|c: char| c == '|' || c.is_whitespace())
        .to_string()
}
627
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn select_pages_defaults_to_all() {
        assert_eq!(select_pages(3, None).unwrap(), vec![1, 2, 3]);
    }

    #[test]
    fn select_pages_parses_mixed_spec() {
        let out = select_pages(20, Some("1-3,5,10-11")).unwrap();
        assert_eq!(out, vec![1, 2, 3, 5, 10, 11]);
    }

    #[test]
    fn select_pages_rejects_bad_spec() {
        let err = select_pages(10, Some("1-abc")).unwrap_err();
        match err {
            SpdfError::InvalidConfig(msg) => assert!(msg.contains("bad range")),
            _ => panic!("expected InvalidConfig"),
        }
    }

    #[test]
    fn select_pages_drops_out_of_range_and_duplicate_pages() {
        // The range is cut at the last page, page 0 is ignored, repeats collapse.
        let out = select_pages(5, Some("3-10, 1, 3, 0")).unwrap();
        assert_eq!(out, vec![1, 3, 4, 5]);
    }

    #[test]
    fn overlap_detects_collision_with_existing_text() {
        let items = vec![TextItem::new("hi", 10.0, 20.0, 40.0, 12.0)];
        // Identical box.
        assert!(overlaps_existing_text(&items, 10.0, 20.0, 40.0, 12.0));
        // Far away from the existing item.
        assert!(!overlaps_existing_text(&items, 200.0, 200.0, 40.0, 12.0));
        // Small box fully inside the existing item.
        assert!(overlaps_existing_text(&items, 11.0, 21.0, 1.0, 1.0));
    }

    #[test]
    fn punctuation_noise_requires_an_alphanumeric_char() {
        assert!(is_ocr_punctuation_noise(""));
        assert!(is_ocr_punctuation_noise(" -- . "));
        assert!(!is_ocr_punctuation_noise("a."));
        assert!(!is_ocr_punctuation_noise(" 7 "));
    }

    #[test]
    fn strip_pipe_artifacts_trims_edge_pipes_only() {
        assert_eq!(strip_ocr_pipe_artifacts("  | abc |  "), "abc");
        assert_eq!(strip_ocr_pipe_artifacts("a | b"), "a | b");
        assert_eq!(strip_ocr_pipe_artifacts("||"), "");
    }
}