fleischwolf_pdf/lib.rs
1//! PDF backend for fleischwolf.
2//!
3//! A port of docling's standard PDF pipeline: pdfium extracts the text layer
4//! (cells with bounding boxes) and renders page images; a discriminative ONNX
5//! stack (layout detection, table structure, OCR) classifies regions; the cells
6//! are assembled in reading order into a [`DoclingDocument`].
7//!
//! Stages: pdfium text-cell extraction + page rendering ([`pdfium_backend`]),
//! ONNX layout detection ([`layout`]), OCR for pages without a text layer,
//! TableFormer table structure ([`tableformer`]), and the deterministic
//! text/reading-order assembly ([`assemble`]), all driven by [`Pipeline`].
11
12mod assemble;
13mod dp_lines;
14pub mod layout;
15mod mets;
16mod ocr;
17pub mod pdfium_backend;
18pub mod resample;
19pub mod tableformer;
20pub mod textparse;
21pub mod timing;
22
23use std::collections::BTreeMap;
24use std::fmt;
25use std::sync::mpsc::{sync_channel, Receiver};
26use std::sync::{Arc, Mutex};
27
28use fleischwolf_core::{DoclingDocument, Node};
29
30pub use mets::{convert_mets_gbs, convert_mets_gbs_with_options};
31pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
32
/// Errors from the PDF backend. Detailed and surfaced (never silently skipped).
///
/// Every variant carries a human-readable message; the `Display` impl prefixes
/// it with `pdf: `.
#[derive(Debug)]
pub enum PdfError {
    /// pdfium failed to bind, open, or read the document. This file also uses
    /// it for image-decode failures in `Pipeline::convert_image` and for a
    /// closed page-worker channel on the parallel paths.
    Pdfium(String),
    /// The layout ONNX model failed to load or run.
    Layout(String),
    /// The OCR ONNX model failed to load or run.
    Ocr(String),
}
43
44impl fmt::Display for PdfError {
45 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
46 match self {
47 PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
48 PdfError::Layout(m) => write!(f, "pdf: {m}"),
49 PdfError::Ocr(m) => write!(f, "pdf: {m}"),
50 }
51 }
52}
53
// Marker impl: `PdfError` already provides `Debug` and `Display`, which is all
// `std::error::Error` requires, so it can be boxed as `dyn Error` by callers.
impl std::error::Error for PdfError {}
55
56impl From<pdfium_render::prelude::PdfiumError> for PdfError {
57 fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
58 PdfError::Pdfium(e.to_string())
59 }
60}
61
/// Threads ONNX inference may use.
///
/// `FLEISCHWOLF_PDF_THREADS`, when set to a positive integer, overrides the
/// value outright (it is used as-is, not clamped to the core count). Otherwise
/// the available parallelism is used (ort otherwise picks a low number), falling
/// back to 1 when that cannot be determined.
pub(crate) fn intra_threads() -> usize {
    // Unset, non-numeric and zero values all count as "not configured".
    let configured = match std::env::var("FLEISCHWOLF_PDF_THREADS") {
        Ok(raw) => raw.parse::<usize>().ok().filter(|&n| n > 0),
        Err(_) => None,
    };
    configured.unwrap_or_else(|| std::thread::available_parallelism().map_or(1, |cores| cores.get()))
}
76
/// One page's assembled output: typed nodes plus the page's hyperlinks, kept
/// separate so pages processed out of order can be stitched back in page order.
///
/// Callers in this file append `.0` to `DoclingDocument::nodes` and `.1` to
/// `DoclingDocument::links`. NOTE(review): what each `(String, String)` link
/// pair holds is defined by `assemble::assemble_page` — confirm there.
type PageOut = (Vec<Node>, Vec<(String, String)>);
80
/// A self-contained set of the per-page models (layout, OCR, TableFormer). Each
/// parallel page-worker owns its own `Worker` so inference runs concurrently
/// without sharing an ONNX session (`ort`'s `Session::run` is `&mut self`).
struct Worker {
    /// `None` when `no_ocr` skips layout entirely — no model load, no inference.
    layout: Option<layout::LayoutModel>,
    /// OCR model. Always `None` right after `Worker::load`; `Worker::process`
    /// loads it the first time it meets a page with no text cells and keeps it
    /// for later pages.
    ocr: Option<ocr::OcrModel>,
    /// TableFormer structure model; `None` when its ONNX graphs aren't present
    /// (the assembler then falls back to geometric table reconstruction) or
    /// when `no_table_former`/`no_ocr` skip it.
    tables: Option<tableformer::TableFormer>,
    /// Skip layout, OCR, and TableFormer; reconstruct text purely from the PDF's
    /// embedded text layer. See [`Pipeline::no_ocr`].
    no_ocr: bool,
}
96
97impl Worker {
98 fn load(intra: usize, no_table_former: bool, no_ocr: bool) -> Result<Self, PdfError> {
99 Ok(Self {
100 layout: if no_ocr {
101 None
102 } else {
103 Some(layout::LayoutModel::load_with(intra).map_err(PdfError::Layout)?)
104 },
105 ocr: None,
106 tables: if no_table_former || no_ocr {
107 None
108 } else {
109 tableformer::TableFormer::load_with(intra)
110 },
111 no_ocr,
112 })
113 }
114
115 /// Run layout (+ OCR for cell-less pages) + TableFormer and assemble page `n`
116 /// into its nodes and links. Pure given the page (mutates only the worker's
117 /// lazily-loaded OCR model), so it is safe to run concurrently across pages.
118 fn process(&mut self, n: usize, page: &mut PdfPage) -> Result<PageOut, PdfError> {
119 if self.no_ocr {
120 // Fastest path: no layout/OCR/TableFormer inference at all. The PDF's
121 // embedded text cells (if any) become flat, line-grouped paragraphs in
122 // reading order via the same orphan-region machinery that normally
123 // rescues text the detector missed — here it rescues *all* of it.
124 // Pages with no embedded text layer (scanned/image-only) yield nothing;
125 // convert those without `no_ocr`.
126 let mut regions = Vec::new();
127 assemble::add_orphan_regions(&mut regions, &page.cells);
128 let table_rows = vec![None; regions.len()];
129 return Ok(timing::timed("assemble_page", || {
130 assemble::assemble_page(page, regions, &table_rows)
131 }));
132 }
133 let regions = timing::timed("layout.predict", || {
134 self.layout
135 .as_mut()
136 .expect("layout model loaded unless no_ocr")
137 .predict(&page.image, page.width, page.height)
138 })
139 .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
140 // Resolve overlapping detections once, before OCR.
141 let mut regions = assemble::resolve(regions);
142 // Emit text the detector missed as orphan text regions (docling parity).
143 assemble::add_orphan_regions(&mut regions, &page.cells);
144 // Drop phantom empty low-confidence picture boxes (docling parity).
145 assemble::drop_false_pictures(&mut regions, &page.cells, page.width, page.height);
146 // No text layer → recognise text from the page image via OCR.
147 if page.cells.is_empty() {
148 if self.ocr.is_none() {
149 self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
150 }
151 let cells = self
152 .ocr
153 .as_mut()
154 .unwrap()
155 .ocr_page(&page.image, ®ions, page.scale)
156 .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
157 page.cells = cells;
158 }
159 // TableFormer structure per table region (else geometric fallback).
160 let mut table_rows: Vec<Option<Vec<Vec<String>>>> = vec![None; regions.len()];
161 if let Some(tf) = self.tables.as_mut() {
162 timing::timed("tableformer", || {
163 for (i, r) in regions.iter().enumerate() {
164 if r.label == "table" {
165 table_rows[i] = tf.predict_table_rows(
166 &page.image,
167 page.height,
168 [r.l, r.t, r.r, r.b],
169 &page.word_cells,
170 );
171 }
172 }
173 });
174 }
175 Ok(timing::timed("assemble_page", || {
176 assemble::assemble_page(page, regions, &table_rows)
177 }))
178 }
179}
180
181/// Per-worker ONNX intra-op threads. The layout model is memory-bandwidth bound,
182/// so on a typical machine two threads per worker (sharing one in-cache copy of
183/// the weights) extracts more throughput than one fat model or many single-thread
184/// workers. `FLEISCHWOLF_PDF_INTRA` overrides for per-machine tuning.
185fn pdf_intra() -> usize {
186 if let Some(n) = std::env::var("FLEISCHWOLF_PDF_INTRA")
187 .ok()
188 .and_then(|v| v.parse::<usize>().ok())
189 .filter(|&n| n > 0)
190 {
191 return n;
192 }
193 if intra_threads() >= 2 {
194 2
195 } else {
196 1
197 }
198}
199
200/// How many page-workers to spin up for a multi-page PDF. `FLEISCHWOLF_PDF_WORKERS`
201/// overrides; otherwise size the pool so `workers × intra ≈ cores`, capped at 4 so
202/// a worst-case pool holds a bounded amount of model memory (~0.4 GB per worker)
203/// and does not oversaturate the memory bus with model-weight traffic.
204fn pdf_worker_count() -> usize {
205 if let Some(n) = std::env::var("FLEISCHWOLF_PDF_WORKERS")
206 .ok()
207 .and_then(|v| v.parse::<usize>().ok())
208 .filter(|&n| n > 0)
209 {
210 return n;
211 }
212 (intra_threads() / pdf_intra()).clamp(1, 4)
213}
214
/// Minimum page count before a PDF is worth the parallel worker pool.
///
/// Below this, the serial primary (running its model on every core) is faster
/// than fanning out — the helper pool's one-time model-load cost only pays off
/// once enough pages share it. Defaults to 6; a positive integer in
/// `FLEISCHWOLF_PDF_PARALLEL_MIN` overrides it.
fn pdf_parallel_min() -> usize {
    const DEFAULT_MIN_PAGES: usize = 6;
    match std::env::var("FLEISCHWOLF_PDF_PARALLEL_MIN") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(pages) if pages > 0 => pages,
            // Zero or unparsable: treat as not configured.
            _ => DEFAULT_MIN_PAGES,
        },
        Err(_) => DEFAULT_MIN_PAGES,
    }
}
226
/// A reusable PDF pipeline. The **primary** worker runs its models on every core,
/// so a single-page / small / image / METS input is converted at full intra-op
/// speed with no pool to load. A document with enough pages instead fans out
/// across a **pool** of narrower workers processed concurrently. Both load lazily
/// and are cached for reuse, so a one-shot conversion only pays for what it uses.
pub struct Pipeline {
    /// Full-intra worker (loaded with `intra_threads()` threads) for the serial
    /// path; loaded on first serial use.
    primary: Option<Worker>,
    /// Narrower workers (each loaded with `pdf_intra()` threads) for the
    /// parallel path; loaded on first multi-page use and cached. Temporarily
    /// moved out of the struct while a parallel conversion is running.
    pool: Vec<Worker>,
    /// Desired pool size for multi-page documents; the parallel path is only
    /// taken when this is at least 2.
    target_workers: usize,
    /// Page count at/above which the parallel pool is worth its load cost.
    parallel_min: usize,
    /// Skip loading/running TableFormer; table regions fall back to geometric
    /// reconstruction. See [`Pipeline::no_table_former`].
    no_table_former: bool,
    /// Skip layout, OCR, and TableFormer entirely. See [`Pipeline::no_ocr`].
    no_ocr: bool,
}
248
249impl Pipeline {
250 /// Construct the pipeline. Models load lazily on first use (full-intra primary
251 /// for serial inputs, the helper pool for multi-page PDFs), so nothing is
252 /// loaded that a given document doesn't need.
253 pub fn new() -> Result<Self, PdfError> {
254 Ok(Self {
255 primary: None,
256 pool: Vec::new(),
257 target_workers: pdf_worker_count(),
258 parallel_min: pdf_parallel_min(),
259 no_table_former: false,
260 no_ocr: false,
261 })
262 }
263
264 /// Skip loading and running the TableFormer table-structure model. Table
265 /// regions still get emitted, but reconstructed geometrically from cell
266 /// positions instead of via the ONNX model's predicted structure — faster
267 /// (no model load, no per-table inference) at the cost of table fidelity.
268 /// No effect if a worker is already loaded; set this before the first
269 /// conversion.
270 pub fn no_table_former(mut self, disable: bool) -> Self {
271 self.no_table_former = disable;
272 self
273 }
274
275 /// Skip layout detection, OCR, and TableFormer entirely — no model load, no
276 /// inference of any kind. The PDF's embedded text cells are grouped by line
277 /// and emitted as plain paragraphs in reading order: no headings, lists,
278 /// tables, code blocks, or pictures, since that structure comes from the
279 /// layout model. The fastest possible PDF path, but pages with no embedded
280 /// text layer (scanned/image-only PDFs) yield no text at all — convert those
281 /// without this flag. Implies `no_table_former`. No effect if a worker is
282 /// already loaded; set this before the first conversion.
283 pub fn no_ocr(mut self, disable: bool) -> Self {
284 self.no_ocr = disable;
285 self
286 }
287
288 /// The full-intra serial worker, loaded on first use.
289 fn primary(&mut self) -> Result<&mut Worker, PdfError> {
290 if self.primary.is_none() {
291 self.primary = Some(Worker::load(
292 intra_threads(),
293 self.no_table_former,
294 self.no_ocr,
295 )?);
296 }
297 Ok(self.primary.as_mut().unwrap())
298 }
299
300 /// Convert a PDF (bytes) to a [`DoclingDocument`]. A document with fewer than
301 /// `parallel_min` pages (or a pool size of 1) streams through the full-intra
302 /// primary; a larger one renders on this thread (pdfium is not thread-safe) and
303 /// fans the pages out across the worker pool, reassembled in page order so the
304 /// output is byte-identical to the serial path.
305 pub fn convert(
306 &mut self,
307 bytes: &[u8],
308 password: Option<&str>,
309 name: &str,
310 ) -> Result<DoclingDocument, PdfError> {
311 let pages = pdfium_backend::page_count(bytes, password)?;
312 let doc = if self.target_workers >= 2 && pages >= self.parallel_min {
313 self.convert_parallel(bytes, password, name)?
314 } else {
315 self.convert_serial(bytes, password, name)?
316 };
317 timing::report();
318 Ok(doc)
319 }
320
321 /// Stream pages one at a time through the primary worker — render → process →
322 /// drop — so the document holds ~one page bitmap (~5 MB) at a time.
323 fn convert_serial(
324 &mut self,
325 bytes: &[u8],
326 password: Option<&str>,
327 name: &str,
328 ) -> Result<DoclingDocument, PdfError> {
329 let mut doc = DoclingDocument::new(name);
330 let render_image = !self.no_ocr;
331 let worker = self.primary()?;
332 pdfium_backend::for_each_page(bytes, password, render_image, |n, _total, mut page| {
333 let (nodes, links) = worker.process(n, &mut page)?;
334 doc.nodes.extend(nodes);
335 doc.links.extend(links);
336 Ok::<(), PdfError>(())
337 })?;
338 assemble::merge_continuations(&mut doc.nodes);
339 Ok(doc)
340 }
341
342 /// Render pages serially on this thread (pdfium) and process them in parallel
343 /// across the worker pool. A bounded channel applies backpressure so only a
344 /// handful of page bitmaps are resident at once; results carry their page
345 /// index and are reassembled in order, so the output is byte-identical to the
346 /// serial path.
347 fn convert_parallel(
348 &mut self,
349 bytes: &[u8],
350 password: Option<&str>,
351 name: &str,
352 ) -> Result<DoclingDocument, PdfError> {
353 self.ensure_pool()?;
354 let n_workers = self.pool.len();
355 let render_image = !self.no_ocr;
356 let (work_tx, work_rx) = sync_channel::<(usize, PdfPage)>(n_workers * 2);
357 let work_rx: Arc<Mutex<Receiver<(usize, PdfPage)>>> = Arc::new(Mutex::new(work_rx));
358 let results: Arc<Mutex<Vec<(usize, PageOut)>>> = Arc::new(Mutex::new(Vec::new()));
359 let first_err: Arc<Mutex<Option<PdfError>>> = Arc::new(Mutex::new(None));
360
361 // Move the pool into the scope so each worker gets an exclusive `&mut`.
362 let mut workers = std::mem::take(&mut self.pool);
363 std::thread::scope(|s| {
364 for worker in workers.iter_mut() {
365 let work_rx = Arc::clone(&work_rx);
366 let results = Arc::clone(&results);
367 let first_err = Arc::clone(&first_err);
368 s.spawn(move || loop {
369 // Hold the receiver lock only for the recv; release before the
370 // (long) per-page work so other workers can pull concurrently.
371 let item = work_rx.lock().unwrap().recv();
372 let Ok((idx, mut page)) = item else { break };
373 match worker.process(idx, &mut page) {
374 Ok(out) => results.lock().unwrap().push((idx, out)),
375 Err(e) => {
376 let mut slot = first_err.lock().unwrap();
377 if slot.is_none() {
378 *slot = Some(e);
379 }
380 }
381 }
382 });
383 }
384 // Render on this thread and feed the workers; backpressure blocks here
385 // when the channel is full. Dropping `work_tx` afterwards signals the
386 // workers (recv → Err) to finish.
387 let render =
388 pdfium_backend::for_each_page(bytes, password, render_image, |i, _total, page| {
389 work_tx
390 .send((i, page))
391 .map_err(|_| PdfError::Pdfium("page-worker channel closed".into()))
392 });
393 drop(work_tx);
394 if let Err(e) = render {
395 let mut slot = first_err.lock().unwrap();
396 if slot.is_none() {
397 *slot = Some(e);
398 }
399 }
400 });
401 // Threads have joined; restore the pool for the next conversion.
402 self.pool = workers;
403
404 if let Some(e) = first_err.lock().unwrap().take() {
405 return Err(e);
406 }
407 let mut results = Arc::try_unwrap(results)
408 .unwrap_or_else(|arc| Mutex::new(arc.lock().unwrap().clone()))
409 .into_inner()
410 .unwrap();
411 results.sort_by_key(|(idx, _)| *idx);
412 let mut doc = DoclingDocument::new(name);
413 for (_, (nodes, links)) in results {
414 doc.nodes.extend(nodes);
415 doc.links.extend(links);
416 }
417 assemble::merge_continuations(&mut doc.nodes);
418 Ok(doc)
419 }
420
421 /// Convert a PDF in **streaming** mode: `emit` is called with each finalized,
422 /// in-document-order batch of nodes (and that span's recovered links) as pages
423 /// complete, so a caller can serialize Markdown page by page instead of waiting
424 /// for the whole document. The batches are exactly the buffered [`convert`]'s
425 /// nodes, split at safe block boundaries by [`assemble::StreamAssembler`] — the
426 /// parallel path reorders pages back into document order before emitting, so
427 /// the output is identical regardless of worker scheduling.
428 ///
429 /// `emit` runs on the calling thread (never a worker), so it needn't be `Send`
430 /// and its backpressure throttles the whole pipeline. Returning `Err` from
431 /// `emit` aborts the conversion with that error.
432 pub fn convert_streaming<F>(
433 &mut self,
434 bytes: &[u8],
435 password: Option<&str>,
436 name: &str,
437 emit: F,
438 ) -> Result<(), PdfError>
439 where
440 F: FnMut(Vec<Node>, Vec<(String, String)>) -> Result<(), PdfError>,
441 {
442 let _ = name; // page nodes carry no name; the caller owns the document name.
443 let pages = pdfium_backend::page_count(bytes, password)?;
444 let r = if self.target_workers >= 2 && pages >= self.parallel_min {
445 self.convert_streaming_parallel(bytes, password, emit)
446 } else {
447 self.convert_streaming_serial(bytes, password, emit)
448 };
449 timing::report();
450 r
451 }
452
453 /// Serial streaming: render → process → emit, one page at a time, holding back
454 /// only the tail that might still merge into the next page.
455 fn convert_streaming_serial<F>(
456 &mut self,
457 bytes: &[u8],
458 password: Option<&str>,
459 mut emit: F,
460 ) -> Result<(), PdfError>
461 where
462 F: FnMut(Vec<Node>, Vec<(String, String)>) -> Result<(), PdfError>,
463 {
464 let mut asm = assemble::StreamAssembler::new();
465 let render_image = !self.no_ocr;
466 let worker = self.primary()?;
467 pdfium_backend::for_each_page(bytes, password, render_image, |n, _total, mut page| {
468 let (nodes, links) = worker.process(n, &mut page)?;
469 emit(asm.push(nodes), links)
470 })?;
471 emit(asm.finish(), Vec::new())
472 }
473
474 /// Parallel streaming: pages render serially on a dedicated thread (pdfium is
475 /// not thread-safe) and process across the worker pool; results carry their
476 /// page index and are reordered on the calling thread into a
477 /// [`assemble::StreamAssembler`], which emits each page in document order as
478 /// soon as its predecessors have arrived. Bounded channels keep only a handful
479 /// of pages resident and let `emit`'s backpressure reach the renderer.
480 fn convert_streaming_parallel<F>(
481 &mut self,
482 bytes: &[u8],
483 password: Option<&str>,
484 mut emit: F,
485 ) -> Result<(), PdfError>
486 where
487 F: FnMut(Vec<Node>, Vec<(String, String)>) -> Result<(), PdfError>,
488 {
489 self.ensure_pool()?;
490 let n_workers = self.pool.len();
491 let render_image = !self.no_ocr;
492 let (work_tx, work_rx) = sync_channel::<(usize, PdfPage)>(n_workers * 2);
493 let work_rx: Arc<Mutex<Receiver<(usize, PdfPage)>>> = Arc::new(Mutex::new(work_rx));
494 // Workers and the renderer report here; the calling thread drains it in
495 // page order. Bounded so workers block (bounding resident bitmaps) when the
496 // consumer falls behind.
497 let (res_tx, res_rx) = sync_channel::<Result<(usize, PageOut), PdfError>>(n_workers * 2);
498
499 let mut workers = std::mem::take(&mut self.pool);
500 let mut asm = assemble::StreamAssembler::new();
501 let mut first_err: Option<PdfError> = None;
502
503 std::thread::scope(|s| {
504 // Workers: pull a page, process it, report (index-tagged) result.
505 for worker in workers.iter_mut() {
506 let work_rx = Arc::clone(&work_rx);
507 let res_tx = res_tx.clone();
508 s.spawn(move || loop {
509 let item = work_rx.lock().unwrap().recv();
510 let Ok((idx, mut page)) = item else { break };
511 let out = worker.process(idx, &mut page).map(|o| (idx, o));
512 if res_tx.send(out).is_err() {
513 break; // consumer gone
514 }
515 });
516 }
517 // Renderer: feed pages to the pool on its own thread (pdfium stays on a
518 // single thread); report a render error through the same channel.
519 {
520 let res_tx = res_tx.clone();
521 s.spawn(move || {
522 let render = pdfium_backend::for_each_page(
523 bytes,
524 password,
525 render_image,
526 |i, _total, page| {
527 work_tx
528 .send((i, page))
529 .map_err(|_| PdfError::Pdfium("page-worker channel closed".into()))
530 },
531 );
532 drop(work_tx); // signal workers to finish
533 if let Err(e) = render {
534 let _ = res_tx.send(Err(e));
535 }
536 });
537 }
538 // Drop our own sender so the channel closes once the threads finish.
539 drop(res_tx);
540
541 // Collector (this thread): reorder into document order and emit.
542 let mut buffer: BTreeMap<usize, PageOut> = BTreeMap::new();
543 let mut next = 0usize;
544 for msg in res_rx.iter() {
545 match msg {
546 Err(e) => {
547 if first_err.is_none() {
548 first_err = Some(e);
549 }
550 }
551 Ok((idx, out)) => {
552 buffer.insert(idx, out);
553 if first_err.is_some() {
554 continue; // keep draining so the threads can exit
555 }
556 while let Some((nodes, links)) = buffer.remove(&next) {
557 if let Err(e) = emit(asm.push(nodes), links) {
558 first_err = Some(e);
559 break;
560 }
561 next += 1;
562 }
563 }
564 }
565 }
566 });
567 // Threads have joined; restore the pool for the next conversion.
568 self.pool = workers;
569
570 if let Some(e) = first_err {
571 return Err(e);
572 }
573 emit(asm.finish(), Vec::new())
574 }
575
576 /// Lazily grow the pool to `target_workers`, loading the new workers
577 /// concurrently (model load is mostly I/O + mmap, so N loads overlap to roughly
578 /// one load's wall-time). Cached for reuse across documents.
579 fn ensure_pool(&mut self) -> Result<(), PdfError> {
580 let need = self.target_workers.saturating_sub(self.pool.len());
581 if need == 0 {
582 return Ok(());
583 }
584 let intra = pdf_intra();
585 let no_table_former = self.no_table_former;
586 let no_ocr = self.no_ocr;
587 let loaded: Vec<Result<Worker, PdfError>> = std::thread::scope(|s| {
588 let handles: Vec<_> = (0..need)
589 .map(|_| s.spawn(move || Worker::load(intra, no_table_former, no_ocr)))
590 .collect();
591 handles.into_iter().map(|h| h.join().unwrap()).collect()
592 });
593 for w in loaded {
594 self.pool.push(w?);
595 }
596 Ok(())
597 }
598
599 /// Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page —
600 /// docling routes images through the same layout+OCR pipeline as a PDF page.
601 pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
602 let image = image::load_from_memory(bytes)
603 .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
604 .into_rgb8();
605 let (w, h) = image.dimensions();
606 // The image is its own page rendered at 1 px per "point" (scale 1.0); a
607 // standalone image has no text layer, so OCR supplies the cells.
608 let page = PdfPage {
609 width: w as f32,
610 height: h as f32,
611 scale: 1.0,
612 cells: Vec::new(),
613 code_cells: Vec::new(),
614 word_cells: Vec::new(),
615 image,
616 links: Vec::new(),
617 };
618 self.process_pages(vec![page], name)
619 }
620
621 /// Run layout (+ OCR for cell-less pages) and assemble each already-rendered
622 /// page (image / METS inputs, which are small and already materialised).
623 fn process_pages(
624 &mut self,
625 mut pages: Vec<PdfPage>,
626 name: &str,
627 ) -> Result<DoclingDocument, PdfError> {
628 let mut doc = DoclingDocument::new(name);
629 let worker = self.primary()?;
630 for (n, page) in pages.iter_mut().enumerate() {
631 let (nodes, links) = worker.process(n, page)?;
632 doc.nodes.extend(nodes);
633 doc.links.extend(links);
634 }
635 assemble::merge_continuations(&mut doc.nodes);
636 Ok(doc)
637 }
638}
639
640/// Convenience one-shot conversion (loads the pipeline per call). Errors are
641/// detailed and surfaced (never silently skipped).
642pub fn convert(
643 bytes: &[u8],
644 password: Option<&str>,
645 name: &str,
646) -> Result<DoclingDocument, PdfError> {
647 convert_with_options(bytes, password, name, false, false)
648}
649
650/// Like [`convert`], but optionally skips loading/running TableFormer (see
651/// [`Pipeline::no_table_former`]) and/or layout+OCR+TableFormer entirely (see
652/// [`Pipeline::no_ocr`]).
653pub fn convert_with_options(
654 bytes: &[u8],
655 password: Option<&str>,
656 name: &str,
657 no_table_former: bool,
658 no_ocr: bool,
659) -> Result<DoclingDocument, PdfError> {
660 Pipeline::new()?
661 .no_table_former(no_table_former)
662 .no_ocr(no_ocr)
663 .convert(bytes, password, name)
664}
665
666/// Convenience one-shot image conversion (loads the pipeline per call).
667pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
668 convert_image_with_options(bytes, name, false, false)
669}
670
671/// Like [`convert_image`], but optionally skips loading/running TableFormer (see
672/// [`Pipeline::no_table_former`]) and/or layout+OCR+TableFormer entirely (see
673/// [`Pipeline::no_ocr`]).
674pub fn convert_image_with_options(
675 bytes: &[u8],
676 name: &str,
677 no_table_former: bool,
678 no_ocr: bool,
679) -> Result<DoclingDocument, PdfError> {
680 Pipeline::new()?
681 .no_table_former(no_table_former)
682 .no_ocr(no_ocr)
683 .convert_image(bytes, name)
684}
685
686/// Convert pre-segmented pages (image + already-known text cells, e.g. METS/hOCR
687/// scans) through the shared layout + assembly pipeline.
688pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
689 convert_pages_with_options(pages, name, false, false)
690}
691
692/// Like [`convert_pages`], but optionally skips loading/running TableFormer (see
693/// [`Pipeline::no_table_former`]) and/or layout+OCR+TableFormer entirely (see
694/// [`Pipeline::no_ocr`]).
695pub fn convert_pages_with_options(
696 pages: Vec<PdfPage>,
697 name: &str,
698 no_table_former: bool,
699 no_ocr: bool,
700) -> Result<DoclingDocument, PdfError> {
701 Pipeline::new()?
702 .no_table_former(no_table_former)
703 .no_ocr(no_ocr)
704 .process_pages(pages, name)
705}