Skip to main content

fleischwolf_pdf/
lib.rs

1//! PDF backend for fleischwolf.
2//!
3//! A port of docling's standard PDF pipeline: pdfium extracts the text layer
4//! (cells with bounding boxes) and renders page images; a discriminative ONNX
5//! stack (layout detection, table structure, OCR) classifies regions; the cells
6//! are assembled in reading order into a [`DoclingDocument`].
7//!
//! Stages: pdfium text-cell extraction + page rendering ([`pdfium_backend`]),
//! layout detection ([`layout`]), OCR for pages without a text layer, TableFormer
//! table structure ([`tableformer`]) and the deterministic text/reading-order
//! assembly ([`assemble`]), all driven page by page through [`Pipeline`].
11
12mod assemble;
13mod dp_lines;
14pub mod layout;
15mod mets;
16mod ocr;
17pub mod pdfium_backend;
18pub mod resample;
19pub mod tableformer;
20pub mod textparse;
21pub mod timing;
22
23use std::collections::BTreeMap;
24use std::fmt;
25use std::sync::mpsc::{sync_channel, Receiver};
26use std::sync::{Arc, Mutex};
27
28use fleischwolf_core::{DoclingDocument, Node};
29
30pub use mets::{convert_mets_gbs, convert_mets_gbs_with_options};
31pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
32
/// Errors from the PDF backend. Detailed and surfaced (never silently skipped).
///
/// Each variant carries a human-readable message; the `Display` impl prints it
/// behind a `pdf: ` prefix.
#[derive(Debug)]
pub enum PdfError {
    /// pdfium failed to bind, open, or read the document. This file also uses
    /// the variant for image-decode failures and for a closed page-worker
    /// channel.
    Pdfium(String),
    /// The layout ONNX model failed to load or run.
    Layout(String),
    /// The OCR ONNX model failed to load or run.
    Ocr(String),
}
43
44impl fmt::Display for PdfError {
45    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
46        match self {
47            PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
48            PdfError::Layout(m) => write!(f, "pdf: {m}"),
49            PdfError::Ocr(m) => write!(f, "pdf: {m}"),
50        }
51    }
52}
53
54impl std::error::Error for PdfError {}
55
56impl From<pdfium_render::prelude::PdfiumError> for PdfError {
57    fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
58        PdfError::Pdfium(e.to_string())
59    }
60}
61
/// Number of threads ONNX inference may use.
///
/// A positive integer in `FLEISCHWOLF_PDF_THREADS` is used as-is. When the
/// variable is unset, unparsable or zero, the machine's available parallelism is
/// used instead (ort otherwise picks a low number), falling back to `1` if even
/// that cannot be determined.
pub(crate) fn intra_threads() -> usize {
    let requested = std::env::var("FLEISCHWOLF_PDF_THREADS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .filter(|&count| count > 0);
    match requested {
        Some(count) => count,
        None => std::thread::available_parallelism().map_or(1, |cores| cores.get()),
    }
}
76
/// One page's assembled output: typed nodes plus the page's hyperlinks, kept
/// separate so pages processed out of order can be stitched back in page order.
/// Both halves are appended unchanged to the document's `nodes` / `links`.
type PageOut = (Vec<Node>, Vec<(String, String)>);
80
/// A self-contained set of the per-page models (layout, OCR, TableFormer). Each
/// parallel page-worker owns its own `Worker` so inference runs concurrently
/// without sharing an ONNX session (`ort`'s `Session::run` is `&mut self`).
struct Worker {
    /// Layout-detection model; always loaded when the worker is created.
    layout: layout::LayoutModel,
    /// OCR model; starts as `None` and is loaded the first time a page without
    /// text cells is processed.
    ocr: Option<ocr::OcrModel>,
    /// TableFormer structure model; `None` when its ONNX graphs aren't present
    /// or when TableFormer was disabled at load time
    /// (the assembler then falls back to geometric table reconstruction).
    tables: Option<tableformer::TableFormer>,
}
91
92impl Worker {
93    fn load(intra: usize, no_table_former: bool) -> Result<Self, PdfError> {
94        Ok(Self {
95            layout: layout::LayoutModel::load_with(intra).map_err(PdfError::Layout)?,
96            ocr: None,
97            tables: if no_table_former {
98                None
99            } else {
100                tableformer::TableFormer::load_with(intra)
101            },
102        })
103    }
104
105    /// Run layout (+ OCR for cell-less pages) + TableFormer and assemble page `n`
106    /// into its nodes and links. Pure given the page (mutates only the worker's
107    /// lazily-loaded OCR model), so it is safe to run concurrently across pages.
108    fn process(&mut self, n: usize, page: &mut PdfPage) -> Result<PageOut, PdfError> {
109        let regions = timing::timed("layout.predict", || {
110            self.layout.predict(&page.image, page.width, page.height)
111        })
112        .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
113        // Resolve overlapping detections once, before OCR.
114        let mut regions = assemble::resolve(regions);
115        // Emit text the detector missed as orphan text regions (docling parity).
116        assemble::add_orphan_regions(&mut regions, &page.cells);
117        // Drop phantom empty low-confidence picture boxes (docling parity).
118        assemble::drop_false_pictures(&mut regions, &page.cells, page.width, page.height);
119        // No text layer → recognise text from the page image via OCR.
120        if page.cells.is_empty() {
121            if self.ocr.is_none() {
122                self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
123            }
124            let cells = self
125                .ocr
126                .as_mut()
127                .unwrap()
128                .ocr_page(&page.image, &regions, page.scale)
129                .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
130            page.cells = cells;
131        }
132        // TableFormer structure per table region (else geometric fallback).
133        let mut table_rows: Vec<Option<Vec<Vec<String>>>> = vec![None; regions.len()];
134        if let Some(tf) = self.tables.as_mut() {
135            timing::timed("tableformer", || {
136                for (i, r) in regions.iter().enumerate() {
137                    if r.label == "table" {
138                        table_rows[i] = tf.predict_table_rows(
139                            &page.image,
140                            page.height,
141                            [r.l, r.t, r.r, r.b],
142                            &page.word_cells,
143                        );
144                    }
145                }
146            });
147        }
148        Ok(timing::timed("assemble_page", || {
149            assemble::assemble_page(page, regions, &table_rows)
150        }))
151    }
152}
153
154/// Per-worker ONNX intra-op threads. The layout model is memory-bandwidth bound,
155/// so on a typical machine two threads per worker (sharing one in-cache copy of
156/// the weights) extracts more throughput than one fat model or many single-thread
157/// workers. `FLEISCHWOLF_PDF_INTRA` overrides for per-machine tuning.
158fn pdf_intra() -> usize {
159    if let Some(n) = std::env::var("FLEISCHWOLF_PDF_INTRA")
160        .ok()
161        .and_then(|v| v.parse::<usize>().ok())
162        .filter(|&n| n > 0)
163    {
164        return n;
165    }
166    if intra_threads() >= 2 {
167        2
168    } else {
169        1
170    }
171}
172
173/// How many page-workers to spin up for a multi-page PDF. `FLEISCHWOLF_PDF_WORKERS`
174/// overrides; otherwise size the pool so `workers × intra ≈ cores`, capped at 4 so
175/// a worst-case pool holds a bounded amount of model memory (~0.4 GB per worker)
176/// and does not oversaturate the memory bus with model-weight traffic.
177fn pdf_worker_count() -> usize {
178    if let Some(n) = std::env::var("FLEISCHWOLF_PDF_WORKERS")
179        .ok()
180        .and_then(|v| v.parse::<usize>().ok())
181        .filter(|&n| n > 0)
182    {
183        return n;
184    }
185    (intra_threads() / pdf_intra()).clamp(1, 4)
186}
187
/// Minimum page count before a PDF is worth the parallel worker pool.
///
/// Below this, the serial primary (running its model on every core) is faster
/// than fanning out — the helper pool's one-time model-load cost only pays off
/// once enough pages share it. A positive integer in
/// `FLEISCHWOLF_PDF_PARALLEL_MIN` overrides the default of 6 pages.
fn pdf_parallel_min() -> usize {
    const DEFAULT_MIN_PAGES: usize = 6;
    match std::env::var("FLEISCHWOLF_PDF_PARALLEL_MIN") {
        Ok(raw) => match raw.parse::<usize>() {
            Ok(pages) if pages > 0 => pages,
            _ => DEFAULT_MIN_PAGES,
        },
        Err(_) => DEFAULT_MIN_PAGES,
    }
}
199
/// A reusable PDF pipeline. The **primary** worker runs its models on every core,
/// so a single-page / small / image / METS input is converted at full intra-op
/// speed with no pool to load. A document with enough pages instead fans out
/// across a **pool** of narrower workers processed concurrently. Both load lazily
/// and are cached for reuse, so a one-shot conversion only pays for what it uses.
pub struct Pipeline {
    /// Full-intra worker for the serial path; loaded on first serial use.
    primary: Option<Worker>,
    /// Narrower workers (each loaded with `pdf_intra()` intra-op threads) for the
    /// parallel path; loaded on first multi-page use and cached. Temporarily
    /// moved out of the struct while a parallel conversion is running.
    pool: Vec<Worker>,
    /// Desired pool size for multi-page documents. The parallel path is only
    /// taken when this is at least 2.
    target_workers: usize,
    /// Page count at/above which the parallel pool is worth its load cost.
    parallel_min: usize,
    /// Skip loading/running TableFormer; table regions fall back to geometric
    /// reconstruction. See [`Pipeline::no_table_former`].
    no_table_former: bool,
}
219
220impl Pipeline {
221    /// Construct the pipeline. Models load lazily on first use (full-intra primary
222    /// for serial inputs, the helper pool for multi-page PDFs), so nothing is
223    /// loaded that a given document doesn't need.
224    pub fn new() -> Result<Self, PdfError> {
225        Ok(Self {
226            primary: None,
227            pool: Vec::new(),
228            target_workers: pdf_worker_count(),
229            parallel_min: pdf_parallel_min(),
230            no_table_former: false,
231        })
232    }
233
234    /// Skip loading and running the TableFormer table-structure model. Table
235    /// regions still get emitted, but reconstructed geometrically from cell
236    /// positions instead of via the ONNX model's predicted structure — faster
237    /// (no model load, no per-table inference) at the cost of table fidelity.
238    /// No effect if a worker is already loaded; set this before the first
239    /// conversion.
240    pub fn no_table_former(mut self, disable: bool) -> Self {
241        self.no_table_former = disable;
242        self
243    }
244
245    /// The full-intra serial worker, loaded on first use.
246    fn primary(&mut self) -> Result<&mut Worker, PdfError> {
247        if self.primary.is_none() {
248            self.primary = Some(Worker::load(intra_threads(), self.no_table_former)?);
249        }
250        Ok(self.primary.as_mut().unwrap())
251    }
252
253    /// Convert a PDF (bytes) to a [`DoclingDocument`]. A document with fewer than
254    /// `parallel_min` pages (or a pool size of 1) streams through the full-intra
255    /// primary; a larger one renders on this thread (pdfium is not thread-safe) and
256    /// fans the pages out across the worker pool, reassembled in page order so the
257    /// output is byte-identical to the serial path.
258    pub fn convert(
259        &mut self,
260        bytes: &[u8],
261        password: Option<&str>,
262        name: &str,
263    ) -> Result<DoclingDocument, PdfError> {
264        let pages = pdfium_backend::page_count(bytes, password)?;
265        let doc = if self.target_workers >= 2 && pages >= self.parallel_min {
266            self.convert_parallel(bytes, password, name)?
267        } else {
268            self.convert_serial(bytes, password, name)?
269        };
270        timing::report();
271        Ok(doc)
272    }
273
274    /// Stream pages one at a time through the primary worker — render → process →
275    /// drop — so the document holds ~one page bitmap (~5 MB) at a time.
276    fn convert_serial(
277        &mut self,
278        bytes: &[u8],
279        password: Option<&str>,
280        name: &str,
281    ) -> Result<DoclingDocument, PdfError> {
282        let mut doc = DoclingDocument::new(name);
283        let worker = self.primary()?;
284        pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
285            let (nodes, links) = worker.process(n, &mut page)?;
286            doc.nodes.extend(nodes);
287            doc.links.extend(links);
288            Ok::<(), PdfError>(())
289        })?;
290        assemble::merge_continuations(&mut doc.nodes);
291        Ok(doc)
292    }
293
294    /// Render pages serially on this thread (pdfium) and process them in parallel
295    /// across the worker pool. A bounded channel applies backpressure so only a
296    /// handful of page bitmaps are resident at once; results carry their page
297    /// index and are reassembled in order, so the output is byte-identical to the
298    /// serial path.
299    fn convert_parallel(
300        &mut self,
301        bytes: &[u8],
302        password: Option<&str>,
303        name: &str,
304    ) -> Result<DoclingDocument, PdfError> {
305        self.ensure_pool()?;
306        let n_workers = self.pool.len();
307        let (work_tx, work_rx) = sync_channel::<(usize, PdfPage)>(n_workers * 2);
308        let work_rx: Arc<Mutex<Receiver<(usize, PdfPage)>>> = Arc::new(Mutex::new(work_rx));
309        let results: Arc<Mutex<Vec<(usize, PageOut)>>> = Arc::new(Mutex::new(Vec::new()));
310        let first_err: Arc<Mutex<Option<PdfError>>> = Arc::new(Mutex::new(None));
311
312        // Move the pool into the scope so each worker gets an exclusive `&mut`.
313        let mut workers = std::mem::take(&mut self.pool);
314        std::thread::scope(|s| {
315            for worker in workers.iter_mut() {
316                let work_rx = Arc::clone(&work_rx);
317                let results = Arc::clone(&results);
318                let first_err = Arc::clone(&first_err);
319                s.spawn(move || loop {
320                    // Hold the receiver lock only for the recv; release before the
321                    // (long) per-page work so other workers can pull concurrently.
322                    let item = work_rx.lock().unwrap().recv();
323                    let Ok((idx, mut page)) = item else { break };
324                    match worker.process(idx, &mut page) {
325                        Ok(out) => results.lock().unwrap().push((idx, out)),
326                        Err(e) => {
327                            let mut slot = first_err.lock().unwrap();
328                            if slot.is_none() {
329                                *slot = Some(e);
330                            }
331                        }
332                    }
333                });
334            }
335            // Render on this thread and feed the workers; backpressure blocks here
336            // when the channel is full. Dropping `work_tx` afterwards signals the
337            // workers (recv → Err) to finish.
338            let render = pdfium_backend::for_each_page(bytes, password, |i, _total, page| {
339                work_tx
340                    .send((i, page))
341                    .map_err(|_| PdfError::Pdfium("page-worker channel closed".into()))
342            });
343            drop(work_tx);
344            if let Err(e) = render {
345                let mut slot = first_err.lock().unwrap();
346                if slot.is_none() {
347                    *slot = Some(e);
348                }
349            }
350        });
351        // Threads have joined; restore the pool for the next conversion.
352        self.pool = workers;
353
354        if let Some(e) = first_err.lock().unwrap().take() {
355            return Err(e);
356        }
357        let mut results = Arc::try_unwrap(results)
358            .unwrap_or_else(|arc| Mutex::new(arc.lock().unwrap().clone()))
359            .into_inner()
360            .unwrap();
361        results.sort_by_key(|(idx, _)| *idx);
362        let mut doc = DoclingDocument::new(name);
363        for (_, (nodes, links)) in results {
364            doc.nodes.extend(nodes);
365            doc.links.extend(links);
366        }
367        assemble::merge_continuations(&mut doc.nodes);
368        Ok(doc)
369    }
370
371    /// Convert a PDF in **streaming** mode: `emit` is called with each finalized,
372    /// in-document-order batch of nodes (and that span's recovered links) as pages
373    /// complete, so a caller can serialize Markdown page by page instead of waiting
374    /// for the whole document. The batches are exactly the buffered [`convert`]'s
375    /// nodes, split at safe block boundaries by [`assemble::StreamAssembler`] — the
376    /// parallel path reorders pages back into document order before emitting, so
377    /// the output is identical regardless of worker scheduling.
378    ///
379    /// `emit` runs on the calling thread (never a worker), so it needn't be `Send`
380    /// and its backpressure throttles the whole pipeline. Returning `Err` from
381    /// `emit` aborts the conversion with that error.
382    pub fn convert_streaming<F>(
383        &mut self,
384        bytes: &[u8],
385        password: Option<&str>,
386        name: &str,
387        emit: F,
388    ) -> Result<(), PdfError>
389    where
390        F: FnMut(Vec<Node>, Vec<(String, String)>) -> Result<(), PdfError>,
391    {
392        let _ = name; // page nodes carry no name; the caller owns the document name.
393        let pages = pdfium_backend::page_count(bytes, password)?;
394        let r = if self.target_workers >= 2 && pages >= self.parallel_min {
395            self.convert_streaming_parallel(bytes, password, emit)
396        } else {
397            self.convert_streaming_serial(bytes, password, emit)
398        };
399        timing::report();
400        r
401    }
402
403    /// Serial streaming: render → process → emit, one page at a time, holding back
404    /// only the tail that might still merge into the next page.
405    fn convert_streaming_serial<F>(
406        &mut self,
407        bytes: &[u8],
408        password: Option<&str>,
409        mut emit: F,
410    ) -> Result<(), PdfError>
411    where
412        F: FnMut(Vec<Node>, Vec<(String, String)>) -> Result<(), PdfError>,
413    {
414        let mut asm = assemble::StreamAssembler::new();
415        let worker = self.primary()?;
416        pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
417            let (nodes, links) = worker.process(n, &mut page)?;
418            emit(asm.push(nodes), links)
419        })?;
420        emit(asm.finish(), Vec::new())
421    }
422
423    /// Parallel streaming: pages render serially on a dedicated thread (pdfium is
424    /// not thread-safe) and process across the worker pool; results carry their
425    /// page index and are reordered on the calling thread into a
426    /// [`assemble::StreamAssembler`], which emits each page in document order as
427    /// soon as its predecessors have arrived. Bounded channels keep only a handful
428    /// of pages resident and let `emit`'s backpressure reach the renderer.
429    fn convert_streaming_parallel<F>(
430        &mut self,
431        bytes: &[u8],
432        password: Option<&str>,
433        mut emit: F,
434    ) -> Result<(), PdfError>
435    where
436        F: FnMut(Vec<Node>, Vec<(String, String)>) -> Result<(), PdfError>,
437    {
438        self.ensure_pool()?;
439        let n_workers = self.pool.len();
440        let (work_tx, work_rx) = sync_channel::<(usize, PdfPage)>(n_workers * 2);
441        let work_rx: Arc<Mutex<Receiver<(usize, PdfPage)>>> = Arc::new(Mutex::new(work_rx));
442        // Workers and the renderer report here; the calling thread drains it in
443        // page order. Bounded so workers block (bounding resident bitmaps) when the
444        // consumer falls behind.
445        let (res_tx, res_rx) = sync_channel::<Result<(usize, PageOut), PdfError>>(n_workers * 2);
446
447        let mut workers = std::mem::take(&mut self.pool);
448        let mut asm = assemble::StreamAssembler::new();
449        let mut first_err: Option<PdfError> = None;
450
451        std::thread::scope(|s| {
452            // Workers: pull a page, process it, report (index-tagged) result.
453            for worker in workers.iter_mut() {
454                let work_rx = Arc::clone(&work_rx);
455                let res_tx = res_tx.clone();
456                s.spawn(move || loop {
457                    let item = work_rx.lock().unwrap().recv();
458                    let Ok((idx, mut page)) = item else { break };
459                    let out = worker.process(idx, &mut page).map(|o| (idx, o));
460                    if res_tx.send(out).is_err() {
461                        break; // consumer gone
462                    }
463                });
464            }
465            // Renderer: feed pages to the pool on its own thread (pdfium stays on a
466            // single thread); report a render error through the same channel.
467            {
468                let res_tx = res_tx.clone();
469                s.spawn(move || {
470                    let render =
471                        pdfium_backend::for_each_page(bytes, password, |i, _total, page| {
472                            work_tx
473                                .send((i, page))
474                                .map_err(|_| PdfError::Pdfium("page-worker channel closed".into()))
475                        });
476                    drop(work_tx); // signal workers to finish
477                    if let Err(e) = render {
478                        let _ = res_tx.send(Err(e));
479                    }
480                });
481            }
482            // Drop our own sender so the channel closes once the threads finish.
483            drop(res_tx);
484
485            // Collector (this thread): reorder into document order and emit.
486            let mut buffer: BTreeMap<usize, PageOut> = BTreeMap::new();
487            let mut next = 0usize;
488            for msg in res_rx.iter() {
489                match msg {
490                    Err(e) => {
491                        if first_err.is_none() {
492                            first_err = Some(e);
493                        }
494                    }
495                    Ok((idx, out)) => {
496                        buffer.insert(idx, out);
497                        if first_err.is_some() {
498                            continue; // keep draining so the threads can exit
499                        }
500                        while let Some((nodes, links)) = buffer.remove(&next) {
501                            if let Err(e) = emit(asm.push(nodes), links) {
502                                first_err = Some(e);
503                                break;
504                            }
505                            next += 1;
506                        }
507                    }
508                }
509            }
510        });
511        // Threads have joined; restore the pool for the next conversion.
512        self.pool = workers;
513
514        if let Some(e) = first_err {
515            return Err(e);
516        }
517        emit(asm.finish(), Vec::new())
518    }
519
520    /// Lazily grow the pool to `target_workers`, loading the new workers
521    /// concurrently (model load is mostly I/O + mmap, so N loads overlap to roughly
522    /// one load's wall-time). Cached for reuse across documents.
523    fn ensure_pool(&mut self) -> Result<(), PdfError> {
524        let need = self.target_workers.saturating_sub(self.pool.len());
525        if need == 0 {
526            return Ok(());
527        }
528        let intra = pdf_intra();
529        let no_table_former = self.no_table_former;
530        let loaded: Vec<Result<Worker, PdfError>> = std::thread::scope(|s| {
531            let handles: Vec<_> = (0..need)
532                .map(|_| s.spawn(move || Worker::load(intra, no_table_former)))
533                .collect();
534            handles.into_iter().map(|h| h.join().unwrap()).collect()
535        });
536        for w in loaded {
537            self.pool.push(w?);
538        }
539        Ok(())
540    }
541
542    /// Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page —
543    /// docling routes images through the same layout+OCR pipeline as a PDF page.
544    pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
545        let image = image::load_from_memory(bytes)
546            .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
547            .into_rgb8();
548        let (w, h) = image.dimensions();
549        // The image is its own page rendered at 1 px per "point" (scale 1.0); a
550        // standalone image has no text layer, so OCR supplies the cells.
551        let page = PdfPage {
552            width: w as f32,
553            height: h as f32,
554            scale: 1.0,
555            cells: Vec::new(),
556            code_cells: Vec::new(),
557            word_cells: Vec::new(),
558            image,
559            links: Vec::new(),
560        };
561        self.process_pages(vec![page], name)
562    }
563
564    /// Run layout (+ OCR for cell-less pages) and assemble each already-rendered
565    /// page (image / METS inputs, which are small and already materialised).
566    fn process_pages(
567        &mut self,
568        mut pages: Vec<PdfPage>,
569        name: &str,
570    ) -> Result<DoclingDocument, PdfError> {
571        let mut doc = DoclingDocument::new(name);
572        let worker = self.primary()?;
573        for (n, page) in pages.iter_mut().enumerate() {
574            let (nodes, links) = worker.process(n, page)?;
575            doc.nodes.extend(nodes);
576            doc.links.extend(links);
577        }
578        assemble::merge_continuations(&mut doc.nodes);
579        Ok(doc)
580    }
581}
582
583/// Convenience one-shot conversion (loads the pipeline per call). Errors are
584/// detailed and surfaced (never silently skipped).
585pub fn convert(
586    bytes: &[u8],
587    password: Option<&str>,
588    name: &str,
589) -> Result<DoclingDocument, PdfError> {
590    convert_with_options(bytes, password, name, false)
591}
592
593/// Like [`convert`], but optionally skips loading/running TableFormer (see
594/// [`Pipeline::no_table_former`]).
595pub fn convert_with_options(
596    bytes: &[u8],
597    password: Option<&str>,
598    name: &str,
599    no_table_former: bool,
600) -> Result<DoclingDocument, PdfError> {
601    Pipeline::new()?
602        .no_table_former(no_table_former)
603        .convert(bytes, password, name)
604}
605
606/// Convenience one-shot image conversion (loads the pipeline per call).
607pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
608    convert_image_with_options(bytes, name, false)
609}
610
611/// Like [`convert_image`], but optionally skips loading/running TableFormer (see
612/// [`Pipeline::no_table_former`]).
613pub fn convert_image_with_options(
614    bytes: &[u8],
615    name: &str,
616    no_table_former: bool,
617) -> Result<DoclingDocument, PdfError> {
618    Pipeline::new()?
619        .no_table_former(no_table_former)
620        .convert_image(bytes, name)
621}
622
623/// Convert pre-segmented pages (image + already-known text cells, e.g. METS/hOCR
624/// scans) through the shared layout + assembly pipeline.
625pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
626    convert_pages_with_options(pages, name, false)
627}
628
629/// Like [`convert_pages`], but optionally skips loading/running TableFormer (see
630/// [`Pipeline::no_table_former`]).
631pub fn convert_pages_with_options(
632    pages: Vec<PdfPage>,
633    name: &str,
634    no_table_former: bool,
635) -> Result<DoclingDocument, PdfError> {
636    Pipeline::new()?
637        .no_table_former(no_table_former)
638        .process_pages(pages, name)
639}