Skip to main content

fleischwolf_pdf/
lib.rs

1//! PDF backend for fleischwolf.
2//!
3//! A port of docling's standard PDF pipeline: pdfium extracts the text layer
4//! (cells with bounding boxes) and renders page images; a discriminative ONNX
5//! stack (layout detection, table structure, OCR) classifies regions; the cells
6//! are assembled in reading order into a [`DoclingDocument`].
7//!
//! Stages, all driven by [`Pipeline`]: pdfium text-cell extraction + page
//! rendering ([`pdfium_backend`]), ONNX layout detection ([`layout`]), OCR for
//! pages without a text layer, optional TableFormer table structure
//! ([`tableformer`]), and the deterministic text/reading-order assembly
//! ([`assemble`]).
11
12mod assemble;
13mod dp_lines;
14pub mod layout;
15mod mets;
16mod ocr;
17pub mod pdfium_backend;
18pub mod resample;
19pub mod tableformer;
20pub mod textparse;
21
22use std::fmt;
23
24use fleischwolf_core::DoclingDocument;
25
26pub use mets::convert_mets_gbs;
27pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
28
/// Failure modes of the PDF backend.
///
/// Every variant wraps a human-readable message. Failures are meant to be
/// reported to the caller in detail rather than silently skipped.
#[derive(Debug)]
pub enum PdfError {
    /// pdfium could not be bound, or could not open or read the document.
    Pdfium(String),
    /// Loading or running the layout ONNX model failed.
    Layout(String),
    /// Loading or running the OCR ONNX model failed.
    Ocr(String),
}

impl fmt::Display for PdfError {
    /// Writes `pdf: pdfium error: <msg>` for pdfium failures and `pdf: <msg>`
    /// for layout and OCR failures, whose messages are emitted verbatim after
    /// the shared `pdf: ` prefix.
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Pdfium(msg) => write!(out, "pdf: pdfium error: {msg}"),
            // Both model stages share one rendering; only the payload differs.
            Self::Layout(msg) | Self::Ocr(msg) => write!(out, "pdf: {msg}"),
        }
    }
}

impl std::error::Error for PdfError {}
51
52impl From<pdfium_render::prelude::PdfiumError> for PdfError {
53    fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
54        PdfError::Pdfium(e.to_string())
55    }
56}
57
/// Number of threads ONNX inference may use.
///
/// If `FLEISCHWOLF_PDF_THREADS` holds a positive integer, that value is
/// returned as-is (it is not clamped to the core count). Surrounding
/// whitespace in the variable is ignored. An unset, unparsable or zero value
/// falls back to [`std::thread::available_parallelism`], or `1` when that
/// cannot be determined (ort otherwise picks a low number).
pub(crate) fn intra_threads() -> usize {
    let requested = std::env::var("FLEISCHWOLF_PDF_THREADS")
        .ok()
        // Trim first: values such as "8\n" or " 8" would otherwise fail to
        // parse and be silently ignored.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|&count| count > 0);

    requested.unwrap_or_else(|| {
        std::thread::available_parallelism().map_or(1, |cores| cores.get())
    })
}
72
73/// A reusable PDF pipeline: the layout model is loaded once and reused across
74/// documents; OCR loads lazily the first time a scanned page is seen.
75pub struct Pipeline {
76    layout: layout::LayoutModel,
77    ocr: Option<ocr::OcrModel>,
78    /// TableFormer structure model; `None` when its ONNX graphs aren't present
79    /// (the assembler then falls back to geometric table reconstruction).
80    tables: Option<tableformer::TableFormer>,
81}
82
83impl Pipeline {
84    /// Load the layout model (the only always-required model). TableFormer loads
85    /// if its exported graphs are present, else table regions use the geometric
86    /// fallback.
87    pub fn new() -> Result<Self, PdfError> {
88        Ok(Self {
89            layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
90            ocr: None,
91            tables: tableformer::TableFormer::load(),
92        })
93    }
94
95    /// Convert a PDF (bytes) to a [`DoclingDocument`] via the discriminative
96    /// pipeline: pdfium text cells (or OCR for scanned pages) + per-page layout
97    /// detection, assembled in reading order. Errors are detailed and surfaced.
98    pub fn convert(
99        &mut self,
100        bytes: &[u8],
101        password: Option<&str>,
102        name: &str,
103    ) -> Result<DoclingDocument, PdfError> {
104        // Stream pages: render → process → drop one at a time, so a large PDF
105        // holds ~one page bitmap (~5 MB) rather than every page at once (which
106        // is gigabytes for a multi-thousand-page document and drives the machine
107        // into swap).
108        let mut doc = DoclingDocument::new(name);
109        pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
110            self.process_one_page(n, &mut page, &mut doc)
111        })?;
112        assemble::merge_continuations(&mut doc.nodes);
113        Ok(doc)
114    }
115
116    /// Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page —
117    /// docling routes images through the same layout+OCR pipeline as a PDF page.
118    pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
119        let image = image::load_from_memory(bytes)
120            .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
121            .into_rgb8();
122        let (w, h) = image.dimensions();
123        // The image is its own page rendered at 1 px per "point" (scale 1.0); a
124        // standalone image has no text layer, so OCR supplies the cells.
125        let page = PdfPage {
126            width: w as f32,
127            height: h as f32,
128            scale: 1.0,
129            cells: Vec::new(),
130            code_cells: Vec::new(),
131            word_cells: Vec::new(),
132            image,
133            links: Vec::new(),
134        };
135        self.process_pages(vec![page], name)
136    }
137
138    /// Run layout (+ OCR for cell-less pages) and assemble one page into `doc`.
139    fn process_one_page(
140        &mut self,
141        n: usize,
142        page: &mut PdfPage,
143        doc: &mut DoclingDocument,
144    ) -> Result<(), PdfError> {
145        let regions = self
146            .layout
147            .predict(&page.image, page.width, page.height)
148            .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
149        // Resolve overlapping detections once, before OCR.
150        let mut regions = assemble::resolve(regions);
151        // Emit text the detector missed as orphan text regions (docling parity).
152        assemble::add_orphan_regions(&mut regions, &page.cells);
153        // Drop phantom empty low-confidence picture boxes (docling parity).
154        assemble::drop_false_pictures(&mut regions, &page.cells, page.width, page.height);
155        // No text layer → recognise text from the page image via OCR.
156        if page.cells.is_empty() {
157            if self.ocr.is_none() {
158                self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
159            }
160            let cells = self
161                .ocr
162                .as_mut()
163                .unwrap()
164                .ocr_page(&page.image, &regions, page.scale)
165                .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
166            page.cells = cells;
167        }
168        // TableFormer structure per table region (else geometric fallback).
169        let mut table_rows: Vec<Option<Vec<Vec<String>>>> = vec![None; regions.len()];
170        if let Some(tf) = self.tables.as_mut() {
171            for (i, r) in regions.iter().enumerate() {
172                if r.label == "table" {
173                    table_rows[i] = tf.predict_table_rows(
174                        &page.image,
175                        page.height,
176                        [r.l, r.t, r.r, r.b],
177                        &page.word_cells,
178                    );
179                }
180            }
181        }
182        assemble::assemble_page(page, regions, &table_rows, doc);
183        Ok(())
184    }
185
186    /// Run layout (+ OCR for cell-less pages) and assemble each already-rendered
187    /// page (image / METS inputs, which are small and already materialised).
188    fn process_pages(
189        &mut self,
190        mut pages: Vec<PdfPage>,
191        name: &str,
192    ) -> Result<DoclingDocument, PdfError> {
193        let mut doc = DoclingDocument::new(name);
194        for (n, page) in pages.iter_mut().enumerate() {
195            self.process_one_page(n, page, &mut doc)?;
196        }
197        assemble::merge_continuations(&mut doc.nodes);
198        Ok(doc)
199    }
200}
201
202/// Convenience one-shot conversion (loads the pipeline per call). Errors are
203/// detailed and surfaced (never silently skipped).
204pub fn convert(
205    bytes: &[u8],
206    password: Option<&str>,
207    name: &str,
208) -> Result<DoclingDocument, PdfError> {
209    Pipeline::new()?.convert(bytes, password, name)
210}
211
212/// Convenience one-shot image conversion (loads the pipeline per call).
213pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
214    Pipeline::new()?.convert_image(bytes, name)
215}
216
217/// Convert pre-segmented pages (image + already-known text cells, e.g. METS/hOCR
218/// scans) through the shared layout + assembly pipeline.
219pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
220    Pipeline::new()?.process_pages(pages, name)
221}