Skip to main content

fleischwolf_pdf/
lib.rs

1//! PDF backend for fleischwolf.
2//!
3//! A port of docling's standard PDF pipeline: pdfium extracts the text layer
4//! (cells with bounding boxes) and renders page images; a discriminative ONNX
5//! stack (layout detection, table structure, OCR) classifies regions; the cells
6//! are assembled in reading order into a [`DoclingDocument`].
7//!
//! Current stages: pdfium text-cell extraction + page rendering ([`pdfium_backend`]),
//! per-page layout detection ([`layout`]), OCR for pages that have no text
//! cells, and the deterministic text/reading-order assembly ([`assemble`]), all
//! driven by [`Pipeline`]. The table-structure ONNX stage is not yet wired into
//! [`Pipeline`].
11
12mod assemble;
13pub mod layout;
14mod mets;
15mod ocr;
16mod pdfium_backend;
17
18use std::fmt;
19
20use fleischwolf_core::DoclingDocument;
21
22pub use mets::convert_mets_gbs;
23pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
24
/// Failure modes of the PDF backend. Every failure is reported to the caller
/// with a descriptive message instead of being silently skipped.
#[derive(Debug)]
pub enum PdfError {
    /// pdfium could not be bound, or could not open or read the document.
    /// `Pipeline::convert_image` also reports image-decoding failures through
    /// this variant (message prefixed with `image: `).
    Pdfium(String),
    /// Loading or running the layout ONNX model failed.
    Layout(String),
    /// Loading or running the OCR ONNX model failed.
    Ocr(String),
}
35
36impl fmt::Display for PdfError {
37    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38        match self {
39            PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
40            PdfError::Layout(m) => write!(f, "pdf: {m}"),
41            PdfError::Ocr(m) => write!(f, "pdf: {m}"),
42        }
43    }
44}
45
46impl std::error::Error for PdfError {}
47
48impl From<pdfium_render::prelude::PdfiumError> for PdfError {
49    fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
50        PdfError::Pdfium(e.to_string())
51    }
52}
53
/// Number of threads ONNX inference may use.
///
/// A positive integer in the `FLEISCHWOLF_PDF_THREADS` environment variable
/// takes precedence and is returned as-is — it is an override, not an upper
/// bound, so it may exceed the number of cores. Whitespace around the value is
/// ignored. When the variable is unset, not valid Unicode, not a number, or
/// zero, the available parallelism is used instead (ort otherwise picks a low
/// number), falling back to `1` if that cannot be determined.
pub(crate) fn intra_threads() -> usize {
    let requested = std::env::var("FLEISCHWOLF_PDF_THREADS")
        .ok()
        // Trim first: `usize::from_str` rejects surrounding whitespace, which
        // would otherwise make a value like "4 " silently fall through.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        // Zero threads is meaningless; treat it like "unset".
        .filter(|&n| n > 0);
    requested.unwrap_or_else(|| std::thread::available_parallelism().map_or(1, |n| n.get()))
}
68
69/// A reusable PDF pipeline: the layout model is loaded once and reused across
70/// documents; OCR loads lazily the first time a scanned page is seen.
71pub struct Pipeline {
72    layout: layout::LayoutModel,
73    ocr: Option<ocr::OcrModel>,
74}
75
76impl Pipeline {
77    /// Load the layout model (the only always-required model).
78    pub fn new() -> Result<Self, PdfError> {
79        Ok(Self {
80            layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
81            ocr: None,
82        })
83    }
84
85    /// Convert a PDF (bytes) to a [`DoclingDocument`] via the discriminative
86    /// pipeline: pdfium text cells (or OCR for scanned pages) + per-page layout
87    /// detection, assembled in reading order. Errors are detailed and surfaced.
88    pub fn convert(
89        &mut self,
90        bytes: &[u8],
91        password: Option<&str>,
92        name: &str,
93    ) -> Result<DoclingDocument, PdfError> {
94        // Stream pages: render → process → drop one at a time, so a large PDF
95        // holds ~one page bitmap (~5 MB) rather than every page at once (which
96        // is gigabytes for a multi-thousand-page document and drives the machine
97        // into swap).
98        let mut doc = DoclingDocument::new(name);
99        pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
100            self.process_one_page(n, &mut page, &mut doc)
101        })?;
102        Ok(doc)
103    }
104
105    /// Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page —
106    /// docling routes images through the same layout+OCR pipeline as a PDF page.
107    pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
108        let image = image::load_from_memory(bytes)
109            .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
110            .into_rgb8();
111        let (w, h) = image.dimensions();
112        // The image is its own page rendered at 1 px per "point" (scale 1.0); a
113        // standalone image has no text layer, so OCR supplies the cells.
114        let page = PdfPage {
115            width: w as f32,
116            height: h as f32,
117            scale: 1.0,
118            cells: Vec::new(),
119            image,
120        };
121        self.process_pages(vec![page], name)
122    }
123
124    /// Run layout (+ OCR for cell-less pages) and assemble one page into `doc`.
125    fn process_one_page(
126        &mut self,
127        n: usize,
128        page: &mut PdfPage,
129        doc: &mut DoclingDocument,
130    ) -> Result<(), PdfError> {
131        let regions = self
132            .layout
133            .predict(&page.image, page.width, page.height)
134            .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
135        // Resolve overlapping detections once, before OCR.
136        let regions = assemble::resolve(regions);
137        // No text layer → recognise text from the page image via OCR.
138        if page.cells.is_empty() {
139            if self.ocr.is_none() {
140                self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
141            }
142            let cells = self
143                .ocr
144                .as_mut()
145                .unwrap()
146                .ocr_page(&page.image, &regions, page.scale)
147                .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
148            page.cells = cells;
149        }
150        assemble::assemble_page(page, regions, doc);
151        Ok(())
152    }
153
154    /// Run layout (+ OCR for cell-less pages) and assemble each already-rendered
155    /// page (image / METS inputs, which are small and already materialised).
156    fn process_pages(
157        &mut self,
158        mut pages: Vec<PdfPage>,
159        name: &str,
160    ) -> Result<DoclingDocument, PdfError> {
161        let mut doc = DoclingDocument::new(name);
162        for (n, page) in pages.iter_mut().enumerate() {
163            self.process_one_page(n, page, &mut doc)?;
164        }
165        Ok(doc)
166    }
167}
168
169/// Convenience one-shot conversion (loads the pipeline per call). Errors are
170/// detailed and surfaced (never silently skipped).
171pub fn convert(
172    bytes: &[u8],
173    password: Option<&str>,
174    name: &str,
175) -> Result<DoclingDocument, PdfError> {
176    Pipeline::new()?.convert(bytes, password, name)
177}
178
179/// Convenience one-shot image conversion (loads the pipeline per call).
180pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
181    Pipeline::new()?.convert_image(bytes, name)
182}
183
184/// Convert pre-segmented pages (image + already-known text cells, e.g. METS/hOCR
185/// scans) through the shared layout + assembly pipeline.
186pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
187    Pipeline::new()?.process_pages(pages, name)
188}