Skip to main content

fleischwolf_pdf/
lib.rs

1//! PDF backend for fleischwolf.
2//!
3//! A port of docling's standard PDF pipeline: pdfium extracts the text layer
4//! (cells with bounding boxes) and renders page images; a discriminative ONNX
5//! stack (layout detection, table structure, OCR) classifies regions; the cells
6//! are assembled in reading order into a [`DoclingDocument`].
7//!
//! Current stages: pdfium text-cell extraction + page rendering ([`pdfium_backend`]),
//! per-page layout detection ([`layout`]), OCR for pages that have no text cells,
//! and the deterministic text/reading-order assembly ([`assemble`]), all driven by
//! [`Pipeline`]. The table-structure ONNX stage has no dedicated step in
//! [`Pipeline`] yet.
11
12mod assemble;
13pub mod layout;
14mod mets;
15mod ocr;
16mod pdfium_backend;
17
18use std::fmt;
19
20use fleischwolf_core::DoclingDocument;
21
22pub use mets::convert_mets_gbs;
23pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
24
/// Failure modes of the PDF backend. Every variant carries a human-readable
/// message so failures are reported in detail rather than silently skipped.
#[derive(Debug)]
pub enum PdfError {
    /// pdfium failed to bind, open, or read the document.
    Pdfium(String),
    /// The layout ONNX model failed to load or run.
    Layout(String),
    /// The OCR ONNX model failed to load or run.
    Ocr(String),
}

impl fmt::Display for PdfError {
    /// Renders the error as `pdf: <message>`; only the pdfium variant inserts an
    /// extra `pdfium error: ` label in front of its message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Split each variant into (optional stage label, message) so the output
        // is produced by a single `write!`.
        let (label, detail) = match self {
            Self::Pdfium(detail) => ("pdfium error: ", detail),
            Self::Layout(detail) | Self::Ocr(detail) => ("", detail),
        };
        write!(f, "pdf: {label}{detail}")
    }
}

// No underlying `source()` is exposed: each variant stores only a message.
impl std::error::Error for PdfError {}
47
48/// A reusable PDF pipeline: the layout model is loaded once and reused across
49/// documents; OCR loads lazily the first time a scanned page is seen.
50pub struct Pipeline {
51    layout: layout::LayoutModel,
52    ocr: Option<ocr::OcrModel>,
53}
54
55impl Pipeline {
56    /// Load the layout model (the only always-required model).
57    pub fn new() -> Result<Self, PdfError> {
58        Ok(Self {
59            layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
60            ocr: None,
61        })
62    }
63
64    /// Convert a PDF (bytes) to a [`DoclingDocument`] via the discriminative
65    /// pipeline: pdfium text cells (or OCR for scanned pages) + per-page layout
66    /// detection, assembled in reading order. Errors are detailed and surfaced.
67    pub fn convert(
68        &mut self,
69        bytes: &[u8],
70        password: Option<&str>,
71        name: &str,
72    ) -> Result<DoclingDocument, PdfError> {
73        let parsed =
74            PdfDocument::open(bytes, password).map_err(|e| PdfError::Pdfium(e.to_string()))?;
75        self.process_pages(parsed.pages, name)
76    }
77
78    /// Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page —
79    /// docling routes images through the same layout+OCR pipeline as a PDF page.
80    pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
81        let image = image::load_from_memory(bytes)
82            .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
83            .into_rgb8();
84        let (w, h) = image.dimensions();
85        // The image is its own page rendered at 1 px per "point" (scale 1.0); a
86        // standalone image has no text layer, so OCR supplies the cells.
87        let page = PdfPage {
88            width: w as f32,
89            height: h as f32,
90            scale: 1.0,
91            cells: Vec::new(),
92            image,
93        };
94        self.process_pages(vec![page], name)
95    }
96
97    /// Run layout (+ OCR for cell-less pages) and assemble each page.
98    fn process_pages(
99        &mut self,
100        mut pages: Vec<PdfPage>,
101        name: &str,
102    ) -> Result<DoclingDocument, PdfError> {
103        let mut doc = DoclingDocument::new(name);
104        for (n, page) in pages.iter_mut().enumerate() {
105            let regions = self
106                .layout
107                .predict(&page.image, page.width, page.height)
108                .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
109            // Resolve overlapping detections once, before OCR.
110            let regions = assemble::resolve(regions);
111            // No text layer → recognise text from the page image via OCR.
112            if page.cells.is_empty() {
113                if self.ocr.is_none() {
114                    self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
115                }
116                let cells = self
117                    .ocr
118                    .as_mut()
119                    .unwrap()
120                    .ocr_page(&page.image, &regions, page.scale)
121                    .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
122                page.cells = cells;
123            }
124            assemble::assemble_page(page, regions, &mut doc);
125        }
126        Ok(doc)
127    }
128}
129
130/// Convenience one-shot conversion (loads the pipeline per call). Errors are
131/// detailed and surfaced (never silently skipped).
132pub fn convert(
133    bytes: &[u8],
134    password: Option<&str>,
135    name: &str,
136) -> Result<DoclingDocument, PdfError> {
137    Pipeline::new()?.convert(bytes, password, name)
138}
139
140/// Convenience one-shot image conversion (loads the pipeline per call).
141pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
142    Pipeline::new()?.convert_image(bytes, name)
143}
144
145/// Convert pre-segmented pages (image + already-known text cells, e.g. METS/hOCR
146/// scans) through the shared layout + assembly pipeline.
147pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
148    Pipeline::new()?.process_pages(pages, name)
149}