Skip to main content

fleischwolf_pdf/
pdfium_backend.rs

1//! pdfium-based text extraction and page rendering (docling's `PdfPipeline`
2//! text path uses pypdfium2 the same way).
3
4use image::RgbImage;
5use pdfium_render::prelude::*;
6
/// A run of text with its bounding box, in PDF points with a **top-left** origin
/// (pdfium's native origin is bottom-left; we flip it to match docling's
/// `BoundingBox(..., origin=TOPLEFT)`).
///
/// Derives `PartialEq` so cells can be compared directly (e.g. in tests);
/// `Eq` is not available because the coordinates are floating point.
#[derive(Debug, Clone, PartialEq)]
pub struct TextCell {
    /// The text content of this run.
    pub text: String,
    /// Left edge, in points from the left of the page.
    pub l: f32,
    /// Top edge, in points measured downward from the top of the page.
    pub t: f32,
    /// Right edge, in points from the left of the page.
    pub r: f32,
    /// Bottom edge, in points measured downward from the top of the page.
    pub b: f32,
}
18
/// Pixels-per-point used to render page images. Layout is scale-invariant (it
/// scales normalized boxes by the page point size), but OCR benefits from the
/// extra resolution.
///
/// A PDF point is 1/72 inch, so a scale of 2.0 corresponds to roughly 144 DPI.
pub const RENDER_SCALE: f32 = 2.0;
23
/// One page's geometry, extracted text cells, and a rendered RGB image. The
/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
/// page point × scale`.
#[derive(Clone)]
pub struct PdfPage {
    /// Page width in PDF points, as reported by pdfium.
    pub width: f32,
    /// Page height in PDF points, as reported by pdfium.
    pub height: f32,
    /// Pixels per PDF point at which `image` was rendered.
    pub scale: f32,
    /// Non-blank text runs found on the page, with top-left-origin boxes.
    pub cells: Vec<TextCell>,
    /// The rendered page bitmap, converted to 8-bit RGB.
    pub image: RgbImage,
}
35
/// A parsed PDF: per-page text cells and page images.
pub struct PdfDocument {
    /// Extracted pages, in the order pdfium iterates them.
    pub pages: Vec<PdfPage>,
}
40
41/// Bind to the pdfium dynamic library. Honors `PDFIUM_DYNAMIC_LIB_PATH` (a
42/// directory or file), else the directory of the current exe, else the system
43/// library — mirroring how a deployment ships `libpdfium` alongside the binary.
44fn bind() -> Result<Pdfium, PdfiumError> {
45    if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
46        let name = Pdfium::pdfium_platform_library_name_at_path(&path);
47        if let Ok(b) = Pdfium::bind_to_library(&name) {
48            return Ok(Pdfium::new(b));
49        }
50        if let Ok(b) = Pdfium::bind_to_library(&path) {
51            return Ok(Pdfium::new(b));
52        }
53    }
54    Pdfium::bind_to_system_library().map(Pdfium::new)
55}
56
57impl PdfDocument {
58    /// Parse a PDF from bytes, optionally decrypting with `password`.
59    ///
60    /// Note: this materialises **every** page's rendered bitmap in memory at
61    /// once. For large documents prefer [`for_each_page`], which streams.
62    pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
63        let pdfium = bind()?;
64        let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
65        let mut pages = Vec::new();
66        for page in doc.pages().iter() {
67            pages.push(extract_page(&page)?);
68        }
69        Ok(PdfDocument { pages })
70    }
71}
72
73/// Render + extract pages one at a time, handing each (owned) [`PdfPage`] to `f`.
74/// Only one page bitmap is resident at a time — a rendered page is ~5 MB, so a
75/// large PDF would otherwise hold gigabytes of bitmaps at once. `f` receives the
76/// zero-based page index and the total page count.
77///
78/// `E` is the caller's error type; pdfium errors convert into it via `From`.
79pub fn for_each_page<E, F>(bytes: &[u8], password: Option<&str>, mut f: F) -> Result<(), E>
80where
81    E: From<PdfiumError>,
82    F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
83{
84    let pdfium = bind()?;
85    let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
86    let pages = doc.pages();
87    let total = pages.len() as usize;
88    for (i, page) in pages.iter().enumerate() {
89        let extracted = extract_page(&page)?;
90        f(i, total, extracted)?;
91    }
92    Ok(())
93}
94
95fn extract_page(page: &pdfium_render::prelude::PdfPage<'_>) -> Result<PdfPage, PdfiumError> {
96    let width = page.width().value;
97    let height = page.height().value;
98
99    let text = page.text()?;
100    let mut cells = Vec::new();
101    for segment in text.segments().iter() {
102        let rect = segment.bounds();
103        let s = segment.text();
104        if s.trim().is_empty() {
105            continue;
106        }
107        // Flip Y to a top-left origin.
108        cells.push(TextCell {
109            text: s,
110            l: rect.left().value,
111            t: height - rect.top().value,
112            r: rect.right().value,
113            b: height - rect.bottom().value,
114        });
115    }
116
117    let tw = (width * RENDER_SCALE).round().max(1.0) as i32;
118    let th = (height * RENDER_SCALE).round().max(1.0) as i32;
119    let cfg = PdfRenderConfig::new()
120        .set_target_width(tw)
121        .set_target_height(th);
122    let bitmap = page.render_with_config(&cfg)?;
123    let image = bitmap.as_image().into_rgb8();
124
125    Ok(PdfPage {
126        width,
127        height,
128        scale: RENDER_SCALE,
129        cells,
130        image,
131    })
132}