Skip to main content

fleischwolf_pdf/
pdfium_backend.rs

1//! pdfium-based text extraction and page rendering (docling's `PdfPipeline`
2//! text path uses pypdfium2 the same way).
3
4use image::RgbImage;
5use pdfium_render::prelude::*;
6
/// A single run of extracted text together with its bounding box.
///
/// Coordinates are in PDF points measured from a **top-left** origin. pdfium
/// itself reports boxes from a bottom-left origin; the Y axis is flipped
/// during extraction so the box matches docling's
/// `BoundingBox(..., origin=TOPLEFT)`.
#[derive(Clone, Debug)]
pub struct TextCell {
    /// The text content of the run.
    pub text: String,
    /// Left edge of the bounding box.
    pub l: f32,
    /// Top edge of the bounding box (distance down from the top of the page).
    pub t: f32,
    /// Right edge of the bounding box.
    pub r: f32,
    /// Bottom edge of the bounding box (distance down from the top of the page).
    pub b: f32,
}
18
/// Number of image pixels rendered per PDF point.
///
/// Layout does not depend on this value (it scales normalized boxes by the
/// page size in points); the extra resolution is there because OCR benefits
/// from it.
pub const RENDER_SCALE: f32 = 2.0_f32;
23
24/// One page's geometry, extracted text cells, and a rendered RGB image. The
25/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
26/// page point × scale`.
27#[derive(Clone)]
28pub struct PdfPage {
29    pub width: f32,
30    pub height: f32,
31    pub scale: f32,
32    pub cells: Vec<TextCell>,
33    pub image: RgbImage,
34}
35
36/// A parsed PDF: per-page text cells and page images.
37pub struct PdfDocument {
38    pub pages: Vec<PdfPage>,
39}
40
41/// Bind to the pdfium dynamic library. Honors `PDFIUM_DYNAMIC_LIB_PATH` (a
42/// directory or file), else the directory of the current exe, else the system
43/// library — mirroring how a deployment ships `libpdfium` alongside the binary.
44fn bind() -> Result<Pdfium, PdfiumError> {
45    if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
46        let name = Pdfium::pdfium_platform_library_name_at_path(&path);
47        if let Ok(b) = Pdfium::bind_to_library(&name) {
48            return Ok(Pdfium::new(b));
49        }
50        if let Ok(b) = Pdfium::bind_to_library(&path) {
51            return Ok(Pdfium::new(b));
52        }
53    }
54    Pdfium::bind_to_system_library().map(Pdfium::new)
55}
56
57impl PdfDocument {
58    /// Parse a PDF from bytes, optionally decrypting with `password`.
59    pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
60        let pdfium = bind()?;
61        let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
62        let mut pages = Vec::new();
63        for page in doc.pages().iter() {
64            pages.push(extract_page(&page)?);
65        }
66        Ok(PdfDocument { pages })
67    }
68}
69
70fn extract_page(page: &pdfium_render::prelude::PdfPage<'_>) -> Result<PdfPage, PdfiumError> {
71    let width = page.width().value;
72    let height = page.height().value;
73
74    let text = page.text()?;
75    let mut cells = Vec::new();
76    for segment in text.segments().iter() {
77        let rect = segment.bounds();
78        let s = segment.text();
79        if s.trim().is_empty() {
80            continue;
81        }
82        // Flip Y to a top-left origin.
83        cells.push(TextCell {
84            text: s,
85            l: rect.left().value,
86            t: height - rect.top().value,
87            r: rect.right().value,
88            b: height - rect.bottom().value,
89        });
90    }
91
92    let tw = (width * RENDER_SCALE).round().max(1.0) as i32;
93    let th = (height * RENDER_SCALE).round().max(1.0) as i32;
94    let cfg = PdfRenderConfig::new()
95        .set_target_width(tw)
96        .set_target_height(th);
97    let bitmap = page.render_with_config(&cfg)?;
98    let image = bitmap.as_image().into_rgb8();
99
100    Ok(PdfPage {
101        width,
102        height,
103        scale: RENDER_SCALE,
104        cells,
105        image,
106    })
107}