Skip to main content

gaze_document/extract/
pdf.rs

1//! PDF text extraction and rasterization via [`pdfium-render`](https://crates.io/crates/pdfium-render).
2//!
3//! ## Runtime dependency
4//!
5//! `pdfium-render` dynamically loads the pdfium shared library at runtime.
6//! Adopters must have `libpdfium` reachable to the process (system library,
7//! `LD_LIBRARY_PATH` / `DYLD_LIBRARY_PATH`, or alongside the executable).
8//!
9//! Per-OS install guidance is surfaced in [`DocumentError::PdfiumNotFound`]
10//! whenever binding fails.
11//!
12//! ## Scope (v0.0.x)
13//!
14//! * Selectable text is extracted directly, per page, before OCR is attempted.
15//! * Pages with no selectable text are rasterized at 150 DPI by default.
16
17use std::io::Cursor;
18use std::path::Path;
19
20use image::ImageFormat;
21use pdfium_render::prelude::{PdfRenderConfig, Pdfium, PdfiumError};
22
23use crate::DocumentError;
24
/// Configuration for one PDF rasterization pass.
///
/// The width is handed to the renderer as the target width. The height is
/// only an upper bound: it is applied as a maximum height, and only when it
/// is non-zero.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PdfRasterConfig {
    /// Target image width in pixels (height auto-scales).
    pub width_px: u32,
    /// Maximum image height in pixels. A value of `0` means no height limit
    /// is passed to the renderer.
    pub height_px: u32,
    /// Zero-based page index to rasterize.
    ///
    /// Honoured by [`rasterize_first_page`]; [`extract_pages`] walks every
    /// page of the document and does not read this field.
    pub page_index: i32,
}

impl PdfRasterConfig {
    /// Default config: 1240×1754 (≈150 DPI A4) on page 0.
    pub fn new() -> Self {
        Self {
            width_px: 1240,
            height_px: 1754,
            page_index: 0,
        }
    }
}

impl Default for PdfRasterConfig {
    /// Same values as [`PdfRasterConfig::new`].
    fn default() -> Self {
        Self::new()
    }
}
53
/// One PDF page rendered to a PNG image, plus its position in the document.
#[non_exhaustive]
#[derive(Clone, Debug)]
pub struct RasterizedPage {
    /// The rendered page, encoded as PNG.
    pub png_bytes: Vec<u8>,
    /// Index of the page these bytes were rendered from.
    pub page_index: i32,
    /// Number of pages in the document the page came from.
    pub page_count: i32,
    /// Pixel width of the rendered image.
    pub width_px: u32,
    /// Pixel height of the rendered image.
    pub height_px: u32,
}

impl RasterizedPage {
    /// Assemble a [`RasterizedPage`] from values that were computed elsewhere.
    ///
    /// Nothing is validated or re-encoded; every argument is stored as given.
    pub fn new(
        png_bytes: Vec<u8>,
        page_index: i32,
        page_count: i32,
        width_px: u32,
        height_px: u32,
    ) -> Self {
        RasterizedPage {
            width_px,
            height_px,
            page_index,
            page_count,
            png_bytes,
        }
    }
}
88
89/// Per-page PDF extraction payload.
90#[non_exhaustive]
91#[derive(Debug, Clone)]
92pub enum PdfPagePayload {
93    /// Page had selectable text and did not require OCR.
94    VectorText {
95        /// Extracted text in PDF text order.
96        text: String,
97        /// Zero-based page index.
98        page_index: i32,
99        /// Total page count in the source document.
100        page_count: i32,
101    },
102    /// Page had no selectable text and was rasterized for OCR.
103    Raster(RasterizedPage),
104}
105
106impl PdfPagePayload {
107    /// Zero-based page index.
108    pub fn page_index(&self) -> i32 {
109        match self {
110            Self::VectorText { page_index, .. } => *page_index,
111            Self::Raster(page) => page.page_index,
112        }
113    }
114
115    /// Total page count in the source document.
116    pub fn page_count(&self) -> i32 {
117        match self {
118            Self::VectorText { page_count, .. } => *page_count,
119            Self::Raster(page) => page.page_count,
120        }
121    }
122}
123
124/// Rasterize a single page of a PDF on disk to PNG bytes.
125///
126/// # Errors
127///
128/// * [`DocumentError::PdfiumNotFound`] — pdfium dynamic library could not be
129///   located. Payload carries per-OS install guidance.
130/// * [`DocumentError::PdfRasterFailed`] — pdfium reported an error while
131///   opening or rendering the document.
132pub fn rasterize_first_page(
133    path: &Path,
134    config: PdfRasterConfig,
135) -> Result<RasterizedPage, DocumentError> {
136    let bindings = Pdfium::bind_to_system_library().map_err(|err| {
137        DocumentError::PdfiumNotFound(format!("{}. {}", err, pdfium_install_hint()))
138    })?;
139    let pdfium = Pdfium::new(bindings);
140    let document = pdfium
141        .load_pdf_from_file(path, None)
142        .map_err(map_pdfium_error)?;
143    let pages = document.pages();
144    let page_count = pages.len();
145    if page_count == 0 {
146        return Err(DocumentError::PdfRasterFailed(
147            "input PDF contains zero pages".to_string(),
148        ));
149    }
150
151    if config.page_index < 0 || config.page_index >= page_count {
152        return Err(DocumentError::PdfRasterFailed(format!(
153            "requested page index {} but document has {} page(s)",
154            config.page_index, page_count
155        )));
156    }
157
158    let page = pages.get(config.page_index).map_err(map_pdfium_error)?;
159    let mut render_config = PdfRenderConfig::new().set_target_width(config.width_px as i32);
160    if config.height_px > 0 {
161        render_config = render_config.set_maximum_height(config.height_px as i32);
162    }
163    let bitmap = page
164        .render_with_config(&render_config)
165        .map_err(map_pdfium_error)?;
166    let dynamic_image = bitmap.as_image().map_err(map_pdfium_error)?;
167    let (width, height) = (dynamic_image.width(), dynamic_image.height());
168
169    let mut buf = Cursor::new(Vec::with_capacity(64 * 1024));
170    dynamic_image
171        .write_to(&mut buf, ImageFormat::Png)
172        .map_err(|err| DocumentError::PdfRasterFailed(format!("png encode failed: {err}")))?;
173
174    Ok(RasterizedPage {
175        png_bytes: buf.into_inner(),
176        page_index: config.page_index,
177        page_count,
178        width_px: width,
179        height_px: height,
180    })
181}
182
183/// Extract every PDF page, routing selectable-text pages directly and
184/// rasterizing image-only pages for OCR.
185///
186/// # Errors
187///
188/// Returns [`DocumentError`] when pdfium cannot open the document or a page
189/// cannot be rasterized.
190pub fn extract_pages(
191    path: &Path,
192    config: PdfRasterConfig,
193) -> Result<Vec<PdfPagePayload>, DocumentError> {
194    let bindings = Pdfium::bind_to_system_library().map_err(|err| {
195        DocumentError::PdfiumNotFound(format!("{}. {}", err, pdfium_install_hint()))
196    })?;
197    let pdfium = Pdfium::new(bindings);
198    let document = pdfium
199        .load_pdf_from_file(path, None)
200        .map_err(map_pdfium_error)?;
201    let pages = document.pages();
202    let page_count = pages.len();
203    if page_count == 0 {
204        return Err(DocumentError::PdfRasterFailed(
205            "input PDF contains zero pages".to_string(),
206        ));
207    }
208
209    let mut out = Vec::with_capacity(page_count as usize);
210    for page_index in 0..page_count {
211        let page = pages.get(page_index).map_err(map_pdfium_error)?;
212        let text = page
213            .text()
214            .ok()
215            .map(|page_text| normalize_pdf_text(&page_text.all()))
216            .unwrap_or_default();
217        if !text.trim().is_empty() {
218            out.push(PdfPagePayload::VectorText {
219                text,
220                page_index,
221                page_count,
222            });
223            continue;
224        }
225
226        out.push(PdfPagePayload::Raster(render_page(
227            &page, page_index, page_count, config,
228        )?));
229    }
230
231    Ok(out)
232}
233
234fn render_page(
235    page: &pdfium_render::prelude::PdfPage<'_>,
236    page_index: i32,
237    page_count: i32,
238    config: PdfRasterConfig,
239) -> Result<RasterizedPage, DocumentError> {
240    let mut render_config = PdfRenderConfig::new().set_target_width(config.width_px as i32);
241    if config.height_px > 0 {
242        render_config = render_config.set_maximum_height(config.height_px as i32);
243    }
244    let bitmap = page
245        .render_with_config(&render_config)
246        .map_err(map_pdfium_error)?;
247    let dynamic_image = bitmap.as_image().map_err(map_pdfium_error)?;
248    let (width, height) = (dynamic_image.width(), dynamic_image.height());
249
250    let mut buf = Cursor::new(Vec::with_capacity(64 * 1024));
251    dynamic_image
252        .write_to(&mut buf, ImageFormat::Png)
253        .map_err(|err| DocumentError::PdfRasterFailed(format!("png encode failed: {err}")))?;
254
255    Ok(RasterizedPage {
256        png_bytes: buf.into_inner(),
257        page_index,
258        page_count,
259        width_px: width,
260        height_px: height,
261    })
262}
263
/// Clean up raw page text: remove NUL characters, strip trailing whitespace
/// from every line, rejoin the lines with `\n`, and trim the whole result.
fn normalize_pdf_text(text: &str) -> String {
    // NULs are removed first so they cannot sit between a `\r` and a `\n`
    // and change how the text is split into lines.
    let without_nuls: String = text.chars().filter(|&ch| ch != '\0').collect();

    let mut joined = String::with_capacity(without_nuls.len());
    for (line_no, line) in without_nuls.lines().enumerate() {
        if line_no > 0 {
            joined.push('\n');
        }
        joined.push_str(line.trim_end());
    }

    joined.trim().to_owned()
}
273
274fn map_pdfium_error(err: PdfiumError) -> DocumentError {
275    DocumentError::PdfRasterFailed(err.to_string())
276}
277
/// Build the install guidance appended to "pdfium not found" errors.
///
/// Every variant points at the same download location. macOS, Linux and
/// Windows builds additionally say where to put the library file for that
/// platform; any other target gets the download sentence alone.
fn pdfium_install_hint() -> String {
    const DOWNLOAD_FROM: &str =
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries";

    // `cfg!` is resolved at compile time, so only one branch is ever live in
    // a given build.
    let placement = if cfg!(target_os = "macos") {
        Some(
            "and place `libpdfium.dylib` on DYLD_LIBRARY_PATH, in /usr/local/lib, \
             or next to your binary.",
        )
    } else if cfg!(target_os = "linux") {
        Some(
            "and place `libpdfium.so` on LD_LIBRARY_PATH, in /usr/local/lib, \
             or next to your binary.",
        )
    } else if cfg!(target_os = "windows") {
        Some("and place `pdfium.dll` on PATH or next to your executable.")
    } else {
        None
    };

    match placement {
        Some(instructions) => format!("{} {}", DOWNLOAD_FROM, instructions),
        None => format!("{}.", DOWNLOAD_FROM),
    }
}
296
#[cfg(test)]
mod tests {
    use super::*;

    /// `PdfRasterConfig::new` targets page 0 at 1240×1754 pixels.
    #[test]
    fn raster_config_defaults_to_first_page_150_dpi() {
        let defaults = PdfRasterConfig::new();
        assert_eq!(
            (defaults.page_index, defaults.width_px, defaults.height_px),
            (0, 1240, 1754)
        );
    }

    /// Whatever the build target, the install hint must say something.
    #[test]
    fn install_hint_is_non_empty() {
        let hint = pdfium_install_hint();
        assert!(!hint.is_empty());
    }

    /// NULs, trailing per-line whitespace and surrounding blank space all go.
    #[test]
    fn normalize_pdf_text_removes_nuls_and_outer_blank_space() {
        let normalized = normalize_pdf_text(" hello\0 \n\n");
        assert_eq!(normalized, "hello");
    }
}