use std::io::Cursor;
use std::path::Path;
use image::ImageFormat;
use pdfium_render::prelude::{PdfRenderConfig, Pdfium, PdfiumError};
use crate::DocumentError;
/// Options controlling how a PDF page is rendered to a bitmap.
///
/// The defaults (1240 x 1754 px, page 0) correspond to roughly an A4 page at
/// 150 DPI.
#[non_exhaustive]
#[derive(Debug, Clone, Copy)]
pub struct PdfRasterConfig {
    /// Target width of the rendered bitmap, in pixels.
    pub width_px: u32,
    /// Maximum height of the rendered bitmap, in pixels. A value of `0` means
    /// no maximum height is applied to the render configuration.
    pub height_px: u32,
    /// Zero-based index of the page rendered by `rasterize_first_page`.
    /// `extract_pages` walks every page and does not read this field.
    pub page_index: i32,
}

impl PdfRasterConfig {
    /// Creates a configuration populated with the default values.
    pub fn new() -> Self {
        Self::default()
    }
}

impl Default for PdfRasterConfig {
    /// Returns the default configuration: 1240 x 1754 px, first page.
    fn default() -> Self {
        Self {
            width_px: 1240,
            height_px: 1754,
            page_index: 0,
        }
    }
}
/// A single PDF page rendered to a PNG image, plus its position in the
/// source document.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct RasterizedPage {
    /// PNG-encoded image data.
    pub png_bytes: Vec<u8>,
    /// Zero-based index of this page within the document.
    pub page_index: i32,
    /// Total number of pages in the document the page came from.
    pub page_count: i32,
    /// Width of the rendered image, in pixels.
    pub width_px: u32,
    /// Height of the rendered image, in pixels.
    pub height_px: u32,
}

impl RasterizedPage {
    /// Assembles a `RasterizedPage` from its parts.
    ///
    /// The type is `#[non_exhaustive]`, so code outside this crate has to go
    /// through this constructor rather than a struct literal.
    pub fn new(
        png_bytes: Vec<u8>,
        page_index: i32,
        page_count: i32,
        width_px: u32,
        height_px: u32,
    ) -> Self {
        Self { png_bytes, page_index, page_count, width_px, height_px }
    }
}
#[non_exhaustive]
#[derive(Debug, Clone)]
pub enum PdfPagePayload {
VectorText {
text: String,
page_index: i32,
page_count: i32,
},
Raster(RasterizedPage),
}
impl PdfPagePayload {
pub fn page_index(&self) -> i32 {
match self {
Self::VectorText { page_index, .. } => *page_index,
Self::Raster(page) => page.page_index,
}
}
pub fn page_count(&self) -> i32 {
match self {
Self::VectorText { page_count, .. } => *page_count,
Self::Raster(page) => page.page_count,
}
}
}
pub fn rasterize_first_page(
path: &Path,
config: PdfRasterConfig,
) -> Result<RasterizedPage, DocumentError> {
let bindings = Pdfium::bind_to_system_library().map_err(|err| {
DocumentError::PdfiumNotFound(format!("{}. {}", err, pdfium_install_hint()))
})?;
let pdfium = Pdfium::new(bindings);
let document = pdfium
.load_pdf_from_file(path, None)
.map_err(map_pdfium_error)?;
let pages = document.pages();
let page_count = pages.len();
if page_count == 0 {
return Err(DocumentError::PdfRasterFailed(
"input PDF contains zero pages".to_string(),
));
}
if config.page_index < 0 || config.page_index >= page_count {
return Err(DocumentError::PdfRasterFailed(format!(
"requested page index {} but document has {} page(s)",
config.page_index, page_count
)));
}
let page = pages.get(config.page_index).map_err(map_pdfium_error)?;
let mut render_config = PdfRenderConfig::new().set_target_width(config.width_px as i32);
if config.height_px > 0 {
render_config = render_config.set_maximum_height(config.height_px as i32);
}
let bitmap = page
.render_with_config(&render_config)
.map_err(map_pdfium_error)?;
let dynamic_image = bitmap.as_image().map_err(map_pdfium_error)?;
let (width, height) = (dynamic_image.width(), dynamic_image.height());
let mut buf = Cursor::new(Vec::with_capacity(64 * 1024));
dynamic_image
.write_to(&mut buf, ImageFormat::Png)
.map_err(|err| DocumentError::PdfRasterFailed(format!("png encode failed: {err}")))?;
Ok(RasterizedPage {
png_bytes: buf.into_inner(),
page_index: config.page_index,
page_count,
width_px: width,
height_px: height,
})
}
/// Extracts every page of the PDF at `path`, in page order.
///
/// A page whose extracted text is non-empty after normalization becomes a
/// `PdfPagePayload::VectorText`. Every other page, including one whose text
/// extraction fails, is rendered to PNG using `config` and becomes a
/// `PdfPagePayload::Raster`. `config.page_index` is not consulted.
///
/// # Errors
///
/// * `DocumentError::PdfiumNotFound` if the system pdfium library cannot be
///   bound; the message includes a platform-specific install hint.
/// * `DocumentError::PdfRasterFailed` if the document cannot be loaded, has no
///   pages, a page cannot be fetched, or rendering a page fails.
pub fn extract_pages(
    path: &Path,
    config: PdfRasterConfig,
) -> Result<Vec<PdfPagePayload>, DocumentError> {
    let pdfium = match Pdfium::bind_to_system_library() {
        Ok(bindings) => Pdfium::new(bindings),
        Err(err) => {
            return Err(DocumentError::PdfiumNotFound(format!(
                "{}. {}",
                err,
                pdfium_install_hint()
            )));
        }
    };
    let document = pdfium
        .load_pdf_from_file(path, None)
        .map_err(map_pdfium_error)?;
    let pages = document.pages();
    let page_count = pages.len();
    if page_count == 0 {
        return Err(DocumentError::PdfRasterFailed(
            "input PDF contains zero pages".to_string(),
        ));
    }

    let mut payloads = Vec::with_capacity(page_count as usize);
    for page_index in 0..page_count {
        let page = pages.get(page_index).map_err(map_pdfium_error)?;

        // Best effort: a page whose text cannot be read is treated as having
        // no text, which routes it to the raster branch below.
        let text = match page.text() {
            Ok(page_text) => normalize_pdf_text(&page_text.all()),
            Err(_) => String::new(),
        };

        let payload = if text.trim().is_empty() {
            PdfPagePayload::Raster(render_page(&page, page_index, page_count, config)?)
        } else {
            PdfPagePayload::VectorText {
                text,
                page_index,
                page_count,
            }
        };
        payloads.push(payload);
    }
    Ok(payloads)
}
fn render_page(
page: &pdfium_render::prelude::PdfPage<'_>,
page_index: i32,
page_count: i32,
config: PdfRasterConfig,
) -> Result<RasterizedPage, DocumentError> {
let mut render_config = PdfRenderConfig::new().set_target_width(config.width_px as i32);
if config.height_px > 0 {
render_config = render_config.set_maximum_height(config.height_px as i32);
}
let bitmap = page
.render_with_config(&render_config)
.map_err(map_pdfium_error)?;
let dynamic_image = bitmap.as_image().map_err(map_pdfium_error)?;
let (width, height) = (dynamic_image.width(), dynamic_image.height());
let mut buf = Cursor::new(Vec::with_capacity(64 * 1024));
dynamic_image
.write_to(&mut buf, ImageFormat::Png)
.map_err(|err| DocumentError::PdfRasterFailed(format!("png encode failed: {err}")))?;
Ok(RasterizedPage {
png_bytes: buf.into_inner(),
page_index,
page_count,
width_px: width,
height_px: height,
})
}
/// Cleans up text extracted from a PDF page.
///
/// NUL characters are removed, trailing whitespace is stripped from every
/// line, line endings are unified to `\n`, and leading/trailing whitespace is
/// trimmed from the result as a whole. Leading whitespace on interior lines is
/// kept.
fn normalize_pdf_text(text: &str) -> String {
    let without_nuls: String = text.chars().filter(|&ch| ch != '\0').collect();

    let mut joined = String::with_capacity(without_nuls.len());
    for (line_no, line) in without_nuls.lines().enumerate() {
        // Separator goes between lines only, never after the last one.
        if line_no > 0 {
            joined.push('\n');
        }
        joined.push_str(line.trim_end());
    }

    joined.trim().to_owned()
}
/// Converts a pdfium error into `DocumentError::PdfRasterFailed`, carrying the
/// error's display text as the message.
fn map_pdfium_error(err: PdfiumError) -> DocumentError {
    let message = err.to_string();
    DocumentError::PdfRasterFailed(message)
}
/// Returns a hint on where to obtain the pdfium dynamic library and where to
/// put it, worded for the operating system this binary was compiled for.
fn pdfium_install_hint() -> String {
    // Pick the static text first, then allocate once at the end.
    let hint: &'static str = if cfg!(target_os = "macos") {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries \
         and place `libpdfium.dylib` on DYLD_LIBRARY_PATH, in /usr/local/lib, or next to your binary."
    } else if cfg!(target_os = "linux") {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries \
         and place `libpdfium.so` on LD_LIBRARY_PATH, in /usr/local/lib, or next to your binary."
    } else if cfg!(target_os = "windows") {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries \
         and place `pdfium.dll` on PATH or next to your executable."
    } else {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries."
    };
    String::from(hint)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration targets page 0 at 1240 x 1754 px.
    #[test]
    fn raster_config_defaults_to_first_page_150_dpi() {
        let defaults = PdfRasterConfig::new();
        assert_eq!(
            (defaults.page_index, defaults.width_px, defaults.height_px),
            (0, 1240, 1754)
        );
    }

    /// Whatever the platform, the install hint must carry some text.
    #[test]
    fn install_hint_is_non_empty() {
        let hint = pdfium_install_hint();
        assert!(!hint.is_empty());
    }

    /// NULs, trailing blanks and surrounding blank lines are all dropped.
    #[test]
    fn normalize_pdf_text_removes_nuls_and_outer_blank_space() {
        let raw = " hello\0 \n\n";
        let cleaned = normalize_pdf_text(raw);
        assert_eq!(cleaned, "hello");
    }
}