1use std::io::Cursor;
18use std::path::Path;
19
20use image::ImageFormat;
21use pdfium_render::prelude::{PdfRenderConfig, Pdfium, PdfiumError};
22
23use crate::DocumentError;
24
/// Settings that control how a PDF page is turned into a bitmap.
///
/// Marked `#[non_exhaustive]`, so code outside this crate has to obtain a
/// value through [`PdfRasterConfig::new`] or `Default` rather than a struct
/// literal.
#[non_exhaustive]
#[derive(Clone, Copy, Debug)]
pub struct PdfRasterConfig {
    /// Target width of the rendered bitmap, in pixels.
    pub width_px: u32,
    /// Maximum height of the rendered bitmap, in pixels. A value of `0` means
    /// no height limit is handed to the renderer.
    pub height_px: u32,
    /// Zero-based index of the page to render. Honoured by
    /// `rasterize_first_page`; `extract_pages` walks every page and does not
    /// read this field.
    pub page_index: i32,
}
36
37impl PdfRasterConfig {
38 pub fn new() -> Self {
40 Self {
41 width_px: 1240,
42 height_px: 1754,
43 page_index: 0,
44 }
45 }
46}
47
48impl Default for PdfRasterConfig {
49 fn default() -> Self {
50 Self::new()
51 }
52}
53
/// One PDF page rendered to a PNG image, plus where it came from.
#[non_exhaustive]
#[derive(Clone, Debug)]
pub struct RasterizedPage {
    /// PNG-encoded image data for the page.
    pub png_bytes: Vec<u8>,
    /// Zero-based index of the page within its document.
    pub page_index: i32,
    /// Total number of pages in the source document.
    pub page_count: i32,
    /// Width of the rendered image in pixels, as reported by the decoded
    /// bitmap rather than copied from the request.
    pub width_px: u32,
    /// Height of the rendered image in pixels, as reported by the decoded
    /// bitmap rather than copied from the request.
    pub height_px: u32,
}
69
70impl RasterizedPage {
71 pub fn new(
73 png_bytes: Vec<u8>,
74 page_index: i32,
75 page_count: i32,
76 width_px: u32,
77 height_px: u32,
78 ) -> Self {
79 Self {
80 png_bytes,
81 page_index,
82 page_count,
83 width_px,
84 height_px,
85 }
86 }
87}
88
89#[non_exhaustive]
91#[derive(Debug, Clone)]
92pub enum PdfPagePayload {
93 VectorText {
95 text: String,
97 page_index: i32,
99 page_count: i32,
101 },
102 Raster(RasterizedPage),
104}
105
106impl PdfPagePayload {
107 pub fn page_index(&self) -> i32 {
109 match self {
110 Self::VectorText { page_index, .. } => *page_index,
111 Self::Raster(page) => page.page_index,
112 }
113 }
114
115 pub fn page_count(&self) -> i32 {
117 match self {
118 Self::VectorText { page_count, .. } => *page_count,
119 Self::Raster(page) => page.page_count,
120 }
121 }
122}
123
124pub fn rasterize_first_page(
133 path: &Path,
134 config: PdfRasterConfig,
135) -> Result<RasterizedPage, DocumentError> {
136 let bindings = Pdfium::bind_to_system_library().map_err(|err| {
137 DocumentError::PdfiumNotFound(format!("{}. {}", err, pdfium_install_hint()))
138 })?;
139 let pdfium = Pdfium::new(bindings);
140 let document = pdfium
141 .load_pdf_from_file(path, None)
142 .map_err(map_pdfium_error)?;
143 let pages = document.pages();
144 let page_count = pages.len();
145 if page_count == 0 {
146 return Err(DocumentError::PdfRasterFailed(
147 "input PDF contains zero pages".to_string(),
148 ));
149 }
150
151 if config.page_index < 0 || config.page_index >= page_count {
152 return Err(DocumentError::PdfRasterFailed(format!(
153 "requested page index {} but document has {} page(s)",
154 config.page_index, page_count
155 )));
156 }
157
158 let page = pages.get(config.page_index).map_err(map_pdfium_error)?;
159 let mut render_config = PdfRenderConfig::new().set_target_width(config.width_px as i32);
160 if config.height_px > 0 {
161 render_config = render_config.set_maximum_height(config.height_px as i32);
162 }
163 let bitmap = page
164 .render_with_config(&render_config)
165 .map_err(map_pdfium_error)?;
166 let dynamic_image = bitmap.as_image().map_err(map_pdfium_error)?;
167 let (width, height) = (dynamic_image.width(), dynamic_image.height());
168
169 let mut buf = Cursor::new(Vec::with_capacity(64 * 1024));
170 dynamic_image
171 .write_to(&mut buf, ImageFormat::Png)
172 .map_err(|err| DocumentError::PdfRasterFailed(format!("png encode failed: {err}")))?;
173
174 Ok(RasterizedPage {
175 png_bytes: buf.into_inner(),
176 page_index: config.page_index,
177 page_count,
178 width_px: width,
179 height_px: height,
180 })
181}
182
183pub fn extract_pages(
191 path: &Path,
192 config: PdfRasterConfig,
193) -> Result<Vec<PdfPagePayload>, DocumentError> {
194 let bindings = Pdfium::bind_to_system_library().map_err(|err| {
195 DocumentError::PdfiumNotFound(format!("{}. {}", err, pdfium_install_hint()))
196 })?;
197 let pdfium = Pdfium::new(bindings);
198 let document = pdfium
199 .load_pdf_from_file(path, None)
200 .map_err(map_pdfium_error)?;
201 let pages = document.pages();
202 let page_count = pages.len();
203 if page_count == 0 {
204 return Err(DocumentError::PdfRasterFailed(
205 "input PDF contains zero pages".to_string(),
206 ));
207 }
208
209 let mut out = Vec::with_capacity(page_count as usize);
210 for page_index in 0..page_count {
211 let page = pages.get(page_index).map_err(map_pdfium_error)?;
212 let text = page
213 .text()
214 .ok()
215 .map(|page_text| normalize_pdf_text(&page_text.all()))
216 .unwrap_or_default();
217 if !text.trim().is_empty() {
218 out.push(PdfPagePayload::VectorText {
219 text,
220 page_index,
221 page_count,
222 });
223 continue;
224 }
225
226 out.push(PdfPagePayload::Raster(render_page(
227 &page, page_index, page_count, config,
228 )?));
229 }
230
231 Ok(out)
232}
233
234fn render_page(
235 page: &pdfium_render::prelude::PdfPage<'_>,
236 page_index: i32,
237 page_count: i32,
238 config: PdfRasterConfig,
239) -> Result<RasterizedPage, DocumentError> {
240 let mut render_config = PdfRenderConfig::new().set_target_width(config.width_px as i32);
241 if config.height_px > 0 {
242 render_config = render_config.set_maximum_height(config.height_px as i32);
243 }
244 let bitmap = page
245 .render_with_config(&render_config)
246 .map_err(map_pdfium_error)?;
247 let dynamic_image = bitmap.as_image().map_err(map_pdfium_error)?;
248 let (width, height) = (dynamic_image.width(), dynamic_image.height());
249
250 let mut buf = Cursor::new(Vec::with_capacity(64 * 1024));
251 dynamic_image
252 .write_to(&mut buf, ImageFormat::Png)
253 .map_err(|err| DocumentError::PdfRasterFailed(format!("png encode failed: {err}")))?;
254
255 Ok(RasterizedPage {
256 png_bytes: buf.into_inner(),
257 page_index,
258 page_count,
259 width_px: width,
260 height_px: height,
261 })
262}
263
/// Cleans up text extracted from a PDF page.
///
/// NUL characters are removed, trailing whitespace is stripped from every
/// line, lines are re-joined with `\n`, and leading/trailing whitespace of
/// the whole result is trimmed. Interior blank lines and leading indentation
/// on interior lines are preserved.
fn normalize_pdf_text(text: &str) -> String {
    let without_nuls: String = text.chars().filter(|&ch| ch != '\0').collect();

    let mut joined = String::with_capacity(without_nuls.len());
    for (line_no, line) in without_nuls.lines().enumerate() {
        if line_no > 0 {
            joined.push('\n');
        }
        joined.push_str(line.trim_end());
    }

    joined.trim().to_owned()
}
273
274fn map_pdfium_error(err: PdfiumError) -> DocumentError {
275 DocumentError::PdfRasterFailed(err.to_string())
276}
277
/// Returns a human-readable hint on how to install the pdfium dynamic library
/// for the operating system this binary was compiled for.
///
/// macOS, Linux and Windows each get the library file name and search
/// locations; every other target gets the download link only.
fn pdfium_install_hint() -> String {
    // Pick the static message first, then allocate once at the end.
    let hint: &'static str = if cfg!(target_os = "macos") {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries \
         and place `libpdfium.dylib` on DYLD_LIBRARY_PATH, in /usr/local/lib, or next to your binary."
    } else if cfg!(target_os = "linux") {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries \
         and place `libpdfium.so` on LD_LIBRARY_PATH, in /usr/local/lib, or next to your binary."
    } else if cfg!(target_os = "windows") {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries \
         and place `pdfium.dll` on PATH or next to your executable."
    } else {
        "Download the pdfium dynamic library from https://github.com/bblanchon/pdfium-binaries."
    };

    String::from(hint)
}
296
#[cfg(test)]
mod tests {
    use super::*;

    // The stock configuration targets page 0 at 1240 x 1754 px.
    #[test]
    fn raster_config_defaults_to_first_page_150_dpi() {
        let PdfRasterConfig {
            width_px,
            height_px,
            page_index,
        } = PdfRasterConfig::new();
        assert_eq!(page_index, 0);
        assert_eq!(width_px, 1240);
        assert_eq!(height_px, 1754);
    }

    // Every supported target must yield some hint text.
    #[test]
    fn install_hint_is_non_empty() {
        let hint = pdfium_install_hint();
        assert!(!hint.is_empty());
    }

    // NULs are dropped and surrounding whitespace / blank lines are trimmed.
    #[test]
    fn normalize_pdf_text_removes_nuls_and_outer_blank_space() {
        let cleaned = normalize_pdf_text(" hello\0 \n\n");
        assert_eq!(cleaned, "hello");
    }
}