pdfium_auto/lib.rs
1//! # pdfium-auto
2//!
3//! Auto-download and cache [PDFium](https://pdfium.googlesource.com/pdfium/)
4//! binaries at runtime, so that users of `pdfium-render` no longer need to
5//! manually download libpdfium and set `DYLD_LIBRARY_PATH` / `LD_LIBRARY_PATH`.
6//!
7//! ## How it works
8//!
9//! On first call to [`bind_pdfium`] or [`ensure_pdfium_library`]:
10//!
//! 1. Checks the per-version cache directory (see [`pdfium_cache_dir`]; e.g.
//!    `~/.cache/pdf2md/pdfium-{VERSION}/` on Linux) for the platform library.
12//! 2. If absent, downloads the correct `.tgz` from
13//! [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries).
14//! 3. Extracts `lib/libpdfium.dylib` (or `.so` / `.dll`) to the cache dir.
15//! 4. Calls [`Pdfium::bind_to_library`] to load the real library.
16//!
17//! Subsequent calls skip the network entirely — the library is already cached.
18//!
19//! ## `bundled` feature — compile-time embedding
20//!
21//! For use-cases that require a fully self-contained binary (e.g., CI/CD
22//! distribution), the optional `bundled` feature embeds the pdfium shared
23//! library directly into the compiled executable.
24//!
25//! **Build steps:**
26//!
27//! ```sh
28//! # 1. Download and extract the platform archive (example: macOS arm64).
//! curl -L https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7690/pdfium-mac-arm64.tgz | tar xz
31//!
32//! # 2. Build with the bundled feature, pointing PDFIUM_BUNDLE_LIB at the lib.
33//! PDFIUM_BUNDLE_LIB=./lib/libpdfium.dylib \
34//! cargo build --release --features pdfium-auto/bundled
35//! ```
36//!
37//! At runtime, the embedded bytes are extracted to the cache directory on
38//! first use ([`ensure_pdfium_bundled`] / [`bind_bundled`]). The resulting
39//! binary ships without any external dependency on libpdfium or network access.
40//!
41//! **Trade-offs:**
42//!
43//! | | Runtime-download (`bind_pdfium`) | Compile-time-bundled (`bind_bundled`) |
44//! |--|--|--|
45//! | Binary size | ~5 MB | ~35 MB (+30 MB) |
46//! | First run | Downloads pdfium (~20 s) | Instant (already embedded) |
47//! | Net access required at runtime | Once (first run) | Never |
48//! | Net access required at compile time | No | No |
49//! | Cross-platform binary | N/A (same arch) | Same constraints |
50//!
51//! ## Usage
52//!
53//! ```rust,no_run
54//! use pdfium_auto::{bind_pdfium_silent, bind_pdfium_from_path, ensure_pdfium_library};
55//!
56//! // Option A: convenient one-shot bind (silent, no progress)
57//! let pdfium = bind_pdfium_silent().expect("PDFium unavailable");
58//!
59//! // Option B: download with progress, then bind
60//! let path = ensure_pdfium_library(Some(&|downloaded, total| {
61//! if let Some(t) = total {
62//! eprint!("\rDownloading PDFium: {}/{} bytes", downloaded, t);
63//! }
64//! })).expect("download failed");
65//! let pdfium = bind_pdfium_from_path(&path).expect("bind failed");
66//! ```
67//!
68//! ## Platform support
69//!
70//! | OS | Arch | Library |
71//! |---------|---------|-----------------------|
72//! | macOS | arm64 | `libpdfium.dylib` |
73//! | macOS | x86_64 | `libpdfium.dylib` |
74//! | Linux | x86_64 | `libpdfium.so` |
75//! | Linux | aarch64 | `libpdfium.so` |
76//! | Windows | x86_64 | `pdfium.dll` |
77//! | Windows | aarch64 | `pdfium.dll` |
78//! | Windows | x86 | `pdfium.dll` |
79//!
80//! ## Environment variable overrides
81//!
82//! - `PDFIUM_LIB_PATH` — path to an existing pdfium library; skips download.
83//! - `PDFIUM_AUTO_CACHE_DIR` — override the default cache directory.
84//! - `PDFIUM_BUNDLE_LIB` — (compile time) path to the dylib to embed when
85//! the `bundled` feature is active.
86
87use std::io::Read;
88use std::path::{Path, PathBuf};
89use std::sync::OnceLock;
90
91use pdfium_render::prelude::Pdfium;
92use thiserror::Error;
93
94// ── Public constants ─────────────────────────────────────────────────────────
95
/// Build number of the pdfium-binaries release that this crate downloads.
///
/// It is appended to `chromium%2F` when building the download URL and is part
/// of the cache directory name (`pdfium-{PDFIUM_VERSION}`). See
/// [`bblanchon/pdfium-binaries chromium/7690`](https://github.com/bblanchon/pdfium-binaries/releases/tag/chromium%2F7690).
pub const PDFIUM_VERSION: &str = "7690";
100
/// Prefix shared by every release-asset URL; the release tag and the archive
/// file name are appended to it when a download is needed.
const BASE_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download";
103
104// ── Error type ───────────────────────────────────────────────────────────────
105
106/// Errors returned by pdfium-auto operations.
107#[derive(Error, Debug)]
108pub enum PdfiumAutoError {
109 /// The current OS/architecture combination is not supported.
110 #[error("Unsupported platform: {os}/{arch}")]
111 UnsupportedPlatform { os: String, arch: String },
112
113 /// Could not create or navigate the local cache directory.
114 #[error("Cache directory error: {0}")]
115 CacheDir(#[source] std::io::Error),
116
117 /// Network download failed.
118 #[error("Download failed: {0}")]
119 Download(String),
120
121 /// gzip/tar extraction failed.
122 #[error("Archive extraction failed: {0}")]
123 Extract(String),
124
125 /// `libloading` / `pdfium-render` could not load the library.
126 #[error("Failed to bind PDFium from '{path}': {reason}")]
127 Bind { path: PathBuf, reason: String },
128}
129
130// ── Internal: platform metadata ──────────────────────────────────────────────
131
/// Static description of where the PDFium library lives for one platform.
struct PlatformInfo {
    /// Name of the release asset to download, e.g. `pdfium-mac-arm64.tgz`.
    archive_name: &'static str,
    /// Location of the shared library inside that archive, e.g.
    /// `lib/libpdfium.dylib`.
    lib_path_in_archive: &'static str,
    /// File name used for the library in the cache directory, e.g.
    /// `libpdfium.dylib`.
    lib_name: &'static str,
}
140
141fn detect_platform() -> Result<PlatformInfo, PdfiumAutoError> {
142 let os = std::env::consts::OS;
143 let arch = std::env::consts::ARCH;
144
145 match (os, arch) {
146 ("macos", "aarch64") => Ok(PlatformInfo {
147 archive_name: "pdfium-mac-arm64.tgz",
148 lib_path_in_archive: "lib/libpdfium.dylib",
149 lib_name: "libpdfium.dylib",
150 }),
151 ("macos", "x86_64") => Ok(PlatformInfo {
152 archive_name: "pdfium-mac-x64.tgz",
153 lib_path_in_archive: "lib/libpdfium.dylib",
154 lib_name: "libpdfium.dylib",
155 }),
156 ("linux", "x86_64") => Ok(PlatformInfo {
157 archive_name: "pdfium-linux-x64.tgz",
158 lib_path_in_archive: "lib/libpdfium.so",
159 lib_name: "libpdfium.so",
160 }),
161 ("linux", "aarch64") => Ok(PlatformInfo {
162 archive_name: "pdfium-linux-arm64.tgz",
163 lib_path_in_archive: "lib/libpdfium.so",
164 lib_name: "libpdfium.so",
165 }),
166 ("windows", "x86_64") => Ok(PlatformInfo {
167 archive_name: "pdfium-win-x64.tgz",
168 lib_path_in_archive: "bin/pdfium.dll",
169 lib_name: "pdfium.dll",
170 }),
171 ("windows", "aarch64") => Ok(PlatformInfo {
172 archive_name: "pdfium-win-arm64.tgz",
173 lib_path_in_archive: "bin/pdfium.dll",
174 lib_name: "pdfium.dll",
175 }),
176 ("windows", "x86") => Ok(PlatformInfo {
177 archive_name: "pdfium-win-x86.tgz",
178 lib_path_in_archive: "bin/pdfium.dll",
179 lib_name: "pdfium.dll",
180 }),
181 (os, arch) => Err(PdfiumAutoError::UnsupportedPlatform {
182 os: os.to_string(),
183 arch: arch.to_string(),
184 }),
185 }
186}
187
188// ── Cache directory resolution ───────────────────────────────────────────────
189
190/// Returns the per-version cache directory for the PDFium library.
191///
192/// Default locations:
193/// - **macOS**: `~/Library/Caches/pdf2md/pdfium-{VERSION}/`
194/// - **Linux**: `~/.cache/pdf2md/pdfium-{VERSION}/`
195/// - **Windows**: `%LOCALAPPDATA%\pdf2md\pdfium-{VERSION}\`
196///
197/// Override by setting `PDFIUM_AUTO_CACHE_DIR`.
198pub fn pdfium_cache_dir() -> PathBuf {
199 if let Ok(override_dir) = std::env::var("PDFIUM_AUTO_CACHE_DIR") {
200 return PathBuf::from(override_dir).join(format!("pdfium-{PDFIUM_VERSION}"));
201 }
202
203 let base = dirs::cache_dir()
204 .or_else(|| dirs::home_dir().map(|h| h.join(".cache")))
205 .unwrap_or_else(std::env::temp_dir);
206
207 base.join("pdf2md").join(format!("pdfium-{PDFIUM_VERSION}"))
208}
209
210// ── Thread-safe singleton path cache ─────────────────────────────────────────
211
/// Process-wide memo of the library path once it has been resolved, so that
/// later calls can return it without touching the environment or the disk.
static RESOLVED_PATH: OnceLock<PathBuf> = OnceLock::new();
213
214// ── Public API ───────────────────────────────────────────────────────────────
215
216/// Returns `true` if the PDFium library is already cached on disk (no network
217/// access needed on next call to [`ensure_pdfium_library`]).
218///
219/// Also returns `true` when `PDFIUM_LIB_PATH` points to an existing file.
220pub fn is_pdfium_cached() -> bool {
221 if let Ok(p) = std::env::var("PDFIUM_LIB_PATH") {
222 return PathBuf::from(p).exists();
223 }
224 if let Ok(info) = detect_platform() {
225 return pdfium_cache_dir().join(info.lib_name).exists();
226 }
227 false
228}
229
230/// Returns the on-disk path to the PDFium library, or `None` if not cached.
231pub fn cached_pdfium_path() -> Option<PathBuf> {
232 if let Ok(p) = std::env::var("PDFIUM_LIB_PATH") {
233 let pb = PathBuf::from(p);
234 if pb.exists() {
235 return Some(pb);
236 }
237 }
238 if let Ok(info) = detect_platform() {
239 let p = pdfium_cache_dir().join(info.lib_name);
240 if p.exists() {
241 return Some(p);
242 }
243 }
244 None
245}
246
247/// Ensures the PDFium dynamic library is present in the local cache.
248///
249/// - If `PDFIUM_LIB_PATH` is set (and the file exists), that path is used.
250/// - Otherwise, checks `pdfium_cache_dir()` for an existing library.
251/// - If absent, downloads the appropriate platform binary from GitHub
252/// and extracts it to the cache directory.
253///
254/// `on_progress` receives `(bytes_downloaded, total_size_option)` during
255/// the download. Pass `None` to suppress progress callbacks.
256///
257/// # Thread safety
258///
259/// Safe to call from multiple threads simultaneously; the download happens
260/// only once per process lifetime.
261pub fn ensure_pdfium_library(
262 on_progress: Option<&dyn Fn(u64, Option<u64>)>,
263) -> Result<PathBuf, PdfiumAutoError> {
264 // Fast path: already resolved in this process.
265 if let Some(path) = RESOLVED_PATH.get() {
266 return Ok(path.clone());
267 }
268
269 let path = resolve_or_download(on_progress)?;
270
271 // Best-effort cache in the OnceLock (ignore race; both will succeed).
272 let _ = RESOLVED_PATH.set(path.clone());
273
274 Ok(path)
275}
276
277/// Binds to PDFium, downloading it first if necessary.
278///
279/// `on_progress` receives `(bytes_downloaded, total_bytes_option)` during
280/// the initial download.
281pub fn bind_pdfium(
282 on_progress: Option<&dyn Fn(u64, Option<u64>)>,
283) -> Result<Pdfium, PdfiumAutoError> {
284 let lib_path = ensure_pdfium_library(on_progress)?;
285 bind_pdfium_from_path(&lib_path)
286}
287
288/// Binds to PDFium without any progress output.
289///
290/// Downloads and caches on first call if required.
291pub fn bind_pdfium_silent() -> Result<Pdfium, PdfiumAutoError> {
292 bind_pdfium(None)
293}
294
295/// Binds to a PDFium library at an explicit `path`.
296///
297/// Does not interact with the download / cache layer.
298pub fn bind_pdfium_from_path(path: &Path) -> Result<Pdfium, PdfiumAutoError> {
299 Pdfium::bind_to_library(path)
300 .map(Pdfium::new)
301 .map_err(|e| PdfiumAutoError::Bind {
302 path: path.to_path_buf(),
303 reason: e.to_string(),
304 })
305}
306
307// ── Bundled feature ───────────────────────────────────────────────────────────
308//
309// When compiled with `--features bundled` (and `PDFIUM_BUNDLE_LIB` set at
310// build time), the pdfium shared library bytes are embedded directly in the
311// binary via `include_bytes!`. At first use the bytes are written to the
312// standard cache directory and loaded from there. Subsequent runs reuse
313// the cached copy and skip the write.
314//
315// Build workflow:
316//
317// # 1. Download the platform archive from bblanchon/pdfium-binaries and
318// # extract the shared library to a local path.
319// curl -L https://github.com/bblanchon/pdfium-binaries/releases/download/\
320// chromium%2F7690/pdfium-mac-arm64.tgz | tar xz
321//
322// # 2. Build with the bundled feature enabled, pointing PDFIUM_BUNDLE_LIB
323// # at the extracted library.
324// PDFIUM_BUNDLE_LIB=./lib/libpdfium.dylib \
325// cargo build --release --features pdfium-auto/bundled
326//
327// ─────────────────────────────────────────────────────────────────────────────
328
#[cfg(feature = "bundled")]
mod bundled_lib {
    // Pulls in `bundled.rs` from `OUT_DIR`. That file is generated by
    // build.rs and defines:
    //     pub static PDFIUM_BYTES: &[u8] = include_bytes!("bundled_pdfium_lib");
    include!(concat!(env!("OUT_DIR"), "/bundled.rs"));
}
335
/// Ensures the embedded PDFium library is extracted to the local cache and
/// returns its on-disk path.
///
/// This function is only available when the crate is compiled with the
/// `bundled` feature. The shared library bytes are embedded in the binary
/// at compile time (via `PDFIUM_BUNDLE_LIB`); on first call they are written
/// to `pdfium_cache_dir()` so that the OS can load them. Subsequent calls in
/// the same process return the memoized path without any I/O.
///
/// The cached file is (re)written when it is missing or when its size differs
/// from the embedded bytes, so a truncated or stale copy is replaced. The
/// bytes are first written to a staging file next to the destination and then
/// renamed into place, so other readers never observe a half-written library.
///
/// The result is memoized separately from [`ensure_pdfium_library`], so a
/// path resolved there (for example via `PDFIUM_LIB_PATH`) is never returned
/// in place of the embedded library, and vice versa.
///
/// # Errors
///
/// Returns [`PdfiumAutoError::UnsupportedPlatform`] if the target platform is
/// unknown, [`PdfiumAutoError::CacheDir`] if the cache directory cannot be
/// created or the file permissions cannot be adjusted, or
/// [`PdfiumAutoError::Extract`] if writing the library fails.
#[cfg(feature = "bundled")]
pub fn ensure_pdfium_bundled() -> Result<PathBuf, PdfiumAutoError> {
    // Dedicated memo: must not be shared with the download/override path.
    static BUNDLED_PATH: OnceLock<PathBuf> = OnceLock::new();
    // Serializes extraction between threads of this process.
    static EXTRACT_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    // Fast path: already extracted in this process.
    if let Some(path) = BUNDLED_PATH.get() {
        return Ok(path.clone());
    }

    // The guarded data is `()`, so a poisoned lock can safely be reused.
    let _guard = EXTRACT_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if let Some(path) = BUNDLED_PATH.get() {
        return Ok(path.clone());
    }

    let info = detect_platform()?;
    let cache_dir = pdfium_cache_dir();
    let lib_path = cache_dir.join(info.lib_name);
    let embedded: &[u8] = bundled_lib::PDFIUM_BYTES;

    // A size mismatch means the file is truncated or comes from another build.
    let up_to_date = std::fs::metadata(&lib_path)
        .map(|meta| meta.is_file() && meta.len() == embedded.len() as u64)
        .unwrap_or(false);

    if !up_to_date {
        std::fs::create_dir_all(&cache_dir).map_err(PdfiumAutoError::CacheDir)?;

        let write_failed = |e: std::io::Error| {
            PdfiumAutoError::Extract(format!(
                "Failed to write bundled pdfium to {}: {}",
                lib_path.display(),
                e
            ))
        };

        // Per-process staging name so that concurrent processes do not write
        // to the same temporary file.
        let staging = cache_dir.join(format!("{}.{}.tmp", info.lib_name, std::process::id()));

        let stage = || -> Result<(), PdfiumAutoError> {
            std::fs::write(&staging, embedded).map_err(&write_failed)?;

            // On Unix, ensure the shared library is executable so the dynamic
            // linker accepts it.
            #[cfg(unix)]
            {
                use std::os::unix::fs::PermissionsExt;
                let mut perms = std::fs::metadata(&staging)
                    .map_err(PdfiumAutoError::CacheDir)?
                    .permissions();
                perms.set_mode(perms.mode() | 0o755);
                std::fs::set_permissions(&staging, perms).map_err(PdfiumAutoError::CacheDir)?;
            }

            Ok(())
        };

        if let Err(err) = stage() {
            let _ = std::fs::remove_file(&staging);
            return Err(err);
        }

        if let Err(e) = std::fs::rename(&staging, &lib_path) {
            let _ = std::fs::remove_file(&staging);
            // The destination may be in use by another process (notably on
            // Windows). An existing library is good enough to proceed with.
            if !lib_path.is_file() {
                return Err(write_failed(e));
            }
        }
    }

    let _ = BUNDLED_PATH.set(lib_path.clone());
    Ok(lib_path)
}
387
/// Binds to the PDFium library that was embedded at compile time.
///
/// The embedded bytes are materialized in the local cache directory by
/// [`ensure_pdfium_bundled`] and the resulting file is then loaded with
/// [`bind_pdfium_from_path`]. No network access is involved.
///
/// This function is only available when the crate is compiled with the
/// `bundled` feature.
#[cfg(feature = "bundled")]
pub fn bind_bundled() -> Result<Pdfium, PdfiumAutoError> {
    ensure_pdfium_bundled().and_then(|library| bind_pdfium_from_path(&library))
}
400
401// ── Internal helpers ─────────────────────────────────────────────────────────
402
403fn resolve_or_download(
404 on_progress: Option<&dyn Fn(u64, Option<u64>)>,
405) -> Result<PathBuf, PdfiumAutoError> {
406 // 1. Environment variable override.
407 if let Ok(env_path) = std::env::var("PDFIUM_LIB_PATH") {
408 let p = PathBuf::from(env_path);
409 if p.exists() {
410 return Ok(p);
411 }
412 // Fall through: env var set but file missing → still auto-download.
413 eprintln!(
414 "pdfium-auto: PDFIUM_LIB_PATH '{}' not found; downloading …",
415 p.display()
416 );
417 }
418
419 let info = detect_platform()?;
420 let cache_dir = pdfium_cache_dir();
421 let lib_path = cache_dir.join(info.lib_name);
422
423 // 2. Already cached on disk.
424 if lib_path.exists() {
425 return Ok(lib_path);
426 }
427
428 // 3. Download and extract.
429 let url = format!(
430 "{}/chromium%2F{}/{}",
431 BASE_URL, PDFIUM_VERSION, info.archive_name
432 );
433
434 std::fs::create_dir_all(&cache_dir).map_err(PdfiumAutoError::CacheDir)?;
435
436 let archive_bytes = download_bytes(&url, on_progress)?;
437 extract_library(&archive_bytes, info.lib_path_in_archive, &lib_path)?;
438
439 Ok(lib_path)
440}
441
442/// Streams a URL into a `Vec<u8>`, calling `on_progress` every 64 KiB.
443fn download_bytes(
444 url: &str,
445 on_progress: Option<&dyn Fn(u64, Option<u64>)>,
446) -> Result<Vec<u8>, PdfiumAutoError> {
447 let client = reqwest::blocking::Client::builder()
448 .user_agent(concat!("pdfium-auto/", env!("CARGO_PKG_VERSION")))
449 .redirect(reqwest::redirect::Policy::limited(5))
450 .build()
451 .map_err(|e| PdfiumAutoError::Download(e.to_string()))?;
452
453 let response = client
454 .get(url)
455 .send()
456 .map_err(|e| PdfiumAutoError::Download(format!("GET {url}: {e}")))?;
457
458 if !response.status().is_success() {
459 return Err(PdfiumAutoError::Download(format!(
460 "HTTP {} for {url}",
461 response.status()
462 )));
463 }
464
465 let total = response.content_length();
466 let capacity = total.unwrap_or(35 * 1024 * 1024) as usize;
467 let mut buf = Vec::with_capacity(capacity);
468
469 let mut stream = response;
470 let mut chunk = vec![0u8; 64 * 1024]; // 64 KiB
471 let mut downloaded: u64 = 0;
472
473 loop {
474 match stream.read(&mut chunk) {
475 Ok(0) => break,
476 Ok(n) => {
477 buf.extend_from_slice(&chunk[..n]);
478 downloaded += n as u64;
479 if let Some(cb) = on_progress {
480 cb(downloaded, total);
481 }
482 }
483 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
484 Err(e) => {
485 return Err(PdfiumAutoError::Download(format!("Read error: {e}")));
486 }
487 }
488 }
489
490 Ok(buf)
491}
492
493/// Extracts a single file from a gzipped tar archive into `dest_path`.
494fn extract_library(
495 archive_bytes: &[u8],
496 lib_path_in_archive: &str,
497 dest_path: &Path,
498) -> Result<(), PdfiumAutoError> {
499 use flate2::read::GzDecoder;
500 use tar::Archive;
501
502 let gz = GzDecoder::new(archive_bytes);
503 let mut archive = Archive::new(gz);
504
505 for entry in archive
506 .entries()
507 .map_err(|e| PdfiumAutoError::Extract(e.to_string()))?
508 {
509 let mut entry = entry.map_err(|e| PdfiumAutoError::Extract(e.to_string()))?;
510 let entry_path = entry
511 .path()
512 .map_err(|e| PdfiumAutoError::Extract(e.to_string()))?;
513
514 let entry_str = entry_path.to_string_lossy();
515 if entry_str == lib_path_in_archive {
516 entry
517 .unpack(dest_path)
518 .map_err(|e| PdfiumAutoError::Extract(format!("Unpack failed: {e}")))?;
519 return Ok(());
520 }
521 }
522
523 Err(PdfiumAutoError::Extract(format!(
524 "Library '{}' not found in archive",
525 lib_path_in_archive
526 )))
527}
528
529// ── Tests ─────────────────────────────────────────────────────────────────────
530
#[cfg(test)]
mod tests {
    use super::*;

    /// Guards `PDFIUM_AUTO_CACHE_DIR`. The test harness runs tests on several
    /// threads, and environment variables are process-wide, so every test that
    /// reads or changes the variable takes this lock first.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquires [`ENV_LOCK`], ignoring poisoning left behind by a failed test.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    #[test]
    fn detect_platform_is_supported() {
        // Verify the current platform is recognised.
        detect_platform().expect("current platform should be supported");
    }

    #[test]
    fn cache_dir_is_deterministic() {
        // Keeps the override test from changing the variable between calls.
        let _env = lock_env();

        let d1 = pdfium_cache_dir();
        let d2 = pdfium_cache_dir();
        assert_eq!(d1, d2);
        assert!(d1.to_str().unwrap().contains("pdf2md"));
        assert!(d1.to_str().unwrap().contains(PDFIUM_VERSION));
    }

    #[test]
    fn cache_dir_override_via_env() {
        let _env = lock_env();

        // Remember any pre-existing value so it can be put back afterwards.
        let previous = std::env::var_os("PDFIUM_AUTO_CACHE_DIR");

        std::env::set_var("PDFIUM_AUTO_CACHE_DIR", "/tmp/test_pdf2md_override");
        let d = pdfium_cache_dir();
        match previous {
            Some(value) => std::env::set_var("PDFIUM_AUTO_CACHE_DIR", value),
            None => std::env::remove_var("PDFIUM_AUTO_CACHE_DIR"),
        }

        assert!(d.starts_with("/tmp/test_pdf2md_override"));
        assert!(d.to_str().unwrap().contains(PDFIUM_VERSION));
    }

    #[test]
    fn platform_info_fields_nonempty() {
        let info = detect_platform().unwrap();
        assert!(!info.archive_name.is_empty());
        assert!(!info.lib_path_in_archive.is_empty());
        assert!(!info.lib_name.is_empty());
    }
}
566}