Skip to main content

pdfium_auto/
lib.rs

1//! # pdfium-auto
2//!
3//! Auto-download and cache [PDFium](https://pdfium.googlesource.com/pdfium/)
4//! binaries at runtime, so that users of `pdfium-render` no longer need to
5//! manually download libpdfium and set `DYLD_LIBRARY_PATH` / `LD_LIBRARY_PATH`.
6//!
7//! ## How it works
8//!
9//! On first call to [`bind_pdfium`] or [`ensure_pdfium_library`]:
10//!
//! 1. Checks the per-version cache directory returned by [`pdfium_cache_dir`]
//!    (e.g. `~/.cache/pdf2md/pdfium-{VERSION}/` on Linux) for the platform library.
12//! 2. If absent, downloads the correct `.tgz` from
13//!    [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries).
14//! 3. Extracts `lib/libpdfium.dylib` (or `.so` / `.dll`) to the cache dir.
15//! 4. Calls [`Pdfium::bind_to_library`] to load the real library.
16//!
17//! Subsequent calls skip the network entirely — the library is already cached.
18//!
19//! ## `bundled` feature — compile-time embedding
20//!
21//! For use-cases that require a fully self-contained binary (e.g., CI/CD
22//! distribution), the optional `bundled` feature embeds the pdfium shared
23//! library directly into the compiled executable.
24//!
25//! **Build steps:**
26//!
27//! ```sh
28//! # 1. Download and extract the platform archive (example: macOS arm64).
//! curl -L https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7690/pdfium-mac-arm64.tgz \
//!   | tar xz
31//!
32//! # 2. Build with the bundled feature, pointing PDFIUM_BUNDLE_LIB at the lib.
33//! PDFIUM_BUNDLE_LIB=./lib/libpdfium.dylib \
34//!   cargo build --release --features pdfium-auto/bundled
35//! ```
36//!
37//! At runtime, the embedded bytes are extracted to the cache directory on
38//! first use ([`ensure_pdfium_bundled`] / [`bind_bundled`]).  The resulting
39//! binary ships without any external dependency on libpdfium or network access.
40//!
41//! **Trade-offs:**
42//!
43//! | | Runtime-download (`bind_pdfium`) | Compile-time-bundled (`bind_bundled`) |
44//! |--|--|--|
45//! | Binary size | ~5 MB | ~35 MB (+30 MB) |
46//! | First run | Downloads pdfium (~20 s) | Instant (already embedded) |
47//! | Net access required at runtime | Once (first run) | Never |
48//! | Net access required at compile time | No | No |
49//! | Cross-platform binary | N/A (same arch) | Same constraints |
50//!
51//! ## Usage
52//!
53//! ```rust,no_run
54//! use pdfium_auto::{bind_pdfium_silent, bind_pdfium_from_path, ensure_pdfium_library};
55//!
56//! // Option A: convenient one-shot bind (silent, no progress)
57//! let pdfium = bind_pdfium_silent().expect("PDFium unavailable");
58//!
59//! // Option B: download with progress, then bind
60//! let path = ensure_pdfium_library(Some(&|downloaded, total| {
61//!     if let Some(t) = total {
62//!         eprint!("\rDownloading PDFium: {}/{} bytes", downloaded, t);
63//!     }
64//! })).expect("download failed");
65//! let pdfium = bind_pdfium_from_path(&path).expect("bind failed");
66//! ```
67//!
68//! ## Platform support
69//!
70//! | OS      | Arch    | Library               |
71//! |---------|---------|-----------------------|
72//! | macOS   | arm64   | `libpdfium.dylib`     |
73//! | macOS   | x86_64  | `libpdfium.dylib`     |
74//! | Linux   | x86_64  | `libpdfium.so`        |
75//! | Linux   | aarch64 | `libpdfium.so`        |
76//! | Windows | x86_64  | `pdfium.dll`          |
77//! | Windows | aarch64 | `pdfium.dll`          |
78//! | Windows | x86     | `pdfium.dll`          |
79//!
80//! ## Environment variable overrides
81//!
82//! - `PDFIUM_LIB_PATH` — path to an existing pdfium library; skips download.
83//! - `PDFIUM_AUTO_CACHE_DIR` — override the default cache directory.
84//! - `PDFIUM_BUNDLE_LIB` — (compile time) path to the dylib to embed when
85//!   the `bundled` feature is active.
86
87use std::io::Read;
88use std::path::{Path, PathBuf};
89use std::sync::OnceLock;
90
91use pdfium_render::prelude::Pdfium;
92use thiserror::Error;
93
94// ── Public constants ─────────────────────────────────────────────────────────
95
96/// The pdfium-binaries release tag used for downloads.
97///
98/// Maps to [`bblanchon/pdfium-binaries chromium/7690`](https://github.com/bblanchon/pdfium-binaries/releases/tag/chromium%2F7690).
99pub const PDFIUM_VERSION: &str = "7690";
100
101/// GitHub release base URL.
102const BASE_URL: &str = "https://github.com/bblanchon/pdfium-binaries/releases/download";
103
104// ── Error type ───────────────────────────────────────────────────────────────
105
106/// Errors returned by pdfium-auto operations.
107#[derive(Error, Debug)]
108pub enum PdfiumAutoError {
109    /// The current OS/architecture combination is not supported.
110    #[error("Unsupported platform: {os}/{arch}")]
111    UnsupportedPlatform { os: String, arch: String },
112
113    /// Could not create or navigate the local cache directory.
114    #[error("Cache directory error: {0}")]
115    CacheDir(#[source] std::io::Error),
116
117    /// Network download failed.
118    #[error("Download failed: {0}")]
119    Download(String),
120
121    /// gzip/tar extraction failed.
122    #[error("Archive extraction failed: {0}")]
123    Extract(String),
124
125    /// `libloading` / `pdfium-render` could not load the library.
126    #[error("Failed to bind PDFium from '{path}': {reason}")]
127    Bind { path: PathBuf, reason: String },
128}
129
130// ── Internal: platform metadata ──────────────────────────────────────────────
131
/// Static description of where to find PDFium for one OS/architecture pair.
struct PlatformInfo {
    /// Name of the release asset to download, e.g. `pdfium-mac-arm64.tgz`.
    archive_name: &'static str,
    /// Path of the shared library inside that archive, e.g.
    /// `lib/libpdfium.dylib`.
    lib_path_in_archive: &'static str,
    /// File name the library is stored under in the cache directory, e.g.
    /// `libpdfium.dylib`.
    lib_name: &'static str,
}
140
141fn detect_platform() -> Result<PlatformInfo, PdfiumAutoError> {
142    let os = std::env::consts::OS;
143    let arch = std::env::consts::ARCH;
144
145    match (os, arch) {
146        ("macos", "aarch64") => Ok(PlatformInfo {
147            archive_name: "pdfium-mac-arm64.tgz",
148            lib_path_in_archive: "lib/libpdfium.dylib",
149            lib_name: "libpdfium.dylib",
150        }),
151        ("macos", "x86_64") => Ok(PlatformInfo {
152            archive_name: "pdfium-mac-x64.tgz",
153            lib_path_in_archive: "lib/libpdfium.dylib",
154            lib_name: "libpdfium.dylib",
155        }),
156        ("linux", "x86_64") => Ok(PlatformInfo {
157            archive_name: "pdfium-linux-x64.tgz",
158            lib_path_in_archive: "lib/libpdfium.so",
159            lib_name: "libpdfium.so",
160        }),
161        ("linux", "aarch64") => Ok(PlatformInfo {
162            archive_name: "pdfium-linux-arm64.tgz",
163            lib_path_in_archive: "lib/libpdfium.so",
164            lib_name: "libpdfium.so",
165        }),
166        ("windows", "x86_64") => Ok(PlatformInfo {
167            archive_name: "pdfium-win-x64.tgz",
168            lib_path_in_archive: "bin/pdfium.dll",
169            lib_name: "pdfium.dll",
170        }),
171        ("windows", "aarch64") => Ok(PlatformInfo {
172            archive_name: "pdfium-win-arm64.tgz",
173            lib_path_in_archive: "bin/pdfium.dll",
174            lib_name: "pdfium.dll",
175        }),
176        ("windows", "x86") => Ok(PlatformInfo {
177            archive_name: "pdfium-win-x86.tgz",
178            lib_path_in_archive: "bin/pdfium.dll",
179            lib_name: "pdfium.dll",
180        }),
181        (os, arch) => Err(PdfiumAutoError::UnsupportedPlatform {
182            os: os.to_string(),
183            arch: arch.to_string(),
184        }),
185    }
186}
187
188// ── Cache directory resolution ───────────────────────────────────────────────
189
190/// Returns the per-version cache directory for the PDFium library.
191///
192/// Default locations:
193/// - **macOS**: `~/Library/Caches/pdf2md/pdfium-{VERSION}/`
194/// - **Linux**: `~/.cache/pdf2md/pdfium-{VERSION}/`
195/// - **Windows**: `%LOCALAPPDATA%\pdf2md\pdfium-{VERSION}\`
196///
197/// Override by setting `PDFIUM_AUTO_CACHE_DIR`.
198pub fn pdfium_cache_dir() -> PathBuf {
199    if let Ok(override_dir) = std::env::var("PDFIUM_AUTO_CACHE_DIR") {
200        return PathBuf::from(override_dir).join(format!("pdfium-{PDFIUM_VERSION}"));
201    }
202
203    let base = dirs::cache_dir()
204        .or_else(|| dirs::home_dir().map(|h| h.join(".cache")))
205        .unwrap_or_else(std::env::temp_dir);
206
207    base.join("pdf2md").join(format!("pdfium-{PDFIUM_VERSION}"))
208}
209
210// ── Thread-safe singleton path cache ─────────────────────────────────────────
211
/// Process-wide memo of the library path found by the first successful
/// `ensure_*` call; later calls return a clone of it without re-resolving.
static RESOLVED_PATH: OnceLock<PathBuf> = OnceLock::new();
213
214// ── Public API ───────────────────────────────────────────────────────────────
215
216/// Returns `true` if the PDFium library is already cached on disk (no network
217/// access needed on next call to [`ensure_pdfium_library`]).
218///
219/// Also returns `true` when `PDFIUM_LIB_PATH` points to an existing file.
220pub fn is_pdfium_cached() -> bool {
221    if let Ok(p) = std::env::var("PDFIUM_LIB_PATH") {
222        return PathBuf::from(p).exists();
223    }
224    if let Ok(info) = detect_platform() {
225        return pdfium_cache_dir().join(info.lib_name).exists();
226    }
227    false
228}
229
230/// Returns the on-disk path to the PDFium library, or `None` if not cached.
231pub fn cached_pdfium_path() -> Option<PathBuf> {
232    if let Ok(p) = std::env::var("PDFIUM_LIB_PATH") {
233        let pb = PathBuf::from(p);
234        if pb.exists() {
235            return Some(pb);
236        }
237    }
238    if let Ok(info) = detect_platform() {
239        let p = pdfium_cache_dir().join(info.lib_name);
240        if p.exists() {
241            return Some(p);
242        }
243    }
244    None
245}
246
247/// Ensures the PDFium dynamic library is present in the local cache.
248///
249/// - If `PDFIUM_LIB_PATH` is set (and the file exists), that path is used.
250/// - Otherwise, checks `pdfium_cache_dir()` for an existing library.
251/// - If absent, downloads the appropriate platform binary from GitHub
252///   and extracts it to the cache directory.
253///
254/// `on_progress` receives `(bytes_downloaded, total_size_option)` during
255/// the download.  Pass `None` to suppress progress callbacks.
256///
257/// # Thread safety
258///
259/// Safe to call from multiple threads simultaneously; the download happens
260/// only once per process lifetime.
261pub fn ensure_pdfium_library(
262    on_progress: Option<&dyn Fn(u64, Option<u64>)>,
263) -> Result<PathBuf, PdfiumAutoError> {
264    // Fast path: already resolved in this process.
265    if let Some(path) = RESOLVED_PATH.get() {
266        return Ok(path.clone());
267    }
268
269    let path = resolve_or_download(on_progress)?;
270
271    // Best-effort cache in the OnceLock (ignore race; both will succeed).
272    let _ = RESOLVED_PATH.set(path.clone());
273
274    Ok(path)
275}
276
277/// Binds to PDFium, downloading it first if necessary.
278///
279/// `on_progress` receives `(bytes_downloaded, total_bytes_option)` during
280/// the initial download.
281pub fn bind_pdfium(
282    on_progress: Option<&dyn Fn(u64, Option<u64>)>,
283) -> Result<Pdfium, PdfiumAutoError> {
284    let lib_path = ensure_pdfium_library(on_progress)?;
285    bind_pdfium_from_path(&lib_path)
286}
287
288/// Binds to PDFium without any progress output.
289///
290/// Downloads and caches on first call if required.
291pub fn bind_pdfium_silent() -> Result<Pdfium, PdfiumAutoError> {
292    bind_pdfium(None)
293}
294
295/// Binds to a PDFium library at an explicit `path`.
296///
297/// Does not interact with the download / cache layer.
298pub fn bind_pdfium_from_path(path: &Path) -> Result<Pdfium, PdfiumAutoError> {
299    Pdfium::bind_to_library(path)
300        .map(Pdfium::new)
301        .map_err(|e| PdfiumAutoError::Bind {
302            path: path.to_path_buf(),
303            reason: e.to_string(),
304        })
305}
306
307// ── Bundled feature ───────────────────────────────────────────────────────────
308//
309// When compiled with `--features bundled` (and `PDFIUM_BUNDLE_LIB` set at
310// build time), the pdfium shared library bytes are embedded directly in the
311// binary via `include_bytes!`.  At first use the bytes are written to the
312// standard cache directory and loaded from there.  Subsequent runs reuse
313// the cached copy and skip the write.
314//
315// Build workflow:
316//
317//   # 1. Download the platform archive from bblanchon/pdfium-binaries and
318//   #    extract the shared library to a local path.
//   curl -L https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F7690/pdfium-mac-arm64.tgz \
//     | tar xz
321//
322//   # 2. Build with the bundled feature enabled, pointing PDFIUM_BUNDLE_LIB
323//   #    at the extracted library.
324//   PDFIUM_BUNDLE_LIB=./lib/libpdfium.dylib \
325//     cargo build --release --features pdfium-auto/bundled
326//
327// ─────────────────────────────────────────────────────────────────────────────
328
/// Holds the PDFium library bytes embedded at compile time; only compiled
/// when the `bundled` feature is enabled.
#[cfg(feature = "bundled")]
mod bundled_lib {
    // `bundled.rs` is generated by build.rs into `OUT_DIR` and defines:
    //   pub static PDFIUM_BYTES: &[u8] = include_bytes!("bundled_pdfium_lib");
    // It is spliced in textually so `PDFIUM_BYTES` becomes an item of this
    // module.
    include!(concat!(env!("OUT_DIR"), "/bundled.rs"));
}
335
/// Ensures the embedded PDFium library is extracted to the local cache and
/// returns its on-disk path.
///
/// This function is only available when the crate is compiled with the
/// `bundled` feature.  The shared library bytes are embedded in the binary
/// at compile time (via `PDFIUM_BUNDLE_LIB`); on first call they are written
/// to `pdfium_cache_dir()` so that the OS can load them.  Subsequent calls
/// in the same process return the memoised path without any I/O.
///
/// The bytes are first written to a staging file next to the destination and
/// then renamed into place, so an interrupted write never leaves a truncated
/// library that a later run would mistake for a valid cached copy.
///
/// # Errors
///
/// Returns [`PdfiumAutoError::CacheDir`] if the cache directory cannot be
/// created or the file permissions cannot be adjusted, or
/// [`PdfiumAutoError::Extract`] if writing the library fails.
#[cfg(feature = "bundled")]
pub fn ensure_pdfium_bundled() -> Result<PathBuf, PdfiumAutoError> {
    // Fast path: already resolved in this process.
    if let Some(path) = RESOLVED_PATH.get() {
        return Ok(path.clone());
    }

    let info = detect_platform()?;
    let cache_dir = pdfium_cache_dir();
    let lib_path = cache_dir.join(info.lib_name);

    // Write the embedded bytes only when the file is absent.
    if !lib_path.exists() {
        std::fs::create_dir_all(&cache_dir).map_err(PdfiumAutoError::CacheDir)?;

        // Per-process staging name so concurrent processes do not clobber
        // each other's partially written file.
        let staging = cache_dir.join(format!("{}.{}.partial", info.lib_name, std::process::id()));
        let write_error = |e: std::io::Error| {
            PdfiumAutoError::Extract(format!(
                "Failed to write bundled pdfium to {}: {}",
                lib_path.display(),
                e
            ))
        };

        let install = || -> Result<(), PdfiumAutoError> {
            std::fs::write(&staging, bundled_lib::PDFIUM_BYTES).map_err(write_error)?;

            // On Unix, ensure the shared library is executable so the dynamic
            // linker accepts it.  Done before the rename so the final path
            // never exists with the wrong mode.
            #[cfg(unix)]
            {
                use std::os::unix::fs::PermissionsExt;
                let mut perms = std::fs::metadata(&staging)
                    .map_err(PdfiumAutoError::CacheDir)?
                    .permissions();
                perms.set_mode(perms.mode() | 0o755);
                std::fs::set_permissions(&staging, perms).map_err(PdfiumAutoError::CacheDir)?;
            }

            // Publish the fully written file under its final name.
            std::fs::rename(&staging, &lib_path).map_err(write_error)
        };

        if let Err(err) = install() {
            // Best-effort cleanup; the original error is what matters.
            let _ = std::fs::remove_file(&staging);
            return Err(err);
        }
    }

    let _ = RESOLVED_PATH.set(lib_path.clone());
    Ok(lib_path)
}
387
/// Loads the PDFium library that was embedded into this binary at compile
/// time.
///
/// The embedded bytes are materialised in the local cache directory on first
/// use (see [`ensure_pdfium_bundled`]); no network access is involved.
///
/// Only available when the crate is compiled with the `bundled` feature.
#[cfg(feature = "bundled")]
pub fn bind_bundled() -> Result<Pdfium, PdfiumAutoError> {
    ensure_pdfium_bundled().and_then(|library| bind_pdfium_from_path(&library))
}
400
401// ── Internal helpers ─────────────────────────────────────────────────────────
402
403fn resolve_or_download(
404    on_progress: Option<&dyn Fn(u64, Option<u64>)>,
405) -> Result<PathBuf, PdfiumAutoError> {
406    // 1. Environment variable override.
407    if let Ok(env_path) = std::env::var("PDFIUM_LIB_PATH") {
408        let p = PathBuf::from(env_path);
409        if p.exists() {
410            return Ok(p);
411        }
412        // Fall through: env var set but file missing → still auto-download.
413        eprintln!(
414            "pdfium-auto: PDFIUM_LIB_PATH '{}' not found; downloading …",
415            p.display()
416        );
417    }
418
419    let info = detect_platform()?;
420    let cache_dir = pdfium_cache_dir();
421    let lib_path = cache_dir.join(info.lib_name);
422
423    // 2. Already cached on disk.
424    if lib_path.exists() {
425        return Ok(lib_path);
426    }
427
428    // 3. Download and extract.
429    let url = format!(
430        "{}/chromium%2F{}/{}",
431        BASE_URL, PDFIUM_VERSION, info.archive_name
432    );
433
434    std::fs::create_dir_all(&cache_dir).map_err(PdfiumAutoError::CacheDir)?;
435
436    let archive_bytes = download_bytes(&url, on_progress)?;
437    extract_library(&archive_bytes, info.lib_path_in_archive, &lib_path)?;
438
439    Ok(lib_path)
440}
441
442/// Streams a URL into a `Vec<u8>`, calling `on_progress` every 64 KiB.
443fn download_bytes(
444    url: &str,
445    on_progress: Option<&dyn Fn(u64, Option<u64>)>,
446) -> Result<Vec<u8>, PdfiumAutoError> {
447    let client = reqwest::blocking::Client::builder()
448        .user_agent(concat!("pdfium-auto/", env!("CARGO_PKG_VERSION")))
449        .redirect(reqwest::redirect::Policy::limited(5))
450        .build()
451        .map_err(|e| PdfiumAutoError::Download(e.to_string()))?;
452
453    let response = client
454        .get(url)
455        .send()
456        .map_err(|e| PdfiumAutoError::Download(format!("GET {url}: {e}")))?;
457
458    if !response.status().is_success() {
459        return Err(PdfiumAutoError::Download(format!(
460            "HTTP {} for {url}",
461            response.status()
462        )));
463    }
464
465    let total = response.content_length();
466    let capacity = total.unwrap_or(35 * 1024 * 1024) as usize;
467    let mut buf = Vec::with_capacity(capacity);
468
469    let mut stream = response;
470    let mut chunk = vec![0u8; 64 * 1024]; // 64 KiB
471    let mut downloaded: u64 = 0;
472
473    loop {
474        match stream.read(&mut chunk) {
475            Ok(0) => break,
476            Ok(n) => {
477                buf.extend_from_slice(&chunk[..n]);
478                downloaded += n as u64;
479                if let Some(cb) = on_progress {
480                    cb(downloaded, total);
481                }
482            }
483            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
484            Err(e) => {
485                return Err(PdfiumAutoError::Download(format!("Read error: {e}")));
486            }
487        }
488    }
489
490    Ok(buf)
491}
492
493/// Extracts a single file from a gzipped tar archive into `dest_path`.
494fn extract_library(
495    archive_bytes: &[u8],
496    lib_path_in_archive: &str,
497    dest_path: &Path,
498) -> Result<(), PdfiumAutoError> {
499    use flate2::read::GzDecoder;
500    use tar::Archive;
501
502    let gz = GzDecoder::new(archive_bytes);
503    let mut archive = Archive::new(gz);
504
505    for entry in archive
506        .entries()
507        .map_err(|e| PdfiumAutoError::Extract(e.to_string()))?
508    {
509        let mut entry = entry.map_err(|e| PdfiumAutoError::Extract(e.to_string()))?;
510        let entry_path = entry
511            .path()
512            .map_err(|e| PdfiumAutoError::Extract(e.to_string()))?;
513
514        let entry_str = entry_path.to_string_lossy();
515        if entry_str == lib_path_in_archive {
516            entry
517                .unpack(dest_path)
518                .map_err(|e| PdfiumAutoError::Extract(format!("Unpack failed: {e}")))?;
519            return Ok(());
520        }
521    }
522
523    Err(PdfiumAutoError::Extract(format!(
524        "Library '{}' not found in archive",
525        lib_path_in_archive
526    )))
527}
528
529// ── Tests ─────────────────────────────────────────────────────────────────────
530
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex;

    const CACHE_DIR_VAR: &str = "PDFIUM_AUTO_CACHE_DIR";

    /// Serialises tests that depend on `PDFIUM_AUTO_CACHE_DIR`.  The test
    /// harness runs tests on parallel threads and the environment is
    /// process-wide, so unsynchronised access makes the results flaky.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Runs `body` with `PDFIUM_AUTO_CACHE_DIR` set to `value` (or unset for
    /// `None`) and restores the previous state afterwards.
    fn with_cache_dir_env<T>(value: Option<&str>, body: impl FnOnce() -> T) -> T {
        // The lock guards no data, so a poisoned lock is safe to reuse.
        let _guard = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        let previous = std::env::var_os(CACHE_DIR_VAR);

        match value {
            Some(v) => std::env::set_var(CACHE_DIR_VAR, v),
            None => std::env::remove_var(CACHE_DIR_VAR),
        }
        let out = body();
        match previous {
            Some(v) => std::env::set_var(CACHE_DIR_VAR, v),
            None => std::env::remove_var(CACHE_DIR_VAR),
        }

        out
    }

    #[test]
    fn detect_platform_is_supported() {
        // Verify the current platform is recognised.
        detect_platform().expect("current platform should be supported");
    }

    #[test]
    fn cache_dir_is_deterministic() {
        // Unset the override so the default `pdf2md` location is exercised
        // even when the variable is set in the surrounding environment.
        let (d1, d2) = with_cache_dir_env(None, || (pdfium_cache_dir(), pdfium_cache_dir()));
        assert_eq!(d1, d2);
        assert!(d1.to_str().unwrap().contains("pdf2md"));
        assert!(d1.to_str().unwrap().contains(PDFIUM_VERSION));
    }

    #[test]
    fn cache_dir_override_via_env() {
        let d = with_cache_dir_env(Some("/tmp/test_pdf2md_override"), pdfium_cache_dir);
        assert!(d.starts_with("/tmp/test_pdf2md_override"));
        assert!(d.to_str().unwrap().contains(PDFIUM_VERSION));
    }

    #[test]
    fn platform_info_fields_nonempty() {
        let info = detect_platform().unwrap();
        assert!(!info.archive_name.is_empty());
        assert!(!info.lib_path_in_archive.is_empty());
        assert!(!info.lib_name.is_empty());
    }
}