use crate::errors::PdfMetadataError;
use std::path::Path;
#[cfg(feature = "media-metadata")]
use std::path::PathBuf;
#[cfg(feature = "media-metadata")]
use std::sync::OnceLock;
#[cfg(feature = "media-metadata")]
use tokio::sync::Semaphore;
/// Document-level metadata collected from a PDF file by [`probe_pdf`].
///
/// Every textual field is `None` when the corresponding entry is missing
/// from the document's Info dictionary or cannot be decoded.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
/// Value of the `Title` entry of the Info dictionary.
pub title: Option<String>,
/// Value of the `Author` entry of the Info dictionary.
pub author: Option<String>,
/// Value of the `Subject` entry of the Info dictionary.
pub subject: Option<String>,
/// Value of the `Keywords` entry, split on commas with each keyword
/// trimmed and empty keywords dropped.
pub keywords: Option<Vec<String>>,
/// Number of pages reported by the parsed document.
pub num_pages: u32,
}
/// Reads the page count and Info-dictionary metadata of the PDF at `path`.
///
/// The Info dictionary is located through the trailer's `Info` entry, which
/// must be an indirect reference resolvable in the document. When it cannot
/// be found, all textual fields are `None` and only `num_pages` is filled in.
///
/// # Errors
///
/// Returns [`PdfMetadataError::OpenFailed`] (carrying the path and the
/// underlying lopdf error) when the file cannot be loaded.
pub fn probe_pdf(path: &Path) -> Result<PdfMetadata, PdfMetadataError> {
    let doc = match lopdf::Document::load(path) {
        Ok(doc) => doc,
        Err(source) => {
            return Err(PdfMetadataError::OpenFailed {
                path: path.to_path_buf(),
                source,
            });
        }
    };

    let num_pages = doc.get_pages().len() as u32;

    let info = match doc.trailer.get(b"Info") {
        Ok(entry) => entry
            .as_reference()
            .ok()
            .and_then(|info_id| doc.get_dictionary(info_id).ok()),
        Err(_) => None,
    };

    // Looks up one text entry of the Info dictionary; yields `None` for every
    // key when the dictionary itself is unavailable.
    let text_entry = |key: &[u8]| info.and_then(|dict| get_string_from_dict(&doc, dict, key));

    let title = text_entry(b"Title");
    let author = text_entry(b"Author");
    let subject = text_entry(b"Subject");
    let keywords = text_entry(b"Keywords").map(|raw| {
        raw.split(',')
            .map(str::trim)
            .filter(|keyword| !keyword.is_empty())
            .map(str::to_string)
            .collect()
    });

    Ok(PdfMetadata {
        title,
        author,
        subject,
        keywords,
        num_pages,
    })
}
/// Fetches `key` from `dict` and converts the stored object to text.
///
/// Returns `None` when the key is absent or when the value cannot be turned
/// into a string by `pdf_object_to_string`.
fn get_string_from_dict(
    doc: &lopdf::Document,
    dict: &lopdf::Dictionary,
    key: &[u8],
) -> Option<String> {
    dict.get(key)
        .ok()
        .and_then(|value| pdf_object_to_string(doc, value))
}
/// Converts a PDF object to decoded text, following indirect references.
///
/// * `Object::String` values are decoded with `decode_pdf_string`.
/// * `Object::Reference` values are looked up in `doc` and resolved again.
/// * Any other object kind yields `None`.
///
/// Reference chains are followed iteratively and capped at
/// `MAX_REFERENCE_HOPS`, so a cyclic or absurdly long chain in a malformed
/// file returns `None` instead of recursing without bound.
/// NOTE(review): some lopdf versions may already cap reference chains inside
/// `get_object`; the cap here applies either way.
fn pdf_object_to_string(doc: &lopdf::Document, obj: &lopdf::Object) -> Option<String> {
    /// Maximum number of indirect references followed before giving up.
    const MAX_REFERENCE_HOPS: usize = 32;

    let mut current = obj;
    // `0..=MAX` allows MAX hops plus the final look at the resolved object.
    for _ in 0..=MAX_REFERENCE_HOPS {
        match current {
            lopdf::Object::String(bytes, _) => return decode_pdf_string(bytes),
            lopdf::Object::Reference(id) => current = doc.get_object(*id).ok()?,
            _ => return None,
        }
    }
    None
}
/// Decodes the raw bytes of a PDF text string into a trimmed `String`.
///
/// Encodings are tried in this order:
///
/// 1. UTF-16BE, when the bytes start with the `FE FF` byte-order mark. A
///    trailing odd byte is ignored; invalid UTF-16 yields `None`.
/// 2. UTF-8. A leading byte-order mark (`EF BB BF`, permitted for PDF 2.0
///    text strings) is removed so it does not leak into the result as an
///    invisible U+FEFF character.
/// 3. Otherwise every byte is mapped to the Unicode code point of the same
///    value (Latin-1 style fallback).
///
/// Returns `None` for empty input or when only whitespace remains after
/// trimming.
fn decode_pdf_string(bytes: &[u8]) -> Option<String> {
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];

    if bytes.is_empty() {
        return None;
    }

    let decoded = if let Some(payload) = bytes.strip_prefix(UTF16_BE_BOM) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16(&units).ok()?
    } else {
        match std::str::from_utf8(bytes) {
            // The UTF-8 BOM decodes to U+FEFF, which `trim` does not remove.
            Ok(text) => text.strip_prefix('\u{FEFF}').unwrap_or(text).to_owned(),
            Err(_) => bytes.iter().map(|&byte| char::from(byte)).collect(),
        }
    };

    let trimmed = decoded.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_owned())
}
/// Recognises a request path of the form `<pdf path>.cover.jpg`.
///
/// Returns the PDF path (the input without the `.cover.jpg` suffix) when that
/// path ends in `.pdf`, compared case-insensitively; otherwise `None`.
pub fn parse_pdf_cover_request(path: &str) -> Option<&str> {
    const COVER_SUFFIX: &str = ".cover.jpg";
    const PDF_EXTENSION: &[u8] = b".pdf";

    let pdf_path = path.strip_suffix(COVER_SUFFIX)?;

    // Compare the trailing bytes directly: the extension is pure ASCII, so a
    // byte-wise case-insensitive match needs no lowercase copy and cannot
    // split a multi-byte character.
    let raw = pdf_path.as_bytes();
    let tail_start = raw.len().checked_sub(PDF_EXTENSION.len())?;
    if raw[tail_start..].eq_ignore_ascii_case(PDF_EXTENSION) {
        Some(pdf_path)
    } else {
        None
    }
}
/// Lazily created semaphore that limits how many async cover extractions may
/// proceed at once.
#[cfg(feature = "media-metadata")]
static PDFIUM_SEMAPHORE: OnceLock<Semaphore> = OnceLock::new();

/// Returns the shared pdfium semaphore, creating it on first use.
#[cfg(feature = "media-metadata")]
fn pdfium_semaphore() -> &'static Semaphore {
    /// Number of permits: a single async caller at a time.
    const PERMITS: usize = 1;

    PDFIUM_SEMAPHORE.get_or_init(|| Semaphore::new(PERMITS))
}
/// Lazily created mutex held by `extract_cover_sync` for the whole duration
/// of a render, so pdfium is only used by one thread at a time.
#[cfg(feature = "media-metadata")]
static PDFIUM_MUTEX: OnceLock<std::sync::Mutex<()>> = OnceLock::new();

/// Returns the shared pdfium mutex, creating it on first use.
#[cfg(feature = "media-metadata")]
fn pdfium_mutex() -> &'static std::sync::Mutex<()> {
    // `Mutex<()>::default()` is the same as `Mutex::new(())`.
    PDFIUM_MUTEX.get_or_init(Default::default)
}
/// Renders the first page of the PDF at `path` to JPEG bytes without blocking
/// the async executor.
///
/// A permit from the pdfium semaphore is awaited first, then the actual work
/// (`extract_cover_sync`) runs on tokio's blocking thread pool. The permit is
/// held until this future finishes.
///
/// # Errors
///
/// Returns [`PdfMetadataError::RenderFailed`] when the semaphore cannot be
/// acquired or the blocking task cannot be joined; otherwise forwards the
/// result of `extract_cover_sync`.
#[cfg(feature = "media-metadata")]
pub async fn extract_cover_async(path: &Path) -> Result<Vec<u8>, PdfMetadataError> {
    let owned_path = path.to_path_buf();

    let _permit = match pdfium_semaphore().acquire().await {
        Ok(permit) => permit,
        Err(_) => {
            return Err(PdfMetadataError::RenderFailed(
                "Failed to acquire pdfium semaphore".to_string(),
            ));
        }
    };

    let joined = tokio::task::spawn_blocking(move || extract_cover_sync(&owned_path)).await;
    match joined {
        Ok(render_result) => render_result,
        Err(e) => Err(PdfMetadataError::RenderFailed(format!(
            "Task join error: {}",
            e
        ))),
    }
}
/// Renders the first page of the PDF at `path` and returns it as JPEG bytes.
///
/// Blocking counterpart of `extract_cover_async`; it simply forwards to
/// `extract_cover_sync` and returns its result unchanged.
#[cfg(feature = "media-metadata")]
pub fn extract_cover(path: &Path) -> Result<Vec<u8>, PdfMetadataError> {
extract_cover_sync(path)
}
/// Upper bound on the rendered cover width. `extract_cover_sync` scales down
/// any page whose reported width exceeds this value so that the render width
/// equals it; narrower pages are rendered at their reported width.
/// (Presumably compared against a width in PDF points — confirm against
/// pdfium-render.)
#[cfg(feature = "media-metadata")]
const PDF_COVER_MAX_WIDTH: f32 = 1200.0;
/// Lower bound applied to the *maximum height* passed to the renderer:
/// `extract_cover_sync` uses the larger of the scaled page height and this
/// value as the render height limit.
#[cfg(feature = "media-metadata")]
const PDF_COVER_MIN_HEIGHT: f32 = 1600.0;
/// Renders the first page of the PDF at `path` and encodes it as a JPEG.
///
/// The whole operation runs while holding the process-wide pdfium mutex, so
/// only one thread uses pdfium at a time.
///
/// Sizing: pages wider than `PDF_COVER_MAX_WIDTH` are scaled down to that
/// width; narrower pages keep their reported width. The render height is
/// limited to the larger of the scaled page height and
/// `PDF_COVER_MIN_HEIGHT`.
///
/// # Errors
///
/// * [`PdfMetadataError::RenderFailed`] — pdfium could not be loaded, the
///   document could not be opened, or rendering failed.
/// * [`PdfMetadataError::PasswordProtected`] — pdfium reported a password
///   error while opening the document.
/// * [`PdfMetadataError::NoPages`] — the first page could not be obtained.
/// * [`PdfMetadataError::EncodeFailed`] — JPEG encoding failed.
#[cfg(feature = "media-metadata")]
fn extract_cover_sync(path: &Path) -> Result<Vec<u8>, PdfMetadataError> {
    use image::codecs::jpeg::JpegEncoder;
    use pdfium_render::prelude::*;

    // The mutex guards no data (`Mutex<()>`), so a poisoned lock carries no
    // corrupted state. Clearing the poison keeps a single panic during one
    // render from disabling cover extraction for the rest of the process.
    let _guard = pdfium_mutex()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    let pdfium = create_pdfium_instance()?;
    let document = pdfium
        .load_pdf_from_file(path, None)
        .map_err(|e| map_pdfium_error(e, path.to_path_buf()))?;
    let page = document
        .pages()
        .first()
        .map_err(|_| PdfMetadataError::NoPages {
            path: path.to_path_buf(),
        })?;

    let page_width = page.width().value;
    let page_height = page.height().value;

    // Only ever scale down; small pages are not enlarged.
    let scale = if page_width > PDF_COVER_MAX_WIDTH {
        PDF_COVER_MAX_WIDTH / page_width
    } else {
        1.0
    };
    let render_width = (page_width * scale) as i32;
    let max_height = (page_height * scale).max(PDF_COVER_MIN_HEIGHT) as i32;

    let config = PdfRenderConfig::new()
        .set_target_width(render_width)
        .set_maximum_height(max_height);
    let bitmap = page
        .render_with_config(&config)
        .map_err(|e| PdfMetadataError::RenderFailed(format!("Render failed: {}", e)))?;
    let image = bitmap.as_image();

    let mut jpg_bytes = Vec::new();
    let encoder = JpegEncoder::new_with_quality(&mut jpg_bytes, crate::constants::JPEG_QUALITY);
    image
        .write_with_encoder(encoder)
        .map_err(|e| PdfMetadataError::EncodeFailed(format!("JPEG encode failed: {}", e)))?;
    Ok(jpg_bytes)
}
/// File name of the pdfium dynamic library on the target operating system:
/// `libpdfium.dylib` on macOS, `pdfium.dll` on Windows and `libpdfium.so`
/// everywhere else.
#[cfg(feature = "media-metadata")]
fn pdfium_lib_name() -> &'static str {
    match std::env::consts::OS {
        "macos" => "libpdfium.dylib",
        "windows" => "pdfium.dll",
        _ => "libpdfium.so",
    }
}
/// Builds the ordered list of locations where the pdfium library is searched
/// before falling back to the system library.
///
/// Order of candidates:
///
/// 1. The location named by the `PDFIUM_DYNAMIC_LIB_PATH` environment
///    variable. If it points at an existing file, that file is used as is;
///    otherwise it is treated as a directory and the platform library name is
///    appended. An empty value is ignored.
/// 2. The directory containing the current executable.
/// 3. A `lib` directory next to the current executable.
/// 4. On macOS only: a `Frameworks` directory beside the executable's parent
///    directory (presumably the app-bundle layout — confirm with packaging).
///
/// The paths are not checked for existence here.
#[cfg(feature = "media-metadata")]
fn pdfium_candidate_paths() -> Vec<PathBuf> {
    let lib_name = pdfium_lib_name();
    let mut candidates = Vec::new();

    // `var_os` keeps paths that are not valid UTF-8. An empty value is
    // skipped, because joining onto an empty path would yield a bare library
    // name resolved against the current working directory.
    if let Some(configured) =
        std::env::var_os("PDFIUM_DYNAMIC_LIB_PATH").filter(|value| !value.is_empty())
    {
        let configured = PathBuf::from(configured);
        if configured.is_file() {
            candidates.push(configured);
        } else {
            candidates.push(configured.join(lib_name));
        }
    }

    if let Ok(exe_path) = std::env::current_exe()
        && let Some(exe_dir) = exe_path.parent()
    {
        candidates.push(exe_dir.join(lib_name));
        candidates.push(exe_dir.join("lib").join(lib_name));
        if cfg!(target_os = "macos")
            && let Some(contents_dir) = exe_dir.parent()
        {
            candidates.push(contents_dir.join("Frameworks").join(lib_name));
        }
    }

    candidates
}
/// Creates a `Pdfium` instance bound to the first usable pdfium library.
///
/// Each path from `pdfium_candidate_paths` that exists on disk is tried in
/// order; a path that fails to bind is logged and skipped. If none works, the
/// system library is tried last.
///
/// # Errors
///
/// Returns [`PdfMetadataError::RenderFailed`] when neither a candidate path
/// nor the system library can be bound.
#[cfg(feature = "media-metadata")]
fn create_pdfium_instance() -> Result<pdfium_render::prelude::Pdfium, PdfMetadataError> {
    use pdfium_render::prelude::Pdfium;

    for candidate in pdfium_candidate_paths() {
        if !candidate.exists() {
            tracing::trace!("Pdfium not found at {:?}", candidate);
            continue;
        }

        tracing::debug!("Attempting to load pdfium from: {:?}", candidate);
        match Pdfium::bind_to_library(&candidate) {
            Ok(bindings) => {
                tracing::debug!("Successfully loaded pdfium from {:?}", candidate);
                return Ok(Pdfium::new(bindings));
            }
            Err(e) => {
                // Keep going: a later candidate or the system library may work.
                tracing::warn!("Failed to bind to pdfium at {:?}: {}", candidate, e);
            }
        }
    }

    tracing::debug!("Attempting to load pdfium from system library");
    let bindings = Pdfium::bind_to_system_library().map_err(|e| {
        tracing::warn!("Failed to load pdfium from system library: {}", e);
        PdfMetadataError::RenderFailed(format!(
            "Pdfium library not found. Install pdfium or set PDFIUM_DYNAMIC_LIB_PATH. Error: {}",
            e
        ))
    })?;
    tracing::debug!("Successfully loaded pdfium from system library");
    Ok(Pdfium::new(bindings))
}
/// Translates a pdfium error raised while opening `path` into a
/// [`PdfMetadataError`].
///
/// A pdfium password error becomes [`PdfMetadataError::PasswordProtected`];
/// every other error becomes [`PdfMetadataError::RenderFailed`] carrying the
/// pdfium error text.
#[cfg(feature = "media-metadata")]
fn map_pdfium_error(e: pdfium_render::prelude::PdfiumError, path: PathBuf) -> PdfMetadataError {
    use pdfium_render::prelude::{PdfiumError, PdfiumInternalError};

    let needs_password = matches!(
        e,
        PdfiumError::PdfiumLibraryInternalError(PdfiumInternalError::PasswordError)
    );
    if needs_password {
        return PdfMetadataError::PasswordProtected { path };
    }
    PdfMetadataError::RenderFailed(format!("Failed to load PDF: {}", e))
}
/// Renders the cover of the PDF at `path` and writes it next to the PDF as
/// `<path>.cover.jpg`, replacing any existing file of that name.
///
/// Returns the path of the sidecar file that was written.
///
/// # Errors
///
/// Forwards any error from `extract_cover`, and converts an I/O error from
/// writing the sidecar file into a [`PdfMetadataError`].
#[cfg(feature = "media-metadata")]
pub fn save_cover(path: &Path) -> Result<PathBuf, PdfMetadataError> {
    let jpg_bytes = extract_cover(path)?;

    // Append the suffix at `OsString` level. Going through `Path::display()`
    // would replace non-UTF-8 bytes with U+FFFD and write the sidecar under a
    // different name than the PDF it belongs to.
    let mut sidecar_name = path.as_os_str().to_os_string();
    sidecar_name.push(".cover.jpg");
    let sidecar_path = PathBuf::from(sidecar_name);

    std::fs::write(&sidecar_path, jpg_bytes)?;
    Ok(sidecar_path)
}
/// Summary returned by [`extract_pdf_covers_recursive`].
#[cfg(feature = "media-metadata")]
#[derive(Debug, Default)]
pub struct ExtractCoversResult {
/// Number of PDFs whose cover sidecar was written successfully.
pub success_count: usize,
/// Number of PDFs for which cover extraction or saving failed.
pub failure_count: usize,
/// One entry per failed PDF: its path and the error rendered as text.
pub failures: Vec<(PathBuf, String)>,
}
/// Walks `dir` recursively (following symbolic links) and writes a cover
/// sidecar for every regular file whose extension is `pdf`, compared
/// case-insensitively.
///
/// `progress_callback` is invoked once per PDF with the PDF path and either
/// `Some(sidecar path)` on success or `None` on failure. Directory entries
/// that cannot be read are skipped silently.
///
/// Returns the success/failure counts together with the collected failures.
#[cfg(feature = "media-metadata")]
pub fn extract_pdf_covers_recursive<F>(dir: &Path, mut progress_callback: F) -> ExtractCoversResult
where
    F: FnMut(&Path, Option<&Path>),
{
    use walkdir::WalkDir;

    let has_pdf_extension = |candidate: &Path| {
        candidate
            .extension()
            .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
    };

    let mut summary = ExtractCoversResult::default();
    let readable_entries = WalkDir::new(dir)
        .follow_links(true)
        .into_iter()
        .filter_map(Result::ok);

    for entry in readable_entries {
        let pdf_path = entry.path();
        if !(pdf_path.is_file() && has_pdf_extension(pdf_path)) {
            continue;
        }

        match save_cover(pdf_path) {
            Ok(sidecar_path) => {
                progress_callback(pdf_path, Some(&sidecar_path));
                summary.success_count += 1;
            }
            Err(err) => {
                progress_callback(pdf_path, None);
                summary
                    .failures
                    .push((pdf_path.to_path_buf(), err.to_string()));
                summary.failure_count += 1;
            }
        }
    }

    summary
}
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;
    use std::path::PathBuf;

    /// Directory holding the PDF fixtures used by these tests.
    fn test_pdfs_dir() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/pdfs")
    }

    /// Heuristic used to skip rendering tests on machines without pdfium:
    /// true when the error text looks like a library-loading failure.
    #[cfg(feature = "media-metadata")]
    fn pdfium_unavailable(message: &str) -> bool {
        ["library", "not found", "bind"]
            .iter()
            .any(|needle| message.contains(needle))
    }

    // ---- parse_pdf_cover_request -------------------------------------------

    #[test]
    fn test_parse_pdf_cover_request_valid() {
        assert_eq!(
            parse_pdf_cover_request("docs/report.pdf.cover.jpg"),
            Some("docs/report.pdf")
        );
        assert_eq!(
            parse_pdf_cover_request("docs/REPORT.PDF.cover.jpg"),
            Some("docs/REPORT.PDF")
        );
        assert_eq!(parse_pdf_cover_request("a.pdf.cover.jpg"), Some("a.pdf"));
        assert_eq!(
            parse_pdf_cover_request("some dir/my file.pdf.cover.jpg"),
            Some("some dir/my file.pdf")
        );
    }

    #[test]
    fn test_parse_pdf_cover_request_invalid() {
        assert_eq!(parse_pdf_cover_request("docs/report.pdf"), None);
        assert_eq!(parse_pdf_cover_request("docs/report.png"), None);
        assert_eq!(parse_pdf_cover_request("docs/report.txt.cover.jpg"), None);
        assert_eq!(parse_pdf_cover_request("docs/report.cover.jpg"), None);
        assert_eq!(parse_pdf_cover_request("docs/report.pdf.cover.png"), None);
        assert_eq!(parse_pdf_cover_request(""), None);
    }

    // ---- extract_cover -----------------------------------------------------

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_extract_cover_success() {
        let path = test_pdfs_dir().join("DGA.pdf");
        let result = extract_cover(&path);
        match result {
            Ok(bytes) => {
                assert!(bytes.len() > 2, "JPEG should have at least 2 bytes");
                assert_eq!(&bytes[0..2], &[0xFF, 0xD8], "Should be valid JPEG");
            }
            Err(e) => {
                if pdfium_unavailable(&e.to_string()) {
                    eprintln!("Skipping test: pdfium library not available: {}", e);
                    return;
                }
                panic!("Unexpected error: {}", e);
            }
        }
    }

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_extract_cover_nonexistent() {
        let path = test_pdfs_dir().join("nonexistent.pdf");
        let result = extract_cover(&path);
        assert!(result.is_err(), "Should fail for nonexistent file");
    }

    // ---- probe_pdf ---------------------------------------------------------

    #[test]
    fn test_probe_dga_pdf() {
        let path = test_pdfs_dir().join("DGA.pdf");
        let meta = probe_pdf(&path).expect("Should parse DGA.pdf");
        assert!(meta.title.is_some(), "DGA.pdf should have title");
        assert!(
            meta.title.as_ref().unwrap().contains("Dietary Guidelines"),
            "Title should contain 'Dietary Guidelines', got: {:?}",
            meta.title
        );
        assert!(meta.num_pages > 0, "Should have pages");
    }

    #[test]
    fn test_probe_united_nations_charter_pdf() {
        let path = test_pdfs_dir().join("united_nations_charter.pdf");
        let meta = probe_pdf(&path).expect("Should parse united_nations_charter.pdf");
        assert!(meta.title.is_some(), "Should have title");
        assert!(
            meta.title
                .as_ref()
                .unwrap()
                .contains("United Nations Charter"),
            "Title should contain 'United Nations Charter', got: {:?}",
            meta.title
        );
        assert_eq!(
            meta.author.as_deref(),
            Some("Richard Mazula"),
            "Author should be Richard Mazula"
        );
        assert_eq!(
            meta.subject.as_deref(),
            Some("freedom"),
            "Subject should be freedom"
        );
    }

    #[test]
    fn test_probe_f1099_pdf() {
        let path = test_pdfs_dir().join("f1099msc--dft.pdf");
        let meta = probe_pdf(&path).expect("Should parse f1099msc--dft.pdf");
        assert!(meta.title.is_some(), "Should have title");
        assert!(
            meta.title.as_ref().unwrap().contains("1099"),
            "Title should contain '1099', got: {:?}",
            meta.title
        );
        assert!(meta.author.is_some(), "Should have author");
        assert!(meta.subject.is_some(), "Should have subject");
    }

    #[test]
    fn test_probe_nonexistent_pdf() {
        let path = test_pdfs_dir().join("nonexistent.pdf");
        let result = probe_pdf(&path);
        assert!(result.is_err(), "Should fail for nonexistent file");
    }

    // ---- decode_pdf_string -------------------------------------------------

    #[test]
    fn test_decode_pdf_string_empty() {
        assert_eq!(decode_pdf_string(&[]), None);
    }

    #[test]
    fn test_decode_pdf_string_utf8() {
        let bytes = b"Hello World";
        assert_eq!(decode_pdf_string(bytes), Some("Hello World".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_utf16be() {
        let bytes = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
        assert_eq!(decode_pdf_string(&bytes), Some("Hi".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_trims_whitespace() {
        let bytes = b"  Hello  ";
        assert_eq!(decode_pdf_string(bytes), Some("Hello".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_filters_empty() {
        let bytes = b"   ";
        assert_eq!(decode_pdf_string(bytes), None);
    }

    // ---- save_cover --------------------------------------------------------

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_save_cover_creates_sidecar_file() {
        let test_dir = tempfile::tempdir().expect("create temp dir");
        let pdf_path = test_pdfs_dir().join("DGA.pdf");
        let temp_pdf = test_dir.path().join("test.pdf");
        std::fs::copy(&pdf_path, &temp_pdf).expect("copy pdf");
        let result = super::save_cover(&temp_pdf);
        match result {
            Ok(sidecar_path) => {
                assert_eq!(
                    sidecar_path,
                    test_dir.path().join("test.pdf.cover.jpg"),
                    "Sidecar path should be {{pdf}}.cover.jpg"
                );
                assert!(sidecar_path.exists(), "Sidecar file should exist");
                let bytes = std::fs::read(&sidecar_path).expect("read sidecar");
                assert!(bytes.len() > 2, "JPEG should have at least 2 bytes");
                assert_eq!(&bytes[0..2], &[0xFF, 0xD8], "Should be valid JPEG");
            }
            Err(e) => {
                if pdfium_unavailable(&e.to_string()) {
                    eprintln!("Skipping test: pdfium library not available: {}", e);
                    return;
                }
                panic!("Unexpected error: {}", e);
            }
        }
    }

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_save_cover_nonexistent_file() {
        let path = PathBuf::from("/nonexistent/path/to/file.pdf");
        let result = super::save_cover(&path);
        assert!(result.is_err(), "Should fail for nonexistent file");
    }

    // ---- extract_pdf_covers_recursive --------------------------------------

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_extract_pdf_covers_recursive_empty_dir() {
        let test_dir = tempfile::tempdir().expect("create temp dir");
        let result = super::extract_pdf_covers_recursive(test_dir.path(), |_, _| {});
        assert_eq!(result.success_count, 0);
        assert_eq!(result.failure_count, 0);
        assert!(result.failures.is_empty());
    }

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_extract_pdf_covers_recursive_with_pdfs() {
        let test_dir = tempfile::tempdir().expect("create temp dir");
        let pdf_path = test_pdfs_dir().join("DGA.pdf");
        let temp_pdf = test_dir.path().join("test.pdf");
        std::fs::copy(&pdf_path, &temp_pdf).expect("copy pdf");
        let subdir = test_dir.path().join("subdir");
        std::fs::create_dir(&subdir).expect("create subdir");
        let temp_pdf2 = subdir.join("test2.pdf");
        std::fs::copy(&pdf_path, &temp_pdf2).expect("copy pdf to subdir");
        let mut progress_calls = 0;
        let result = super::extract_pdf_covers_recursive(test_dir.path(), |_, sidecar| {
            if sidecar.is_some() {
                progress_calls += 1;
            }
        });
        // Nothing succeeded and the first failure looks like a missing
        // library: treat the environment as lacking pdfium and skip.
        if result.success_count == 0 && result.failure_count > 0 {
            let first_error = &result.failures[0].1;
            if pdfium_unavailable(first_error) {
                eprintln!("Skipping test: pdfium library not available");
                return;
            }
        }
        assert_eq!(result.success_count, 2, "Should process both PDFs");
        assert_eq!(result.failure_count, 0, "No failures expected");
        assert_eq!(
            progress_calls, 2,
            "Progress callback should be called twice"
        );
        assert!(
            test_dir.path().join("test.pdf.cover.jpg").exists(),
            "First sidecar should exist"
        );
        assert!(
            subdir.join("test2.pdf.cover.jpg").exists(),
            "Second sidecar should exist"
        );
    }

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_extract_pdf_covers_recursive_case_insensitive() {
        let test_dir = tempfile::tempdir().expect("create temp dir");
        let pdf_path = test_pdfs_dir().join("DGA.pdf");
        let temp_pdf = test_dir.path().join("test.PDF");
        std::fs::copy(&pdf_path, &temp_pdf).expect("copy pdf");
        let result = super::extract_pdf_covers_recursive(test_dir.path(), |_, _| {});
        if result.success_count == 0 && result.failure_count > 0 {
            let first_error = &result.failures[0].1;
            if pdfium_unavailable(first_error) {
                eprintln!("Skipping test: pdfium library not available");
                return;
            }
        }
        assert_eq!(
            result.success_count, 1,
            "Should process PDF with uppercase extension"
        );
    }

    #[cfg(feature = "media-metadata")]
    #[test]
    fn test_extract_covers_result_default() {
        let result = super::ExtractCoversResult::default();
        assert_eq!(result.success_count, 0);
        assert_eq!(result.failure_count, 0);
        assert!(result.failures.is_empty());
    }

    // ---- property tests for parse_pdf_cover_request ------------------------

    /// Strategy producing a single directory name.
    fn path_component_strategy() -> impl Strategy<Value = String> {
        "[a-zA-Z0-9_-]{1,15}"
    }

    /// Strategy producing a file stem without any dots.
    fn pdf_basename_strategy() -> impl Strategy<Value = String> {
        "[a-zA-Z0-9_-]{1,20}"
    }

    proptest! {
        #[test]
        fn prop_parse_pdf_cover_deterministic(path in ".*") {
            let result1 = parse_pdf_cover_request(&path);
            let result2 = parse_pdf_cover_request(&path);
            prop_assert_eq!(result1, result2);
        }

        #[test]
        fn prop_valid_pdf_cover_request_returns_base(
            dir in proptest::option::of(path_component_strategy()),
            name in pdf_basename_strategy(),
            ext_case in prop_oneof!["pdf", "PDF", "Pdf"]
        ) {
            let base = match dir {
                Some(d) => format!("{}/{}.{}", d, name, ext_case),
                None => format!("{}.{}", name, ext_case),
            };
            let full_path = format!("{}.cover.jpg", base);
            let result = parse_pdf_cover_request(&full_path);
            prop_assert!(result.is_some(), "Valid PDF cover request should return Some");
            prop_assert_eq!(result.unwrap(), base.as_str());
        }

        #[test]
        fn prop_non_cover_suffix_returns_none(
            path in "[a-zA-Z0-9_/.\\-]{1,50}",
            suffix in prop_oneof![".png", ".jpg", ".pdf", ".cover.png", ""]
        ) {
            let full_path = format!("{}{}", path, suffix);
            // The random path may itself end in `.cover.jpg`; such inputs are
            // outside the scope of this property.
            if full_path.ends_with(".cover.jpg") {
                return Ok(());
            }
            let result = parse_pdf_cover_request(&full_path);
            prop_assert!(result.is_none(), "Non .cover.jpg paths should return None");
        }

        #[test]
        fn prop_non_pdf_cover_returns_none(
            name in pdf_basename_strategy(),
            ext in prop_oneof!["txt", "doc", "jpg", "png", ""]
        ) {
            let path = if ext.is_empty() {
                format!("{}.cover.jpg", name)
            } else {
                format!("{}.{}.cover.jpg", name, ext)
            };
            let result = parse_pdf_cover_request(&path);
            prop_assert!(result.is_none(), "Non-PDF .cover.jpg should return None: {}", path);
        }

        #[test]
        fn prop_base_path_plus_suffix_equals_input(
            dir in proptest::option::of(path_component_strategy()),
            name in pdf_basename_strategy()
        ) {
            let base = match dir {
                Some(d) => format!("{}/{}.pdf", d, name),
                None => format!("{}.pdf", name),
            };
            let full_path = format!("{}.cover.jpg", base);
            if let Some(result_base) = parse_pdf_cover_request(&full_path) {
                let reconstructed = format!("{}.cover.jpg", result_base);
                prop_assert_eq!(reconstructed, full_path);
            }
        }

        #[test]
        fn prop_pdf_extension_case_insensitive(
            name in pdf_basename_strategy()
        ) {
            let variations = [
                format!("{}.pdf.cover.jpg", name),
                format!("{}.PDF.cover.jpg", name),
                format!("{}.Pdf.cover.jpg", name),
                format!("{}.pDf.cover.jpg", name),
            ];
            for path in &variations {
                let result = parse_pdf_cover_request(path);
                prop_assert!(result.is_some(), "Should parse: {}", path);
            }
        }
    }
}