use lopdf::{dictionary, Document, Object, ObjectId};
use std::collections::BTreeMap;
pub use lopdf;
/// Errors returned by the PDF helpers in this file.
#[derive(Debug, thiserror::Error)]
pub enum PdfError {
/// An error reported by `lopdf`; produced automatically by `?` through `#[from]`.
/// NOTE(review): the message says "parse", but any `lopdf::Error` converts into
/// this variant — confirm whether non-parse failures can reach it.
#[error("Failed to parse PDF: {0}")]
Parse(#[from] lopdf::Error),
/// A standard I/O error; produced automatically by `?` through `#[from]`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// A requested zero-based page index was not smaller than the page count.
/// `index` is the offending index, `total` the number of pages in the document.
#[error("Invalid page index: {index} (document has {total} pages)")]
InvalidPage { index: usize, total: usize },
/// The requested operation would yield a PDF without pages
/// (returned by `extract_pages` when no indices are given).
#[error("Cannot produce empty PDF")]
EmptyResult,
/// `merge_pdfs` was called with an empty list of inputs.
#[error("No input PDFs provided")]
NoInput,
}
/// Shorthand for results whose error type is [`PdfError`].
pub type Result<T> = std::result::Result<T, PdfError>;
/// Document information and page geometry as collected by `get_metadata`.
///
/// The textual fields are `None` when the corresponding Info-dictionary entry
/// is absent or could not be read as a string.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
/// Value of the Info dictionary's `Title` entry.
pub title: Option<String>,
/// Value of the Info dictionary's `Author` entry.
pub author: Option<String>,
/// Value of the Info dictionary's `Subject` entry.
pub subject: Option<String>,
/// Value of the Info dictionary's `Creator` entry.
pub creator: Option<String>,
/// Value of the Info dictionary's `Producer` entry.
pub producer: Option<String>,
/// Number of pages reported by `Document::get_pages`.
pub page_count: usize,
/// Page dimensions in page order, derived from `MediaBox` entries with a
/// 612 x 792 fallback.
pub page_sizes: Vec<PageSize>,
}
/// Width and height of a page, computed as differences of `MediaBox` coordinates.
#[derive(Debug, Clone, Copy)]
pub struct PageSize {
/// Horizontal extent: third `MediaBox` coordinate minus the first.
pub width: f32,
/// Vertical extent: fourth `MediaBox` coordinate minus the second.
pub height: f32,
}
/// Concatenates several PDF documents into a single PDF, in the order given.
///
/// Each input is parsed, its objects are renumbered into an id range that does
/// not overlap the previous inputs, and the objects are moved into a fresh
/// PDF 1.7 document. All pages are then attached to one flat page-tree node
/// that is referenced from a newly created catalog.
///
/// Pages keep the order they have inside their source document. Attributes a
/// page inherited from its old page-tree ancestors (`Resources`, `MediaBox`,
/// `CropBox`, `Rotate`) are copied onto the page before it is re-parented, so
/// they are not lost when the old tree is discarded.
///
/// # Errors
///
/// * [`PdfError::NoInput`] if `pdfs` is empty.
/// * [`PdfError::Parse`] if an input cannot be parsed.
/// * [`PdfError::EmptyResult`] if the inputs contain no pages at all.
/// * Whatever error serialising the merged document produces.
pub fn merge_pdfs(pdfs: &[&[u8]]) -> Result<Vec<u8>> {
    if pdfs.is_empty() {
        return Err(PdfError::NoInput);
    }

    let mut merged = Document::with_version("1.7");
    let mut all_page_ids: Vec<ObjectId> = Vec::new();
    let mut next_id = 1;

    for pdf in pdfs {
        let mut doc = Document::load_mem(pdf)?;
        doc.renumber_objects_with(next_id);
        next_id = doc.max_id + 1;
        // `get_pages` is keyed by page number, so its values already come out
        // in page order; sorting them by object id would scramble the pages.
        all_page_ids.extend(doc.get_pages().into_values());
        merged.objects.extend(doc.objects);
    }

    if all_page_ids.is_empty() {
        return Err(PdfError::EmptyResult);
    }

    // Objects were inserted directly, so `max_id` is still that of an empty
    // document. Bring it up to date before allocating ids, otherwise the new
    // ids collide with (and overwrite) imported objects.
    merged.max_id = merged.objects.keys().next_back().map_or(0, |id| id.0);

    // Reuse an existing page-tree node as the new root when there is one.
    let pages_id = match find_pages_id(&merged) {
        Some(id) => id,
        None => merged.new_object_id(),
    };
    let catalog_id = merged.new_object_id();

    // Must run before the root node is overwritten below: the node being
    // reused may be an ancestor some pages still inherit attributes from.
    for page_id in &all_page_ids {
        let inherited = inherited_page_attributes(&merged, *page_id);
        if let Some(Object::Dictionary(page)) = merged.objects.get_mut(page_id) {
            for (key, value) in inherited {
                page.set(key, value);
            }
            page.set("Parent", Object::Reference(pages_id));
        }
    }

    let kids: Vec<Object> = all_page_ids
        .iter()
        .map(|id| Object::Reference(*id))
        .collect();
    let count = kids.len() as i64;
    merged.objects.insert(
        pages_id,
        Object::Dictionary(lopdf::dictionary! {
            "Type" => "Pages",
            "Count" => count,
            "Kids" => kids,
        }),
    );
    merged.objects.insert(
        catalog_id,
        Object::Dictionary(lopdf::dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }),
    );
    merged.trailer.set("Root", Object::Reference(catalog_id));

    let mut buf = Vec::new();
    merged.save_to(&mut buf)?;
    Ok(buf)
}

/// Collects the inheritable attributes that `page_id` lacks itself but that
/// one of its page-tree ancestors provides (the nearest ancestor wins).
fn inherited_page_attributes(doc: &Document, page_id: ObjectId) -> Vec<(&'static [u8], Object)> {
    const INHERITABLE_KEYS: [&[u8]; 4] = [b"Resources", b"MediaBox", b"CropBox", b"Rotate"];
    // Upper bound on the walk so a malformed, cyclic `Parent` chain terminates.
    const MAX_TREE_DEPTH: usize = 64;

    let mut found: Vec<(&'static [u8], Object)> = Vec::new();
    let page = match doc.get_object(page_id).ok().and_then(|obj| obj.as_dict().ok()) {
        Some(dict) => dict,
        None => return found,
    };

    let mut missing: Vec<&'static [u8]> = INHERITABLE_KEYS
        .iter()
        .copied()
        .filter(|key| page.get(*key).is_err())
        .collect();
    let mut parent = page.get(b"Parent").ok().and_then(|obj| obj.as_reference().ok());

    for _ in 0..MAX_TREE_DEPTH {
        if missing.is_empty() {
            break;
        }
        let node_id = match parent {
            Some(id) => id,
            None => break,
        };
        let node = match doc.get_object(node_id).ok().and_then(|obj| obj.as_dict().ok()) {
            Some(dict) => dict,
            None => break,
        };
        // Take every still-missing key this ancestor defines and stop
        // looking for it further up.
        missing.retain(|key| match node.get(*key) {
            Ok(value) => {
                found.push((*key, value.clone()));
                false
            }
            Err(_) => true,
        });
        parent = node.get(b"Parent").ok().and_then(|obj| obj.as_reference().ok());
    }

    found
}
/// Splits a PDF into one single-page PDF per page, in page order.
///
/// The document is parsed once to learn the page count; every page is then
/// produced by calling `extract_pages` on the original bytes with that page's
/// zero-based index. A document without pages yields an empty vector.
///
/// # Errors
///
/// Returns the parse error if `pdf` cannot be loaded, or the first error
/// reported by `extract_pages`.
pub fn split_pdf(pdf: &[u8]) -> Result<Vec<Vec<u8>>> {
    let page_total = Document::load_mem(pdf)?.get_pages().len();
    // Collecting into `Result` stops at the first failing page.
    (0..page_total)
        .map(|page_index| extract_pages(pdf, &[page_index]))
        .collect()
}
/// Builds a new PDF containing only the pages at the given zero-based indices.
///
/// Pages are selected by removing every other page from the parsed document,
/// so the result keeps the pages in their original document order; the order
/// of `indices` and duplicate entries in it have no effect. Objects that are
/// no longer referenced once the other pages are gone are pruned, so the
/// output does not carry the content of removed pages.
///
/// # Errors
///
/// * [`PdfError::EmptyResult`] if `indices` is empty.
/// * [`PdfError::Parse`] if `pdf` cannot be parsed.
/// * [`PdfError::InvalidPage`] for the first index that is not smaller than
///   the document's page count.
/// * Whatever error serialising the resulting document produces.
pub fn extract_pages(pdf: &[u8], indices: &[usize]) -> Result<Vec<u8>> {
    if indices.is_empty() {
        return Err(PdfError::EmptyResult);
    }

    let mut doc = Document::load_mem(pdf)?;
    let pages: BTreeMap<u32, ObjectId> = doc.get_pages();
    let total = pages.len();
    if let Some(&index) = indices.iter().find(|&&index| index >= total) {
        return Err(PdfError::InvalidPage { index, total });
    }

    // Select by position in the page map rather than by computing page
    // numbers, which avoids a narrowing `usize -> u32` cast.
    let keep: std::collections::BTreeSet<usize> = indices.iter().copied().collect();
    let remove: Vec<u32> = pages
        .keys()
        .enumerate()
        .filter(|(position, _)| !keep.contains(position))
        .map(|(_, &page_number)| page_number)
        .collect();

    // One call removes all unwanted pages; the parsed document is edited in
    // place because nothing else needs the unmodified copy.
    doc.delete_pages(&remove);
    // Drop fonts, images and content streams only the removed pages used.
    doc.prune_objects();

    let mut buf = Vec::new();
    doc.save_to(&mut buf)?;
    Ok(buf)
}
pub fn get_metadata(pdf: &[u8]) -> Result<PdfMetadata> {
let doc = Document::load_mem(pdf)?;
let pages = doc.get_pages();
let mut meta = PdfMetadata {
page_count: pages.len(),
..Default::default()
};
if let Ok(info_ref) = doc.trailer.get(b"Info") {
let info_id = match info_ref {
Object::Reference(id) => Some(*id),
_ => info_ref.as_reference().ok(),
};
if let Some(id) = info_id {
if let Ok(Object::Dictionary(info)) = doc.get_object(id) {
meta.title = get_string_from_dict(info, b"Title");
meta.author = get_string_from_dict(info, b"Author");
meta.subject = get_string_from_dict(info, b"Subject");
meta.creator = get_string_from_dict(info, b"Creator");
meta.producer = get_string_from_dict(info, b"Producer");
}
}
}
for (_, page_id) in &pages {
if let Ok(Object::Dictionary(page)) = doc.get_object(*page_id) {
if let Ok(mediabox) = page.get(b"MediaBox") {
if let Ok(arr) = mediabox.as_array() {
if arr.len() >= 4 {
let w = arr[2].as_float().or_else(|_| arr[2].as_i64().map(|v| v as f32)).unwrap_or(612.0)
- arr[0].as_float().or_else(|_| arr[0].as_i64().map(|v| v as f32)).unwrap_or(0.0);
let h = arr[3].as_float().or_else(|_| arr[3].as_i64().map(|v| v as f32)).unwrap_or(792.0)
- arr[1].as_float().or_else(|_| arr[1].as_i64().map(|v| v as f32)).unwrap_or(0.0);
meta.page_sizes.push(PageSize { width: w, height: h });
continue;
}
}
}
meta.page_sizes.push(PageSize { width: 612.0, height: 792.0 });
}
}
Ok(meta)
}
/// Returns the id of the first object, in object-id order, that is a
/// dictionary whose `Type` entry is the name `Pages`; `None` if there is none.
fn find_pages_id(doc: &Document) -> Option<ObjectId> {
    doc.objects.iter().find_map(|(id, object)| {
        let dict = match object {
            Object::Dictionary(dict) => dict,
            _ => return None,
        };
        match dict.get(b"Type").and_then(|kind| kind.as_name_str()) {
            Ok("Pages") => Some(*id),
            _ => None,
        }
    })
}
/// Reads the string stored under `key` in `dict` and decodes it to a `String`.
///
/// Returns `None` when the key is absent, the value is not a string object,
/// or the bytes cannot be decoded (see `decode_pdf_text_string`).
fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
    match dict.get(key).ok()? {
        Object::String(bytes, _) => decode_pdf_text_string(bytes),
        _ => None,
    }
}

/// Decodes the raw bytes of a PDF text string.
///
/// * Bytes starting with the byte-order mark `FE FF` are decoded as UTF-16BE;
///   a dangling odd trailing byte is ignored.
/// * Otherwise the bytes are interpreted as UTF-8, with a leading UTF-8
///   byte-order mark (`EF BB BF`) stripped.
///
/// Returns `None` if the bytes are not valid in the detected encoding.
fn decode_pdf_text_string(bytes: &[u8]) -> Option<String> {
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    if bytes.starts_with(&UTF16_BE_BOM) {
        let units: Vec<u16> = bytes[UTF16_BE_BOM.len()..]
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16(&units).ok();
    }

    let text = if bytes.starts_with(&UTF8_BOM) {
        &bytes[UTF8_BOM.len()..]
    } else {
        bytes
    };
    std::str::from_utf8(text).ok().map(str::to_owned)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Merging an empty list of inputs is rejected with `NoInput` specifically,
    /// not merely with "some error".
    #[test]
    fn test_merge_empty_returns_error() {
        assert!(matches!(merge_pdfs(&[]), Err(PdfError::NoInput)));
    }

    /// Requesting page index 99 from a document created without any pages fails.
    #[test]
    fn test_extract_invalid_page_returns_error() {
        // `mut` because serialising goes through a mutable borrow of the document.
        let mut doc = Document::with_version("1.7");
        let mut buf = Vec::new();
        doc.save_to(&mut buf).unwrap();
        let result = extract_pages(&buf, &[99]);
        // NOTE(review): only `is_err` is checked. A document saved without a
        // catalog may already fail while being parsed, so this may not reach
        // the `InvalidPage` branch at all — confirm, and pin the variant once
        // a fixture with a real page is available.
        assert!(result.is_err());
    }
}