mioffice-pdf-utils 0.1.0

Lightweight PDF utilities — merge, split, extract pages, read metadata. Pure Rust, no native dependencies.
Documentation
//! # mioffice-pdf-utils
//!
//! Lightweight PDF utilities for Rust — merge, split, extract pages, and read metadata.
//! Pure Rust with zero native dependencies.
//!
//! Built by [MiOffice.ai](https://www.mioffice.ai) — the AI office suite where files
//! never leave your browser.
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use mioffice_pdf_utils::{merge_pdfs, get_metadata};
//! use std::fs;
//!
//! let pdf1 = fs::read("doc1.pdf").unwrap();
//! let pdf2 = fs::read("doc2.pdf").unwrap();
//! let merged = merge_pdfs(&[&pdf1, &pdf2]).unwrap();
//! fs::write("merged.pdf", merged).unwrap();
//!
//! let meta = get_metadata(&pdf1).unwrap();
//! println!("Pages: {}, Title: {:?}", meta.page_count, meta.title);
//! ```

use lopdf::{dictionary, Document, Object, ObjectId};
use std::collections::BTreeMap;

pub use lopdf;

/// Errors returned by the functions in this crate.
///
/// `Parse` and `Io` are created automatically by the `?` operator through the
/// `From` conversions generated by `#[from]`.
#[derive(Debug, thiserror::Error)]
pub enum PdfError {
    /// The underlying `lopdf` call failed (for example while loading the input bytes).
    #[error("Failed to parse PDF: {0}")]
    Parse(#[from] lopdf::Error),
    /// An I/O error occurred, e.g. while serializing a document into the output buffer.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// A requested 0-based page index is not smaller than the document's page count.
    #[error("Invalid page index: {index} (document has {total} pages)")]
    InvalidPage { index: usize, total: usize },
    /// The operation would produce a PDF without pages (e.g. `extract_pages` was
    /// called with no indices).
    #[error("Cannot produce empty PDF")]
    EmptyResult,
    /// `merge_pdfs` was called with an empty list of inputs.
    #[error("No input PDFs provided")]
    NoInput,
}

/// Convenience alias for results whose error type is [`PdfError`].
pub type Result<T> = std::result::Result<T, PdfError>;

/// PDF metadata extracted from the document info dictionary.
///
/// The text fields are `None` when the corresponding entry is missing from the
/// info dictionary or cannot be decoded. `Default` yields all-`None` fields,
/// a page count of zero and no page sizes.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
    /// Value of the info dictionary's `Title` entry.
    pub title: Option<String>,
    /// Value of the info dictionary's `Author` entry.
    pub author: Option<String>,
    /// Value of the info dictionary's `Subject` entry.
    pub subject: Option<String>,
    /// Value of the info dictionary's `Creator` entry.
    pub creator: Option<String>,
    /// Value of the info dictionary's `Producer` entry.
    pub producer: Option<String>,
    /// Number of pages found in the document.
    pub page_count: usize,
    /// Page sizes in ascending page-number order; 612 x 792 points (US Letter)
    /// is reported for a page when no usable `MediaBox` is found.
    pub page_sizes: Vec<PageSize>,
}

/// Width and height of a PDF page in points (1 point = 1/72 inch).
///
/// Values are derived from the page's `MediaBox` rectangle.
#[derive(Debug, Clone, Copy)]
pub struct PageSize {
    /// Horizontal extent of the media box, in points.
    pub width: f32,
    /// Vertical extent of the media box, in points.
    pub height: f32,
}

/// Merge multiple PDF byte slices into a single PDF.
pub fn merge_pdfs(pdfs: &[&[u8]]) -> Result<Vec<u8>> {
    if pdfs.is_empty() {
        return Err(PdfError::NoInput);
    }

    let mut documents: Vec<Document> = Vec::with_capacity(pdfs.len());
    for pdf in pdfs {
        documents.push(Document::load_mem(pdf)?);
    }

    let mut merged = Document::with_version("1.7");
    let mut pages_object_id: Option<ObjectId> = None;
    let mut all_page_ids: Vec<ObjectId> = Vec::new();
    let mut max_id = 1;

    for doc in &documents {
        // Renumber objects to avoid ID collisions
        let mut doc = doc.clone();
        doc.renumber_objects_with(max_id);
        max_id = doc.max_id + 1;

        // Collect page references
        let pages = doc.get_pages();
        let mut page_ids: Vec<ObjectId> = pages.into_values().collect();
        page_ids.sort();

        // Copy all objects into merged document
        for (id, object) in doc.objects {
            merged.objects.insert(id, object);
        }

        // Set up or extend pages catalog
        if pages_object_id.is_none() {
            if let Some(catalog_id) = find_pages_id(&merged) {
                pages_object_id = Some(catalog_id);
            }
        }

        all_page_ids.extend(page_ids);
    }

    // Build final pages tree
    if let Some(pid) = pages_object_id {
        let kids: Vec<Object> = all_page_ids.iter().map(|id| Object::Reference(*id)).collect();
        let count = kids.len() as i64;

        merged.objects.insert(
            pid,
            Object::Dictionary(lopdf::dictionary! {
                "Type" => "Pages",
                "Count" => count,
                "Kids" => kids,
            }),
        );

        // Update parent references
        for page_id in &all_page_ids {
            if let Ok(Object::Dictionary(ref mut dict)) = merged.objects.get_mut(page_id).ok_or(PdfError::EmptyResult) {
                dict.set("Parent", Object::Reference(pid));
            }
        }

        // Build catalog
        let catalog_id = merged.new_object_id();
        merged.objects.insert(
            catalog_id,
            Object::Dictionary(lopdf::dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pid),
            }),
        );
        merged.trailer.set("Root", Object::Reference(catalog_id));
    }

    let mut buf = Vec::new();
    merged.save_to(&mut buf)?;
    Ok(buf)
}

/// Split a PDF into individual single-page PDFs.
///
/// The returned vector holds one serialized PDF per page of the input, in
/// page order. Each element is produced by calling [`extract_pages`] with a
/// single 0-based index.
///
/// # Errors
///
/// Returns [`PdfError::Parse`] if the input cannot be loaded, and forwards
/// the first error reported by [`extract_pages`].
pub fn split_pdf(pdf: &[u8]) -> Result<Vec<Vec<u8>>> {
    let page_total = Document::load_mem(pdf)?.get_pages().len();

    // Collecting into `Result` stops at the first failing page.
    (0..page_total)
        .map(|page_index| extract_pages(pdf, &[page_index]))
        .collect()
}

/// Extract specific pages from a PDF. Pages are 0-indexed.
///
/// `indices` acts as a set: the selected pages keep their original relative
/// order in the output, and repeating an index has no additional effect.
/// Objects that are no longer reachable once the other pages are removed
/// (their content streams, resources, ...) are pruned, so the result does not
/// carry the data of pages that were not requested.
///
/// # Errors
///
/// * [`PdfError::EmptyResult`] if `indices` is empty.
/// * [`PdfError::Parse`] if the input cannot be parsed.
/// * [`PdfError::InvalidPage`] for the first index that is not smaller than
///   the document's page count.
/// * [`PdfError::Io`] if serializing the result fails.
pub fn extract_pages(pdf: &[u8], indices: &[usize]) -> Result<Vec<u8>> {
    if indices.is_empty() {
        return Err(PdfError::EmptyResult);
    }

    let mut doc = Document::load_mem(pdf)?;
    let pages: BTreeMap<u32, ObjectId> = doc.get_pages();
    let total = pages.len();

    // Validate the indices while marking the pages to keep (position i <=> i-th page).
    let mut keep = vec![false; total];
    for &idx in indices {
        match keep.get_mut(idx) {
            Some(slot) => *slot = true,
            None => return Err(PdfError::InvalidPage { index: idx, total }),
        }
    }

    // Page numbers (the map keys, in ascending order) of every page not kept.
    let mut remove: Vec<u32> = pages
        .keys()
        .copied()
        .zip(keep)
        .filter(|&(_, kept)| !kept)
        .map(|(page_number, _)| page_number)
        .collect();
    // Highest page number first, so removing one page can never shift the
    // number of a page that is still waiting to be removed.
    remove.reverse();

    doc.delete_pages(&remove);
    // Drop objects that were only referenced by the deleted pages.
    doc.prune_objects();

    let mut buf = Vec::new();
    doc.save_to(&mut buf)?;
    Ok(buf)
}

/// Get metadata and page information from a PDF.
pub fn get_metadata(pdf: &[u8]) -> Result<PdfMetadata> {
    let doc = Document::load_mem(pdf)?;
    let pages = doc.get_pages();

    let mut meta = PdfMetadata {
        page_count: pages.len(),
        ..Default::default()
    };

    // Extract info dictionary
    if let Ok(info_ref) = doc.trailer.get(b"Info") {
        let info_id = match info_ref {
            Object::Reference(id) => Some(*id),
            _ => info_ref.as_reference().ok(),
        };
        if let Some(id) = info_id {
            if let Ok(Object::Dictionary(info)) = doc.get_object(id) {
                meta.title = get_string_from_dict(info, b"Title");
                meta.author = get_string_from_dict(info, b"Author");
                meta.subject = get_string_from_dict(info, b"Subject");
                meta.creator = get_string_from_dict(info, b"Creator");
                meta.producer = get_string_from_dict(info, b"Producer");
            }
        }
    }

    // Extract page sizes
    for (_, page_id) in &pages {
        if let Ok(Object::Dictionary(page)) = doc.get_object(*page_id) {
            if let Ok(mediabox) = page.get(b"MediaBox") {
                if let Ok(arr) = mediabox.as_array() {
                    if arr.len() >= 4 {
                        let w = arr[2].as_float().or_else(|_| arr[2].as_i64().map(|v| v as f32)).unwrap_or(612.0)
                            - arr[0].as_float().or_else(|_| arr[0].as_i64().map(|v| v as f32)).unwrap_or(0.0);
                        let h = arr[3].as_float().or_else(|_| arr[3].as_i64().map(|v| v as f32)).unwrap_or(792.0)
                            - arr[1].as_float().or_else(|_| arr[1].as_i64().map(|v| v as f32)).unwrap_or(0.0);
                        meta.page_sizes.push(PageSize { width: w, height: h });
                        continue;
                    }
                }
            }
            meta.page_sizes.push(PageSize { width: 612.0, height: 792.0 });
        }
    }

    Ok(meta)
}

/// Return the id of the first object (in ascending id order) that is a
/// dictionary whose `Type` entry is the name `Pages`, or `None` if the
/// document holds no such object.
fn find_pages_id(doc: &Document) -> Option<ObjectId> {
    doc.objects.iter().find_map(|(id, object)| match object {
        Object::Dictionary(dict)
            if dict
                .get(b"Type")
                .ok()
                .and_then(|kind| kind.as_name_str().ok())
                == Some("Pages") =>
        {
            Some(*id)
        }
        _ => None,
    })
}

/// Read the string stored under `key` in `dict` and decode it to a `String`.
///
/// Returns `None` when the key is absent, the value is not a string object,
/// or the bytes cannot be decoded (see [`decode_text_string`]).
fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
    match dict.get(key).ok()? {
        Object::String(bytes, _) => decode_text_string(bytes),
        _ => None,
    }
}

/// Decode the raw bytes of a PDF text string.
///
/// * Bytes starting with the byte-order mark `FE FF` are decoded as UTF-16BE.
/// * A leading UTF-8 byte-order mark (`EF BB BF`) is stripped.
/// * Everything else is accepted only if it is valid UTF-8 (which includes
///   plain ASCII).
///
/// Returns `None` for malformed input instead of guessing.
fn decode_text_string(bytes: &[u8]) -> Option<String> {
    if bytes.starts_with(&[0xFE, 0xFF]) {
        let payload = &bytes[2..];
        // UTF-16 needs whole 16-bit code units.
        if payload.len() % 2 != 0 {
            return None;
        }
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16(&units).ok();
    }

    let utf8 = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
        &bytes[3..]
    } else {
        bytes
    };
    std::str::from_utf8(utf8).ok().map(str::to_owned)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a real one-page PDF (US Letter `MediaBox`, no content stream).
    fn one_page_pdf() -> Vec<u8> {
        let mut doc = Document::with_version("1.7");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(lopdf::dictionary! {
            "Type" => "Page",
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => vec![
                Object::Integer(0),
                Object::Integer(0),
                Object::Integer(612),
                Object::Integer(792),
            ],
        });
        doc.objects.insert(
            pages_id,
            Object::Dictionary(lopdf::dictionary! {
                "Type" => "Pages",
                "Kids" => vec![Object::Reference(page_id)],
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.add_object(lopdf::dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        });
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut buf = Vec::new();
        doc.save_to(&mut buf).expect("writing to a Vec cannot fail");
        buf
    }

    #[test]
    fn test_merge_empty_returns_error() {
        assert!(matches!(merge_pdfs(&[]), Err(PdfError::NoInput)));
    }

    #[test]
    fn test_extract_invalid_page_returns_error() {
        // A document with neither catalog nor pages: whether it fails to parse
        // or parses as a zero-page document, index 99 must be rejected.
        let doc = Document::with_version("1.7");
        let mut buf = Vec::new();
        doc.save_to(&mut buf).unwrap();

        let result = extract_pages(&buf, &[99]);
        assert!(result.is_err());
    }

    #[test]
    fn test_extract_out_of_range_index_reports_invalid_page() {
        let pdf = one_page_pdf();
        assert!(matches!(
            extract_pages(&pdf, &[99]),
            Err(PdfError::InvalidPage { index: 99, total: 1 })
        ));
    }

    #[test]
    fn test_extract_without_indices_returns_empty_result() {
        let pdf = one_page_pdf();
        assert!(matches!(extract_pages(&pdf, &[]), Err(PdfError::EmptyResult)));
    }

    #[test]
    fn test_metadata_reports_page_count_and_size() {
        let meta = get_metadata(&one_page_pdf()).unwrap();
        assert_eq!(meta.page_count, 1);
        assert_eq!(meta.page_sizes.len(), 1);
        assert_eq!(meta.page_sizes[0].width, 612.0);
        assert_eq!(meta.page_sizes[0].height, 792.0);
        assert!(meta.title.is_none());
    }

    #[test]
    fn test_split_single_page_yields_one_document() {
        let parts = split_pdf(&one_page_pdf()).unwrap();
        assert_eq!(parts.len(), 1);
        assert_eq!(get_metadata(&parts[0]).unwrap().page_count, 1);
    }
}