// mioffice_pdf_utils — lib.rs
1//! # mioffice-pdf-utils
2//!
3//! Lightweight PDF utilities for Rust — merge, split, extract pages, and read metadata.
4//! Pure Rust with zero native dependencies.
5//!
6//! Built by [MiOffice.ai](https://www.mioffice.ai) — the AI office suite where files
7//! never leave your browser.
8//!
9//! ## Quick Start
10//!
11//! ```rust,no_run
12//! use mioffice_pdf_utils::{merge_pdfs, get_metadata};
13//! use std::fs;
14//!
15//! let pdf1 = fs::read("doc1.pdf").unwrap();
16//! let pdf2 = fs::read("doc2.pdf").unwrap();
17//! let merged = merge_pdfs(&[&pdf1, &pdf2]).unwrap();
18//! fs::write("merged.pdf", merged).unwrap();
19//!
20//! let meta = get_metadata(&pdf1).unwrap();
21//! println!("Pages: {}, Title: {:?}", meta.page_count, meta.title);
22//! ```
23
24use lopdf::{dictionary, Document, Object, ObjectId};
25use std::collections::BTreeMap;
26
27pub use lopdf;
28
/// Errors that can occur while parsing or manipulating PDF documents.
#[derive(Debug, thiserror::Error)]
pub enum PdfError {
    /// The input bytes could not be parsed as a PDF document.
    #[error("Failed to parse PDF: {0}")]
    Parse(#[from] lopdf::Error),
    /// An underlying I/O operation failed (e.g. while serializing the output).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// A requested 0-based page index is out of range for the document.
    #[error("Invalid page index: {index} (document has {total} pages)")]
    InvalidPage { index: usize, total: usize },
    /// The requested operation would produce a PDF with no pages.
    #[error("Cannot produce empty PDF")]
    EmptyResult,
    /// `merge_pdfs` was called with an empty input slice.
    #[error("No input PDFs provided")]
    NoInput,
}

/// Convenience alias used by every fallible function in this crate.
pub type Result<T> = std::result::Result<T, PdfError>;
44
/// PDF metadata extracted from the document info dictionary.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
    // Info-dictionary fields are `None` when absent or not valid UTF-8.
    pub title: Option<String>,
    pub author: Option<String>,
    pub subject: Option<String>,
    pub creator: Option<String>,
    pub producer: Option<String>,
    // Total number of pages in the document.
    pub page_count: usize,
    // One entry per page, in page order; pages without a usable /MediaBox
    // fall back to US Letter (612 x 792 points).
    pub page_sizes: Vec<PageSize>,
}
56
/// Width and height of a PDF page in points (1 point = 1/72 inch).
#[derive(Debug, Clone, Copy)]
pub struct PageSize {
    pub width: f32,
    pub height: f32,
}
63
64/// Merge multiple PDF byte slices into a single PDF.
65pub fn merge_pdfs(pdfs: &[&[u8]]) -> Result<Vec<u8>> {
66    if pdfs.is_empty() {
67        return Err(PdfError::NoInput);
68    }
69
70    let mut documents: Vec<Document> = Vec::with_capacity(pdfs.len());
71    for pdf in pdfs {
72        documents.push(Document::load_mem(pdf)?);
73    }
74
75    let mut merged = Document::with_version("1.7");
76    let mut pages_object_id: Option<ObjectId> = None;
77    let mut all_page_ids: Vec<ObjectId> = Vec::new();
78    let mut max_id = 1;
79
80    for doc in &documents {
81        // Renumber objects to avoid ID collisions
82        let mut doc = doc.clone();
83        doc.renumber_objects_with(max_id);
84        max_id = doc.max_id + 1;
85
86        // Collect page references
87        let pages = doc.get_pages();
88        let mut page_ids: Vec<ObjectId> = pages.into_values().collect();
89        page_ids.sort();
90
91        // Copy all objects into merged document
92        for (id, object) in doc.objects {
93            merged.objects.insert(id, object);
94        }
95
96        // Set up or extend pages catalog
97        if pages_object_id.is_none() {
98            if let Some(catalog_id) = find_pages_id(&merged) {
99                pages_object_id = Some(catalog_id);
100            }
101        }
102
103        all_page_ids.extend(page_ids);
104    }
105
106    // Build final pages tree
107    if let Some(pid) = pages_object_id {
108        let kids: Vec<Object> = all_page_ids.iter().map(|id| Object::Reference(*id)).collect();
109        let count = kids.len() as i64;
110
111        merged.objects.insert(
112            pid,
113            Object::Dictionary(lopdf::dictionary! {
114                "Type" => "Pages",
115                "Count" => count,
116                "Kids" => kids,
117            }),
118        );
119
120        // Update parent references
121        for page_id in &all_page_ids {
122            if let Ok(Object::Dictionary(ref mut dict)) = merged.objects.get_mut(page_id).ok_or(PdfError::EmptyResult) {
123                dict.set("Parent", Object::Reference(pid));
124            }
125        }
126
127        // Build catalog
128        let catalog_id = merged.new_object_id();
129        merged.objects.insert(
130            catalog_id,
131            Object::Dictionary(lopdf::dictionary! {
132                "Type" => "Catalog",
133                "Pages" => Object::Reference(pid),
134            }),
135        );
136        merged.trailer.set("Root", Object::Reference(catalog_id));
137    }
138
139    let mut buf = Vec::new();
140    merged.save_to(&mut buf)?;
141    Ok(buf)
142}
143
144/// Split a PDF into individual single-page PDFs.
145pub fn split_pdf(pdf: &[u8]) -> Result<Vec<Vec<u8>>> {
146    let doc = Document::load_mem(pdf)?;
147    let page_count = doc.get_pages().len();
148    let mut results = Vec::with_capacity(page_count);
149
150    for i in 0..page_count {
151        results.push(extract_pages(pdf, &[i])?);
152    }
153
154    Ok(results)
155}
156
157/// Extract specific pages from a PDF. Pages are 0-indexed.
158pub fn extract_pages(pdf: &[u8], indices: &[usize]) -> Result<Vec<u8>> {
159    if indices.is_empty() {
160        return Err(PdfError::EmptyResult);
161    }
162
163    let doc = Document::load_mem(pdf)?;
164    let pages: BTreeMap<u32, ObjectId> = doc.get_pages();
165    let total = pages.len();
166
167    // Validate indices
168    for &idx in indices {
169        if idx >= total {
170            return Err(PdfError::InvalidPage { index: idx, total });
171        }
172    }
173
174    // Build page list (lopdf pages are 1-indexed)
175    let keep: Vec<u32> = indices.iter().map(|&i| (i + 1) as u32).collect();
176
177    let mut new_doc = doc.clone();
178    let all_pages: Vec<u32> = pages.keys().copied().collect();
179    let remove: Vec<u32> = all_pages.into_iter().filter(|p| !keep.contains(p)).collect();
180
181    for page_num in remove.into_iter().rev() {
182        new_doc.delete_pages(&[page_num]);
183    }
184
185    let mut buf = Vec::new();
186    new_doc.save_to(&mut buf)?;
187    Ok(buf)
188}
189
190/// Get metadata and page information from a PDF.
191pub fn get_metadata(pdf: &[u8]) -> Result<PdfMetadata> {
192    let doc = Document::load_mem(pdf)?;
193    let pages = doc.get_pages();
194
195    let mut meta = PdfMetadata {
196        page_count: pages.len(),
197        ..Default::default()
198    };
199
200    // Extract info dictionary
201    if let Ok(info_ref) = doc.trailer.get(b"Info") {
202        let info_id = match info_ref {
203            Object::Reference(id) => Some(*id),
204            _ => info_ref.as_reference().ok(),
205        };
206        if let Some(id) = info_id {
207            if let Ok(Object::Dictionary(info)) = doc.get_object(id) {
208                meta.title = get_string_from_dict(info, b"Title");
209                meta.author = get_string_from_dict(info, b"Author");
210                meta.subject = get_string_from_dict(info, b"Subject");
211                meta.creator = get_string_from_dict(info, b"Creator");
212                meta.producer = get_string_from_dict(info, b"Producer");
213            }
214        }
215    }
216
217    // Extract page sizes
218    for (_, page_id) in &pages {
219        if let Ok(Object::Dictionary(page)) = doc.get_object(*page_id) {
220            if let Ok(mediabox) = page.get(b"MediaBox") {
221                if let Ok(arr) = mediabox.as_array() {
222                    if arr.len() >= 4 {
223                        let w = arr[2].as_float().or_else(|_| arr[2].as_i64().map(|v| v as f32)).unwrap_or(612.0)
224                            - arr[0].as_float().or_else(|_| arr[0].as_i64().map(|v| v as f32)).unwrap_or(0.0);
225                        let h = arr[3].as_float().or_else(|_| arr[3].as_i64().map(|v| v as f32)).unwrap_or(792.0)
226                            - arr[1].as_float().or_else(|_| arr[1].as_i64().map(|v| v as f32)).unwrap_or(0.0);
227                        meta.page_sizes.push(PageSize { width: w, height: h });
228                        continue;
229                    }
230                }
231            }
232            meta.page_sizes.push(PageSize { width: 612.0, height: 792.0 });
233        }
234    }
235
236    Ok(meta)
237}
238
239fn find_pages_id(doc: &Document) -> Option<ObjectId> {
240    for (id, obj) in &doc.objects {
241        if let Object::Dictionary(dict) = obj {
242            if let Ok(type_val) = dict.get(b"Type") {
243                if type_val.as_name_str().ok() == Some("Pages") {
244                    return Some(*id);
245                }
246            }
247        }
248    }
249    None
250}
251
252fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
253    dict.get(key).ok().and_then(|v| match v {
254        Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
255        _ => None,
256    })
257}
258
#[cfg(test)]
mod tests {
    use super::*;

    // merge_pdfs must reject an empty input slice rather than emitting an
    // empty (invalid) document.
    #[test]
    fn test_merge_empty_returns_error() {
        assert!(merge_pdfs(&[]).is_err());
    }

    // Requesting a page index beyond the document length must fail.
    #[test]
    fn test_extract_invalid_page_returns_error() {
        // Minimal valid PDF: a fresh document with no pages.
        let doc = Document::with_version("1.7");
        let mut buf = Vec::new();
        doc.save_to(&mut buf).unwrap();

        let result = extract_pages(&buf, &[99]);
        assert!(result.is_err());
    }
}