use std::collections::BTreeMap;

use lopdf::{dictionary, Document, Object, ObjectId};

pub use lopdf;
/// Errors produced by this crate's PDF operations.
#[derive(Debug, thiserror::Error)]
pub enum PdfError {
    /// The input bytes could not be parsed as a PDF (wraps `lopdf::Error`).
    #[error("Failed to parse PDF: {0}")]
    Parse(#[from] lopdf::Error),
    /// An underlying I/O operation failed (e.g. while serializing).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// A zero-based page index was at or beyond the document's page count.
    #[error("Invalid page index: {index} (document has {total} pages)")]
    InvalidPage { index: usize, total: usize },
    /// The requested operation would yield a PDF with no pages.
    #[error("Cannot produce empty PDF")]
    EmptyResult,
    /// A function requiring at least one input PDF received none.
    #[error("No input PDFs provided")]
    NoInput,
}

/// Crate-wide result alias using [`PdfError`].
pub type Result<T> = std::result::Result<T, PdfError>;
44
/// Document metadata read from a PDF's Info dictionary plus page geometry.
///
/// All Info fields are `None` when the entry is absent, not a PDF string,
/// or not valid UTF-8.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
    // Info dictionary /Title entry.
    pub title: Option<String>,
    // Info dictionary /Author entry.
    pub author: Option<String>,
    // Info dictionary /Subject entry.
    pub subject: Option<String>,
    // Info dictionary /Creator entry.
    pub creator: Option<String>,
    // Info dictionary /Producer entry.
    pub producer: Option<String>,
    // Total number of pages in the document.
    pub page_count: usize,
    // One entry per page whose page object is a dictionary; NOTE(review):
    // may be shorter than `page_count` if a page object is missing/malformed.
    pub page_sizes: Vec<PageSize>,
}
56
/// Width and height of a page in PDF user-space units (1/72 inch),
/// derived from the page's MediaBox.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct PageSize {
    pub width: f32,
    pub height: f32,
}
63
64pub fn merge_pdfs(pdfs: &[&[u8]]) -> Result<Vec<u8>> {
66 if pdfs.is_empty() {
67 return Err(PdfError::NoInput);
68 }
69
70 let mut documents: Vec<Document> = Vec::with_capacity(pdfs.len());
71 for pdf in pdfs {
72 documents.push(Document::load_mem(pdf)?);
73 }
74
75 let mut merged = Document::with_version("1.7");
76 let mut pages_object_id: Option<ObjectId> = None;
77 let mut all_page_ids: Vec<ObjectId> = Vec::new();
78 let mut max_id = 1;
79
80 for doc in &documents {
81 let mut doc = doc.clone();
83 doc.renumber_objects_with(max_id);
84 max_id = doc.max_id + 1;
85
86 let pages = doc.get_pages();
88 let mut page_ids: Vec<ObjectId> = pages.into_values().collect();
89 page_ids.sort();
90
91 for (id, object) in doc.objects {
93 merged.objects.insert(id, object);
94 }
95
96 if pages_object_id.is_none() {
98 if let Some(catalog_id) = find_pages_id(&merged) {
99 pages_object_id = Some(catalog_id);
100 }
101 }
102
103 all_page_ids.extend(page_ids);
104 }
105
106 if let Some(pid) = pages_object_id {
108 let kids: Vec<Object> = all_page_ids.iter().map(|id| Object::Reference(*id)).collect();
109 let count = kids.len() as i64;
110
111 merged.objects.insert(
112 pid,
113 Object::Dictionary(lopdf::dictionary! {
114 "Type" => "Pages",
115 "Count" => count,
116 "Kids" => kids,
117 }),
118 );
119
120 for page_id in &all_page_ids {
122 if let Ok(Object::Dictionary(ref mut dict)) = merged.objects.get_mut(page_id).ok_or(PdfError::EmptyResult) {
123 dict.set("Parent", Object::Reference(pid));
124 }
125 }
126
127 let catalog_id = merged.new_object_id();
129 merged.objects.insert(
130 catalog_id,
131 Object::Dictionary(lopdf::dictionary! {
132 "Type" => "Catalog",
133 "Pages" => Object::Reference(pid),
134 }),
135 );
136 merged.trailer.set("Root", Object::Reference(catalog_id));
137 }
138
139 let mut buf = Vec::new();
140 merged.save_to(&mut buf)?;
141 Ok(buf)
142}
143
144pub fn split_pdf(pdf: &[u8]) -> Result<Vec<Vec<u8>>> {
146 let doc = Document::load_mem(pdf)?;
147 let page_count = doc.get_pages().len();
148 let mut results = Vec::with_capacity(page_count);
149
150 for i in 0..page_count {
151 results.push(extract_pages(pdf, &[i])?);
152 }
153
154 Ok(results)
155}
156
157pub fn extract_pages(pdf: &[u8], indices: &[usize]) -> Result<Vec<u8>> {
159 if indices.is_empty() {
160 return Err(PdfError::EmptyResult);
161 }
162
163 let doc = Document::load_mem(pdf)?;
164 let pages: BTreeMap<u32, ObjectId> = doc.get_pages();
165 let total = pages.len();
166
167 for &idx in indices {
169 if idx >= total {
170 return Err(PdfError::InvalidPage { index: idx, total });
171 }
172 }
173
174 let keep: Vec<u32> = indices.iter().map(|&i| (i + 1) as u32).collect();
176
177 let mut new_doc = doc.clone();
178 let all_pages: Vec<u32> = pages.keys().copied().collect();
179 let remove: Vec<u32> = all_pages.into_iter().filter(|p| !keep.contains(p)).collect();
180
181 for page_num in remove.into_iter().rev() {
182 new_doc.delete_pages(&[page_num]);
183 }
184
185 let mut buf = Vec::new();
186 new_doc.save_to(&mut buf)?;
187 Ok(buf)
188}
189
190pub fn get_metadata(pdf: &[u8]) -> Result<PdfMetadata> {
192 let doc = Document::load_mem(pdf)?;
193 let pages = doc.get_pages();
194
195 let mut meta = PdfMetadata {
196 page_count: pages.len(),
197 ..Default::default()
198 };
199
200 if let Ok(info_ref) = doc.trailer.get(b"Info") {
202 let info_id = match info_ref {
203 Object::Reference(id) => Some(*id),
204 _ => info_ref.as_reference().ok(),
205 };
206 if let Some(id) = info_id {
207 if let Ok(Object::Dictionary(info)) = doc.get_object(id) {
208 meta.title = get_string_from_dict(info, b"Title");
209 meta.author = get_string_from_dict(info, b"Author");
210 meta.subject = get_string_from_dict(info, b"Subject");
211 meta.creator = get_string_from_dict(info, b"Creator");
212 meta.producer = get_string_from_dict(info, b"Producer");
213 }
214 }
215 }
216
217 for (_, page_id) in &pages {
219 if let Ok(Object::Dictionary(page)) = doc.get_object(*page_id) {
220 if let Ok(mediabox) = page.get(b"MediaBox") {
221 if let Ok(arr) = mediabox.as_array() {
222 if arr.len() >= 4 {
223 let w = arr[2].as_float().or_else(|_| arr[2].as_i64().map(|v| v as f32)).unwrap_or(612.0)
224 - arr[0].as_float().or_else(|_| arr[0].as_i64().map(|v| v as f32)).unwrap_or(0.0);
225 let h = arr[3].as_float().or_else(|_| arr[3].as_i64().map(|v| v as f32)).unwrap_or(792.0)
226 - arr[1].as_float().or_else(|_| arr[1].as_i64().map(|v| v as f32)).unwrap_or(0.0);
227 meta.page_sizes.push(PageSize { width: w, height: h });
228 continue;
229 }
230 }
231 }
232 meta.page_sizes.push(PageSize { width: 612.0, height: 792.0 });
233 }
234 }
235
236 Ok(meta)
237}
238
239fn find_pages_id(doc: &Document) -> Option<ObjectId> {
240 for (id, obj) in &doc.objects {
241 if let Object::Dictionary(dict) = obj {
242 if let Ok(type_val) = dict.get(b"Type") {
243 if type_val.as_name_str().ok() == Some("Pages") {
244 return Some(*id);
245 }
246 }
247 }
248 }
249 None
250}
251
252fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
253 dict.get(key).ok().and_then(|v| match v {
254 Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
255 _ => None,
256 })
257}
258
#[cfg(test)]
mod tests {
    use super::*;

    /// Merging with no inputs must be rejected, not produce an empty PDF.
    #[test]
    fn test_merge_empty_returns_error() {
        assert!(merge_pdfs(&[]).is_err());
    }

    /// Extracting an out-of-range page index must fail.
    #[test]
    fn test_extract_invalid_page_returns_error() {
        // `Document::save_to` takes `&mut self`, so the binding must be
        // mutable (the original `let doc` did not compile).
        let mut doc = Document::with_version("1.7");
        let mut buf = Vec::new();
        doc.save_to(&mut buf).unwrap();

        // The saved document has zero pages, so index 99 is invalid.
        let result = extract_pages(&buf, &[99]);
        assert!(result.is_err());
    }
}
278}