1use std::collections::{HashMap, HashSet};
2use std::io;
3use std::path::Path;
4
5use crate::reader::{PdfReadError, PdfReader};
6
/// Options controlling how [`merge_pdfs`] combines its inputs.
///
/// The derived `Default` leaves every option switched off.
#[derive(Default)]
pub struct MergeOptions {
    /// Request form flattening. `merge_pdfs` currently rejects this with
    /// [`PdfMergeError::NotSupported`].
    pub flatten_forms: bool,
}
24
25#[derive(Debug)]
27pub enum PdfMergeError {
28 NotSupported,
30 ReadError(PdfReadError),
32 Io(String),
34}
35
36impl std::fmt::Display for PdfMergeError {
37 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38 match self {
39 PdfMergeError::NotSupported => write!(f, "operation not yet supported"),
40 PdfMergeError::ReadError(e) => write!(f, "read error: {}", e),
41 PdfMergeError::Io(msg) => write!(f, "I/O error: {}", msg),
42 }
43 }
44}
45
46impl std::error::Error for PdfMergeError {}
47
48impl From<io::Error> for PdfMergeError {
49 fn from(e: io::Error) -> Self {
50 PdfMergeError::Io(e.to_string())
51 }
52}
53
54pub fn merge_pdfs<P: AsRef<Path>>(
65 inputs: &[P],
66 output: P,
67 options: MergeOptions,
68) -> Result<(), PdfMergeError> {
69 if options.flatten_forms {
70 return Err(PdfMergeError::NotSupported);
71 }
72
73 struct SourceData {
76 reader: PdfReader,
77 page_obj_nums: Vec<u32>,
78 closure: HashSet<u32>,
79 remap: HashMap<u32, u32>,
80 }
81
82 let mut next_id: u32 = 1;
83 let mut sources: Vec<SourceData> = Vec::new();
84
85 for path in inputs {
86 let reader = PdfReader::open(path).map_err(PdfMergeError::ReadError)?;
87 let page_obj_nums = reader
88 .page_object_numbers()
89 .map_err(PdfMergeError::ReadError)?;
90 let closure = reader
91 .collect_closure(&page_obj_nums)
92 .map_err(PdfMergeError::ReadError)?;
93
94 let mut sorted_objs: Vec<u32> = closure.iter().copied().collect();
96 sorted_objs.sort_unstable();
97
98 let mut remap = HashMap::new();
99 for obj_num in sorted_objs {
100 remap.insert(obj_num, next_id);
101 next_id += 1;
102 }
103
104 sources.push(SourceData {
105 reader,
106 page_obj_nums,
107 closure,
108 remap,
109 });
110 }
111
112 let pages_id = next_id;
114 next_id += 1;
115 let catalog_id = next_id;
116 let max_id = catalog_id;
117
118 let mut out = OutputBuilder::new();
121 out.write_header();
122
123 for source in &sources {
125 let mut by_new_id: Vec<(u32, u32)> = source
126 .closure
127 .iter()
128 .map(|&orig| (source.remap[&orig], orig))
129 .collect();
130 by_new_id.sort_unstable_by_key(|(new_id, _)| *new_id);
131
132 for (new_id, orig_num) in by_new_id {
133 let raw = source
134 .reader
135 .raw_object_bytes(orig_num)
136 .map_err(PdfMergeError::ReadError)?;
137
138 let renumbered = renumber_object_bytes(raw, &source.remap);
139 out.write_raw_object(new_id, &renumbered);
140 }
141 }
142
143 let all_page_ids: Vec<u32> = sources
145 .iter()
146 .flat_map(|s| s.page_obj_nums.iter().map(|n| s.remap[n]))
147 .collect();
148
149 let total_pages = all_page_ids.len();
150 let kids: String = all_page_ids
151 .iter()
152 .map(|id| format!("{} 0 R", id))
153 .collect::<Vec<_>>()
154 .join(" ");
155
156 out.write_object_str(
157 pages_id,
158 &format!("<< /Type /Pages /Count {} /Kids [{}] >>", total_pages, kids),
159 );
160
161 out.write_object_str(
163 catalog_id,
164 &format!("<< /Type /Catalog /Pages {} 0 R >>", pages_id),
165 );
166
167 out.write_xref_and_trailer(max_id, catalog_id);
168
169 std::fs::write(output, out.into_bytes())?;
170
171 Ok(())
172}
173
174fn renumber_object_bytes(bytes: &[u8], remap: &HashMap<u32, u32>) -> Vec<u8> {
185 let mut out = Vec::with_capacity(bytes.len() + 16);
186 let mut i = 0;
187
188 while i < bytes.len() {
189 let at_boundary = i == 0 || is_pdf_delim(bytes[i - 1]);
192 if at_boundary && bytes[i..].starts_with(b"stream") {
193 let after = i + 6;
195 if after >= bytes.len() || is_pdf_delim(bytes[after]) {
196 let body_start = skip_stream_newline(&bytes[after..])
198 .map(|n| after + n)
199 .unwrap_or(after);
200
201 let endstream_pos = bytes[body_start..]
203 .windows(9)
204 .position(|w| w == b"endstream")
205 .map(|p| body_start + p)
206 .unwrap_or(bytes.len());
207
208 out.extend_from_slice(&bytes[i..endstream_pos + 9]);
210 i = endstream_pos + 9;
211 continue;
212 }
213 }
214
215 if at_boundary && i < bytes.len() && bytes[i].is_ascii_digit() {
217 if let Some((n, consumed, keyword)) = parse_ngr(&bytes[i..]) {
218 let new_n = remap.get(&n).copied().unwrap_or(n);
219 out.extend_from_slice(format!("{} 0 {}", new_n, keyword).as_bytes());
220 i += consumed;
221 continue;
222 }
223 }
224
225 out.push(bytes[i]);
226 i += 1;
227 }
228
229 out
230}
231
/// Returns the length of the line ending at the start of `data`.
///
/// Yields `Some(2)` for `\r\n`, `Some(1)` for a lone `\n` or a lone `\r`,
/// and `None` when `data` is empty or begins with any other byte.
fn skip_stream_newline(data: &[u8]) -> Option<usize> {
    match data {
        [b'\r', b'\n', ..] => Some(2),
        [b'\n', ..] | [b'\r', ..] => Some(1),
        _ => None,
    }
}
247
248fn parse_ngr(data: &[u8]) -> Option<(u32, usize, &'static str)> {
252 let mut i = 0;
253
254 let n_start = i;
256 while i < data.len() && data[i].is_ascii_digit() {
257 i += 1;
258 }
259 if i == n_start {
260 return None;
261 }
262 let n: u32 = std::str::from_utf8(&data[n_start..i]).ok()?.parse().ok()?;
263
264 if i >= data.len() || !data[i].is_ascii_whitespace() {
266 return None;
267 }
268 while i < data.len() && data[i].is_ascii_whitespace() {
269 i += 1;
270 }
271
272 let g_start = i;
274 while i < data.len() && data[i].is_ascii_digit() {
275 i += 1;
276 }
277 if i == g_start {
278 return None;
279 }
280
281 if i >= data.len() || !data[i].is_ascii_whitespace() {
283 return None;
284 }
285 while i < data.len() && data[i].is_ascii_whitespace() {
286 i += 1;
287 }
288
289 if data[i..].starts_with(b"R") {
291 let after = i + 1;
292 if after >= data.len() || is_pdf_delim(data[after]) {
293 return Some((n, after, "R"));
294 }
295 } else if data[i..].starts_with(b"obj") {
296 let after = i + 3;
297 if after >= data.len() || is_pdf_delim(data[after]) {
298 return Some((n, after, "obj"));
299 }
300 }
301
302 None
303}
304
/// Reports whether `b` is a PDF white-space or delimiter character, i.e. a
/// byte that terminates the token preceding it.
///
/// Covers the six white-space characters (NUL, HT, LF, FF, CR, SP) and the
/// ten delimiter characters `( ) < > [ ] { } / %`.
fn is_pdf_delim(b: u8) -> bool {
    matches!(
        b,
        // White-space: NUL, HT, LF, FF, CR, SP.
        0x00 | b'\t' | b'\n' | 0x0C | b'\r' | b' '
        // Delimiters.
        | b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
    )
}
309
/// Accumulates the bytes of an output PDF and records where each object
/// starts so the cross-reference table can be produced at the end.
struct OutputBuilder {
    /// File contents produced so far.
    buf: Vec<u8>,
    /// Byte offset in `buf` at which each object number was written.
    offsets: HashMap<u32, usize>,
}

impl OutputBuilder {
    /// Creates an empty builder.
    fn new() -> Self {
        Self {
            buf: Vec::new(),
            offsets: HashMap::new(),
        }
    }

    /// Appends the `%PDF-1.7` header line followed by a comment line made of
    /// four bytes above 0x7F.
    fn write_header(&mut self) {
        self.buf.extend_from_slice(b"%PDF-1.7\n");
        self.buf.extend_from_slice(b"%\xe2\xe3\xcf\xd3\n");
    }

    /// Appends `bytes` unchanged as object `new_id`, recording its offset.
    ///
    /// `bytes` is written as-is (no `obj`/`endobj` wrapper is added); a
    /// trailing `\n` is appended only if the buffer does not already end
    /// with one.
    fn write_raw_object(&mut self, new_id: u32, bytes: &[u8]) {
        self.offsets.insert(new_id, self.buf.len());
        self.buf.extend_from_slice(bytes);
        if !self.buf.ends_with(b"\n") {
            self.buf.push(b'\n');
        }
    }

    /// Appends `body` wrapped as `<id> 0 obj … endobj`, recording its offset.
    fn write_object_str(&mut self, id: u32, body: &str) {
        self.offsets.insert(id, self.buf.len());
        self.buf
            .extend_from_slice(format!("{} 0 obj\n{}\nendobj\n", id, body).as_bytes());
    }

    /// Appends the xref table for objects `0..=max_id`, the trailer naming
    /// `catalog_id` as `/Root`, the `startxref` pointer and `%%EOF`.
    ///
    /// Each xref entry is 20 bytes long including its `\r\n` terminator.
    /// Object numbers that were never written are emitted as free entries
    /// rather than as in-use objects at offset 0.
    fn write_xref_and_trailer(&mut self, max_id: u32, catalog_id: u32) {
        const FREE_ENTRY: &[u8] = b"0000000000 65535 f\r\n";

        let xref_offset = self.buf.len();
        let total = max_id + 1;

        self.buf.extend_from_slice(b"xref\n");
        self.buf
            .extend_from_slice(format!("0 {}\n", total).as_bytes());

        // Object 0 is always the free entry heading the table.
        self.buf.extend_from_slice(FREE_ENTRY);

        for obj_id in 1..=max_id {
            match self.offsets.get(&obj_id).copied() {
                Some(offset) => self
                    .buf
                    .extend_from_slice(format!("{:010} 00000 n\r\n", offset).as_bytes()),
                // No bytes were written for this number: it must not point
                // at offset 0 as if it were a live object.
                None => self.buf.extend_from_slice(FREE_ENTRY),
            }
        }

        self.buf.extend_from_slice(b"trailer\n");
        self.buf.extend_from_slice(
            format!("<< /Size {} /Root {} 0 R >>\n", total, catalog_id).as_bytes(),
        );
        self.buf.extend_from_slice(b"startxref\n");
        self.buf
            .extend_from_slice(format!("{}\n", xref_offset).as_bytes());
        self.buf.extend_from_slice(b"%%EOF\n");
    }

    /// Consumes the builder and returns the accumulated bytes.
    fn into_bytes(self) -> Vec<u8> {
        self.buf
    }
}
380
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::{DocumentOptions, PdfDocument};

    /// Builds an in-memory PDF containing `n` empty 612 x 792 pages.
    fn make_pdf(n: usize) -> Vec<u8> {
        let mut doc = PdfDocument::new(Vec::new(), DocumentOptions::default()).unwrap();
        for _ in 0..n {
            doc.begin_page(612.0, 792.0);
            doc.end_page().unwrap();
        }
        doc.end_document().unwrap()
    }

    /// A path in the system temp directory that is deleted when dropped.
    ///
    /// The process id is part of the file name so concurrent test runs do not
    /// overwrite each other's files.
    struct TempPdf(std::path::PathBuf);

    impl TempPdf {
        fn new(tag: &str) -> Self {
            let name = format!("merge_internal_{}_{}.pdf", tag, std::process::id());
            TempPdf(std::env::temp_dir().join(name))
        }
    }

    impl Drop for TempPdf {
        fn drop(&mut self) {
            // Best effort: the file may never have been created.
            let _ = std::fs::remove_file(&self.0);
        }
    }

    #[test]
    fn renumber_replaces_obj_header() {
        let input = b"5 0 obj\n<< /Type /Page >>\nendobj";
        let mut remap = HashMap::new();
        remap.insert(5u32, 3u32);
        let out = renumber_object_bytes(input, &remap);
        assert!(
            out.starts_with(b"3 0 obj"),
            "header not renumbered: {:?}",
            String::from_utf8_lossy(&out)
        );
    }

    #[test]
    fn renumber_replaces_indirect_references() {
        let input = b"5 0 obj\n<< /Parent 2 0 R /Contents 6 0 R >>\nendobj";
        let mut remap = HashMap::new();
        remap.insert(5u32, 10u32);
        remap.insert(2u32, 20u32);
        remap.insert(6u32, 60u32);
        let out = renumber_object_bytes(input, &remap);
        let s = std::str::from_utf8(&out).unwrap();
        assert!(s.contains("20 0 R"), "Parent ref not renumbered: {}", s);
        assert!(s.contains("60 0 R"), "Contents ref not renumbered: {}", s);
    }

    #[test]
    fn renumber_does_not_corrupt_stream_body() {
        let stream_body = b"2 0 R this looks like a ref but is compressed content";
        // Derive /Length from the payload so the fixture stays self-consistent.
        let mut input = format!(
            "7 0 obj\n<< /Length {} >>\nstream\n",
            stream_body.len()
        )
        .into_bytes();
        input.extend_from_slice(stream_body);
        input.extend_from_slice(b"\nendstream\nendobj");

        let mut remap = HashMap::new();
        remap.insert(7u32, 1u32);
        remap.insert(2u32, 99u32);
        let out = renumber_object_bytes(&input, &remap);
        let s = std::str::from_utf8(&out).unwrap();
        assert!(
            s.contains("2 0 R this looks like"),
            "stream body was incorrectly renumbered: {}",
            s
        );
    }

    #[test]
    fn renumber_preserves_unmapped_refs() {
        let input = b"5 0 obj\n<< /Font 99 0 R >>\nendobj";
        let mut remap = HashMap::new();
        remap.insert(5u32, 1u32);
        let out = renumber_object_bytes(input, &remap);
        let s = std::str::from_utf8(&out).unwrap();
        assert!(s.contains("99 0 R"), "unmapped ref was changed: {}", s);
    }

    #[test]
    fn parse_ngr_matches_reference() {
        let (n, _, kw) = parse_ngr(b"5 0 R ").unwrap();
        assert_eq!(n, 5);
        assert_eq!(kw, "R");
    }

    #[test]
    fn parse_ngr_matches_obj_header() {
        let (n, _, kw) = parse_ngr(b"10 0 obj\n").unwrap();
        assert_eq!(n, 10);
        assert_eq!(kw, "obj");
    }

    #[test]
    fn parse_ngr_rejects_partial_match() {
        assert!(parse_ngr(b"5 0 Refer").is_none());
    }

    #[test]
    fn merge_two_pdfs_round_trip() {
        // Guards are declared before the reader so the reader is dropped
        // (and its file released) before the files are removed.
        let a = TempPdf::new("a");
        let b = TempPdf::new("b");
        let merged = TempPdf::new("out");

        std::fs::write(&a.0, make_pdf(1)).unwrap();
        std::fs::write(&b.0, make_pdf(2)).unwrap();

        merge_pdfs(&[&a.0, &b.0], &merged.0, MergeOptions::default()).unwrap();

        let reader = crate::reader::PdfReader::open(&merged.0).unwrap();
        assert_eq!(reader.page_count(), 3);
    }
}