// pivot_pdf/merger.rs
1use std::collections::{HashMap, HashSet};
2use std::io;
3use std::path::Path;
4
5use crate::reader::{PdfReadError, PdfReader};
6
7// ── Public types ───────────────────────────────────────────────────────────────
8
/// Options controlling PDF merge behaviour.
///
/// `MergeOptions::default()` disables all optional behaviour.
#[derive(Debug, Clone, Default)]
pub struct MergeOptions {
    /// Flatten interactive form fields into static page content before merging.
    ///
    /// **Not yet implemented.** Returns `PdfMergeError::NotSupported` if `true`.
    pub flatten_forms: bool,
}
24
/// Errors that can occur during a PDF merge operation.
#[derive(Debug)]
pub enum PdfMergeError {
    /// A feature is requested that is not yet implemented
    /// (currently: `MergeOptions::flatten_forms = true`).
    NotSupported,
    /// An error occurred while reading one of the source PDFs.
    ReadError(PdfReadError),
    /// An I/O error occurred while writing the output file.
    /// Carries the error's display text rather than the `io::Error` itself.
    Io(String),
}
35
36impl std::fmt::Display for PdfMergeError {
37    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38        match self {
39            PdfMergeError::NotSupported => write!(f, "operation not yet supported"),
40            PdfMergeError::ReadError(e) => write!(f, "read error: {}", e),
41            PdfMergeError::Io(msg) => write!(f, "I/O error: {}", msg),
42        }
43    }
44}
45
// Empty impl: the default `source()` (which returns `None`) plus the `Display`
// text above are sufficient for this error type.
impl std::error::Error for PdfMergeError {}
47
48impl From<io::Error> for PdfMergeError {
49    fn from(e: io::Error) -> Self {
50        PdfMergeError::Io(e.to_string())
51    }
52}
53
54// ── Public API ─────────────────────────────────────────────────────────────────
55
56/// Merge one or more PDF files into a single output PDF.
57///
58/// Pages from each source are appended in order. All pages from `inputs[0]`
59/// come first, then all pages from `inputs[1]`, and so on.
60///
61/// # Limitations
62/// - Sources must use traditional xref tables (xref streams / PDF 1.5+ not supported).
63/// - `options.flatten_forms = true` returns `PdfMergeError::NotSupported`.
64pub fn merge_pdfs<P: AsRef<Path>>(
65    inputs: &[P],
66    output: P,
67    options: MergeOptions,
68) -> Result<(), PdfMergeError> {
69    if options.flatten_forms {
70        return Err(PdfMergeError::NotSupported);
71    }
72
73    // ── Phase 1: read sources, collect object closures, assign new IDs ────────
74
75    struct SourceData {
76        reader: PdfReader,
77        page_obj_nums: Vec<u32>,
78        closure: HashSet<u32>,
79        remap: HashMap<u32, u32>,
80    }
81
82    let mut next_id: u32 = 1;
83    let mut sources: Vec<SourceData> = Vec::new();
84
85    for path in inputs {
86        let reader = PdfReader::open(path).map_err(PdfMergeError::ReadError)?;
87        let page_obj_nums = reader
88            .page_object_numbers()
89            .map_err(PdfMergeError::ReadError)?;
90        let closure = reader
91            .collect_closure(&page_obj_nums)
92            .map_err(PdfMergeError::ReadError)?;
93
94        // Assign output IDs in a deterministic order.
95        let mut sorted_objs: Vec<u32> = closure.iter().copied().collect();
96        sorted_objs.sort_unstable();
97
98        let mut remap = HashMap::new();
99        for obj_num in sorted_objs {
100            remap.insert(obj_num, next_id);
101            next_id += 1;
102        }
103
104        sources.push(SourceData {
105            reader,
106            page_obj_nums,
107            closure,
108            remap,
109        });
110    }
111
112    // Reserve IDs for the new merged Pages tree and Catalog.
113    let pages_id = next_id;
114    next_id += 1;
115    let catalog_id = next_id;
116    let max_id = catalog_id;
117
118    // ── Phase 2: build the output PDF ─────────────────────────────────────────
119
120    let mut out = OutputBuilder::new();
121    out.write_header();
122
123    // Copy and renumber all objects from each source.
124    for source in &sources {
125        let mut by_new_id: Vec<(u32, u32)> = source
126            .closure
127            .iter()
128            .map(|&orig| (source.remap[&orig], orig))
129            .collect();
130        by_new_id.sort_unstable_by_key(|(new_id, _)| *new_id);
131
132        for (new_id, orig_num) in by_new_id {
133            let raw = source
134                .reader
135                .raw_object_bytes(orig_num)
136                .map_err(PdfMergeError::ReadError)?;
137
138            let renumbered = renumber_object_bytes(raw, &source.remap);
139            out.write_raw_object(new_id, &renumbered);
140        }
141    }
142
143    // Write the merged Pages tree — lists all source pages in document order.
144    let all_page_ids: Vec<u32> = sources
145        .iter()
146        .flat_map(|s| s.page_obj_nums.iter().map(|n| s.remap[n]))
147        .collect();
148
149    let total_pages = all_page_ids.len();
150    let kids: String = all_page_ids
151        .iter()
152        .map(|id| format!("{} 0 R", id))
153        .collect::<Vec<_>>()
154        .join(" ");
155
156    out.write_object_str(
157        pages_id,
158        &format!("<< /Type /Pages /Count {} /Kids [{}] >>", total_pages, kids),
159    );
160
161    // Write the Catalog.
162    out.write_object_str(
163        catalog_id,
164        &format!("<< /Type /Catalog /Pages {} 0 R >>", pages_id),
165    );
166
167    out.write_xref_and_trailer(max_id, catalog_id);
168
169    std::fs::write(output, out.into_bytes())?;
170
171    Ok(())
172}
173
// ── Object renumbering ─────────────────────────────────────────────────────────

/// Rewrite all indirect references and the object header in `bytes` using `remap`.
///
/// Scans for `N G R` (indirect reference) and `N G obj` (object header) patterns
/// at word boundaries. When `N` appears in `remap`, replaces it with the mapped
/// value; the generation number is always rewritten to 0, since the merged
/// output only ever contains generation-0 objects.
///
/// Stream bodies (between `stream` and `endstream`) are copied verbatim to avoid
/// corrupting binary-compressed content. A truncated stream (no `endstream`
/// before end of input) is copied through to the end instead of panicking.
fn renumber_object_bytes(bytes: &[u8], remap: &HashMap<u32, u32>) -> Vec<u8> {
    let mut out = Vec::with_capacity(bytes.len() + 16);
    let mut i = 0;

    while i < bytes.len() {
        let at_boundary = i == 0 || is_pdf_delim(bytes[i - 1]);

        // Copy stream bodies verbatim — they may contain compressed binary data
        // that could accidentally match reference patterns.
        if at_boundary && bytes[i..].starts_with(b"stream") {
            // Verify it's the keyword, not e.g. "streaming".
            let after = i + 6;
            if after >= bytes.len() || is_pdf_delim(bytes[after]) {
                // Locate the stream body start (after the mandatory line ending).
                let body_start = skip_stream_newline(&bytes[after..])
                    .map(|n| after + n)
                    .unwrap_or(after);

                // Find the end of "endstream" from the body start. On a
                // truncated stream, fall back to the end of input; the `min`
                // keeps the slice in bounds (the previous code could panic
                // with an out-of-bounds slice of `bytes.len() + 9` here).
                let copy_end = bytes[body_start..]
                    .windows(9)
                    .position(|w| w == b"endstream")
                    .map(|p| body_start + p + 9)
                    .unwrap_or(bytes.len())
                    .min(bytes.len());

                // Copy "stream\n...binary content...endstream" verbatim.
                out.extend_from_slice(&bytes[i..copy_end]);
                i = copy_end;
                continue;
            }
        }

        // At word boundaries, try to match and renumber `N G R` or `N G obj`.
        if at_boundary && bytes[i].is_ascii_digit() {
            if let Some((n, consumed, keyword)) = parse_ngr(&bytes[i..]) {
                // Unmapped numbers pass through unchanged (gen still reset to 0).
                let new_n = remap.get(&n).copied().unwrap_or(n);
                out.extend_from_slice(format!("{} 0 {}", new_n, keyword).as_bytes());
                i += consumed;
                continue;
            }
        }

        out.push(bytes[i]);
        i += 1;
    }

    out
}

/// Returns the number of bytes consumed by the line ending after `stream`.
///
/// The PDF spec requires `\n` or `\r\n`; a lone `\r` is accepted leniently
/// since some writers emit it.
fn skip_stream_newline(data: &[u8]) -> Option<usize> {
    match data.first()? {
        b'\n' => Some(1),
        b'\r' => Some(if data.get(1) == Some(&b'\n') { 2 } else { 1 }),
        _ => None,
    }
}

/// Attempt to parse `N G R` or `N G obj` at the start of `data`.
///
/// Returns `(object_number, bytes_consumed, keyword)` on success. `N` must fit
/// in a `u32`; otherwise the candidate is rejected and the caller copies the
/// text through unchanged.
fn parse_ngr(data: &[u8]) -> Option<(u32, usize, &'static str)> {
    let mut i = 0;

    // Parse N (object number — one or more ASCII digits).
    let n_start = i;
    while i < data.len() && data[i].is_ascii_digit() {
        i += 1;
    }
    if i == n_start {
        return None;
    }
    let n: u32 = std::str::from_utf8(&data[n_start..i]).ok()?.parse().ok()?;

    // Require whitespace between N and G.
    if i >= data.len() || !data[i].is_ascii_whitespace() {
        return None;
    }
    while i < data.len() && data[i].is_ascii_whitespace() {
        i += 1;
    }

    // Parse G (generation number — one or more ASCII digits). The value is
    // never used: the rewriter always emits generation 0.
    let g_start = i;
    while i < data.len() && data[i].is_ascii_digit() {
        i += 1;
    }
    if i == g_start {
        return None;
    }

    // Require whitespace between G and the keyword.
    if i >= data.len() || !data[i].is_ascii_whitespace() {
        return None;
    }
    while i < data.len() && data[i].is_ascii_whitespace() {
        i += 1;
    }

    // Match keyword at a word boundary.
    if data[i..].starts_with(b"R") {
        let after = i + 1;
        if after >= data.len() || is_pdf_delim(data[after]) {
            return Some((n, after, "R"));
        }
    } else if data[i..].starts_with(b"obj") {
        let after = i + 3;
        if after >= data.len() || is_pdf_delim(data[after]) {
            return Some((n, after, "obj"));
        }
    }

    None
}

/// True for PDF whitespace or delimiter characters (ISO 32000-1 §7.2.2).
///
/// `is_ascii_whitespace` covers space, tab, LF, FF and CR; PDF whitespace also
/// includes NUL, added explicitly. The delimiter set includes `{` and `}`
/// (used in type 4 function streams), which the previous version omitted.
fn is_pdf_delim(b: u8) -> bool {
    b.is_ascii_whitespace()
        || matches!(
            b,
            b'\0' | b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
        )
}
309
310// ── Output builder ─────────────────────────────────────────────────────────────
311
/// Accumulates the output PDF in memory, recording the byte offset of every
/// object so the cross-reference table can be emitted at the end.
struct OutputBuilder {
    buf: Vec<u8>,
    offsets: HashMap<u32, usize>,
}

impl OutputBuilder {
    fn new() -> Self {
        Self {
            buf: Vec::new(),
            offsets: HashMap::new(),
        }
    }

    /// Emit the version line followed by the conventional binary-marker
    /// comment (4 high bytes signal binary content to transfer tools).
    fn write_header(&mut self) {
        self.buf.extend_from_slice(b"%PDF-1.7\n");
        self.buf.extend_from_slice(b"%\xe2\xe3\xcf\xd3\n");
    }

    /// Append a pre-renumbered object (bytes already contain the correct `N 0 obj…endobj`).
    fn write_raw_object(&mut self, new_id: u32, bytes: &[u8]) {
        self.offsets.insert(new_id, self.buf.len());
        self.buf.extend_from_slice(bytes);
        // Guarantee a newline separates consecutive objects.
        match self.buf.last() {
            Some(b'\n') => {}
            _ => self.buf.push(b'\n'),
        }
    }

    /// Write a new object whose body is supplied as a string (dictionary literal).
    fn write_object_str(&mut self, id: u32, body: &str) {
        self.offsets.insert(id, self.buf.len());
        let object = format!("{} 0 obj\n{}\nendobj\n", id, body);
        self.buf.extend_from_slice(object.as_bytes());
    }

    /// Write the xref table and trailer, then `startxref` and `%%EOF`.
    fn write_xref_and_trailer(&mut self, max_id: u32, catalog_id: u32) {
        let xref_offset = self.buf.len();
        let total = max_id + 1; // entries cover objects 0..=max_id

        // Assemble the whole tail as text, then append once. Each xref entry
        // is the fixed 20-byte form: 10-digit offset, 5-digit gen, type, CRLF.
        let mut tail = String::new();
        tail.push_str("xref\n");
        tail.push_str(&format!("0 {}\n", total));
        tail.push_str("0000000000 65535 f\r\n"); // object 0: free-list head
        for obj_id in 1..=max_id {
            let offset = self.offsets.get(&obj_id).copied().unwrap_or(0);
            tail.push_str(&format!("{:010} 00000 n\r\n", offset));
        }
        tail.push_str("trailer\n");
        tail.push_str(&format!("<< /Size {} /Root {} 0 R >>\n", total, catalog_id));
        tail.push_str(&format!("startxref\n{}\n%%EOF\n", xref_offset));

        self.buf.extend_from_slice(tail.as_bytes());
    }

    fn into_bytes(self) -> Vec<u8> {
        self.buf
    }
}
380
381// ── Internal unit tests ────────────────────────────────────────────────────────
382
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::{DocumentOptions, PdfDocument};

    /// Build an in-memory PDF with `n` 612.0 × 792.0 pt (US Letter) pages
    /// using the crate's own writer.
    fn make_pdf(n: usize) -> Vec<u8> {
        let mut doc = PdfDocument::new(Vec::new(), DocumentOptions::default()).unwrap();
        for _ in 0..n {
            doc.begin_page(612.0, 792.0);
            doc.end_page().unwrap();
        }
        doc.end_document().unwrap()
    }

    // ── renumber_object_bytes ─────────────────────────────────────────────────

    #[test]
    fn renumber_replaces_obj_header() {
        // The `5 0 obj` header itself must be rewritten, not just body refs.
        let input = b"5 0 obj\n<< /Type /Page >>\nendobj";
        let mut remap = HashMap::new();
        remap.insert(5u32, 3u32);
        let out = renumber_object_bytes(input, &remap);
        assert!(
            out.starts_with(b"3 0 obj"),
            "header not renumbered: {:?}",
            &out[..20]
        );
    }

    #[test]
    fn renumber_replaces_indirect_references() {
        // Both `N 0 R` references inside the dictionary must follow the remap.
        let input = b"5 0 obj\n<< /Parent 2 0 R /Contents 6 0 R >>\nendobj";
        let mut remap = HashMap::new();
        remap.insert(5u32, 10u32);
        remap.insert(2u32, 20u32);
        remap.insert(6u32, 60u32);
        let out = renumber_object_bytes(input, &remap);
        let s = std::str::from_utf8(&out).unwrap();
        assert!(s.contains("20 0 R"), "Parent ref not renumbered: {}", s);
        assert!(s.contains("60 0 R"), "Contents ref not renumbered: {}", s);
    }

    #[test]
    fn renumber_does_not_corrupt_stream_body() {
        // A stream body with bytes that look like "2 0 R" must not be rewritten.
        // NOTE: the /Length value is never consulted by renumber_object_bytes
        // (it scans for the `endstream` keyword), so its accuracy is irrelevant.
        let stream_body = b"2 0 R this looks like a ref but is compressed content";
        let input = {
            let mut v = b"7 0 obj\n<< /Length 51 >>\nstream\n".to_vec();
            v.extend_from_slice(stream_body);
            v.extend_from_slice(b"\nendstream\nendobj");
            v
        };
        let mut remap = HashMap::new();
        remap.insert(7u32, 1u32);
        // 2 is remapped — the dictionary would change, the stream body must not.
        remap.insert(2u32, 99u32);
        let out = renumber_object_bytes(&input, &remap);
        let s = std::str::from_utf8(&out).unwrap();
        // The stream body must be preserved verbatim.
        assert!(
            s.contains("2 0 R this looks like"),
            "stream body was incorrectly renumbered: {}",
            s
        );
    }

    #[test]
    fn renumber_preserves_unmapped_refs() {
        // References whose object number is not in remap should pass through unchanged.
        let input = b"5 0 obj\n<< /Font 99 0 R >>\nendobj";
        let mut remap = HashMap::new();
        remap.insert(5u32, 1u32);
        let out = renumber_object_bytes(input, &remap);
        let s = std::str::from_utf8(&out).unwrap();
        assert!(s.contains("99 0 R"), "unmapped ref was changed: {}", s);
    }

    // ── parse_ngr ─────────────────────────────────────────────────────────────

    #[test]
    fn parse_ngr_matches_reference() {
        let (n, _, kw) = parse_ngr(b"5 0 R ").unwrap();
        assert_eq!(n, 5);
        assert_eq!(kw, "R");
    }

    #[test]
    fn parse_ngr_matches_obj_header() {
        let (n, _, kw) = parse_ngr(b"10 0 obj\n").unwrap();
        assert_eq!(n, 10);
        assert_eq!(kw, "obj");
    }

    #[test]
    fn parse_ngr_rejects_partial_match() {
        // "5 0 Refer" — "R" not at a word boundary
        assert!(parse_ngr(b"5 0 Refer").is_none());
    }

    // ── merge_pdfs round-trip (internal) ──────────────────────────────────────

    #[test]
    fn merge_two_pdfs_round_trip() {
        // A 1-page and a 2-page input must produce a 3-page output.
        let a_bytes = make_pdf(1);
        let b_bytes = make_pdf(2);

        // NOTE(review): fixed file names under temp_dir() can collide if this
        // test runs concurrently in separate processes — consider unique names.
        let dir = std::env::temp_dir();
        let a_path = dir.join("merge_internal_a.pdf");
        let b_path = dir.join("merge_internal_b.pdf");
        let out_path = dir.join("merge_internal_out.pdf");

        std::fs::write(&a_path, &a_bytes).unwrap();
        std::fs::write(&b_path, &b_bytes).unwrap();

        merge_pdfs(&[&a_path, &b_path], &out_path, MergeOptions::default()).unwrap();

        let reader = crate::reader::PdfReader::open(&out_path).unwrap();
        assert_eq!(reader.page_count(), 3);
    }
}
501}