Skip to main content

justpdf_core/writer/
modify.rs

1//! Document modification: load existing PDF, modify, and save.
2//! Also provides page merge/split operations.
3
4use std::collections::HashMap;
5use std::path::Path;
6
7use crate::error::Result;
8use crate::object::{IndirectRef, PdfDict, PdfObject};
9use crate::page::{collect_pages, PageInfo};
10use crate::parser::PdfDocument;
11use crate::writer::page::PageBuilder;
12use crate::writer::serialize::serialize_pdf;
13use crate::writer::PdfWriter;
14
15/// Modifier for existing PDF documents.
16/// Loads all objects from a PdfDocument, allows modification, then saves.
17pub struct DocumentModifier {
18    writer: PdfWriter,
19    catalog_ref: IndirectRef,
20    info_ref: Option<IndirectRef>,
21}
22
23impl DocumentModifier {
24    /// Create a modifier from an existing PdfDocument.
25    /// Copies all objects from the document into the writer.
26    pub fn from_document(doc: &PdfDocument) -> Result<Self> {
27        let mut writer = PdfWriter::new();
28        writer.version = doc.version;
29
30        // Find catalog reference
31        let catalog_ref = doc
32            .catalog_ref()
33            .cloned()
34            .unwrap_or(IndirectRef {
35                obj_num: 1,
36                gen_num: 0,
37            });
38
39        // Find info reference from trailer
40        let info_ref = doc
41            .trailer()
42            .get_ref(b"Info")
43            .cloned();
44
45        // Copy all objects
46        let mut max_obj = 0u32;
47        let refs: Vec<IndirectRef> = doc.object_refs().collect();
48        for iref in &refs {
49            if let Ok(obj) = doc.resolve(iref) {
50                writer.objects.push((iref.obj_num, obj));
51                max_obj = max_obj.max(iref.obj_num);
52            }
53        }
54        writer.next_obj_num = max_obj + 1;
55
56        Ok(Self {
57            writer,
58            catalog_ref,
59            info_ref,
60        })
61    }
62
63    /// Get a reference to the internal writer for low-level modifications.
64    pub fn writer(&mut self) -> &mut PdfWriter {
65        &mut self.writer
66    }
67
68    /// Get the catalog reference.
69    pub fn catalog_ref(&self) -> &IndirectRef {
70        &self.catalog_ref
71    }
72
73    /// Replace an object at a given object number.
74    pub fn set_object(&mut self, obj_num: u32, obj: PdfObject) {
75        self.writer.set_object(obj_num, obj);
76    }
77
78    /// Add a new object and return its reference.
79    pub fn add_object(&mut self, obj: PdfObject) -> IndirectRef {
80        self.writer.add_object(obj)
81    }
82
83    /// Find an object by object number (public accessor).
84    pub fn find_object_pub(&self, obj_num: u32) -> Option<&PdfObject> {
85        self.find_object(obj_num)
86    }
87
88    /// Delete a page by index (0-based).
89    /// Modifies the Pages tree to remove the page reference.
90    pub fn delete_page(&mut self, page_index: usize) -> Result<()> {
91        let pages_ref = self.find_pages_ref()?;
92        let pages_obj_num = pages_ref.obj_num;
93
94        // Find the Pages dict
95        let pages_obj = self.find_object(pages_obj_num)
96            .cloned()
97            .unwrap_or(PdfObject::Null);
98
99        if let PdfObject::Dict(mut pages_dict) = pages_obj {
100            if let Some(PdfObject::Array(mut kids)) = pages_dict.remove(b"Kids") {
101                if page_index < kids.len() {
102                    kids.remove(page_index);
103                    let count = kids.len() as i64;
104                    pages_dict.insert(b"Kids".to_vec(), PdfObject::Array(kids));
105                    pages_dict.insert(b"Count".to_vec(), PdfObject::Integer(count));
106                    self.writer.set_object(pages_obj_num, PdfObject::Dict(pages_dict));
107                }
108            }
109        }
110
111        Ok(())
112    }
113
114    /// Insert a new page at the given index.
115    pub fn insert_page(&mut self, page_index: usize, page: PageBuilder) -> Result<()> {
116        let pages_ref = self.find_pages_ref()?;
117        let pages_obj_num = pages_ref.obj_num;
118
119        let page_ref = page.build(&mut self.writer, &pages_ref);
120
121        let pages_obj = self.find_object(pages_obj_num)
122            .cloned()
123            .unwrap_or(PdfObject::Null);
124
125        if let PdfObject::Dict(mut pages_dict) = pages_obj {
126            if let Some(PdfObject::Array(mut kids)) = pages_dict.remove(b"Kids") {
127                let idx = page_index.min(kids.len());
128                kids.insert(idx, PdfObject::Reference(page_ref));
129                let count = kids.len() as i64;
130                pages_dict.insert(b"Kids".to_vec(), PdfObject::Array(kids));
131                pages_dict.insert(b"Count".to_vec(), PdfObject::Integer(count));
132                self.writer.set_object(pages_obj_num, PdfObject::Dict(pages_dict));
133            }
134        }
135
136        Ok(())
137    }
138
139    /// Reorder pages. `order` is a list of 0-based page indices in the desired order.
140    pub fn reorder_pages(&mut self, order: &[usize]) -> Result<()> {
141        let pages_ref = self.find_pages_ref()?;
142        let pages_obj_num = pages_ref.obj_num;
143
144        let pages_obj = self.find_object(pages_obj_num)
145            .cloned()
146            .unwrap_or(PdfObject::Null);
147
148        if let PdfObject::Dict(mut pages_dict) = pages_obj {
149            if let Some(PdfObject::Array(kids)) = pages_dict.remove(b"Kids") {
150                let mut new_kids = Vec::with_capacity(order.len());
151                for &idx in order {
152                    if idx < kids.len() {
153                        new_kids.push(kids[idx].clone());
154                    }
155                }
156                let count = new_kids.len() as i64;
157                pages_dict.insert(b"Kids".to_vec(), PdfObject::Array(new_kids));
158                pages_dict.insert(b"Count".to_vec(), PdfObject::Integer(count));
159                self.writer.set_object(pages_obj_num, PdfObject::Dict(pages_dict));
160            }
161        }
162
163        Ok(())
164    }
165
166    /// Set or update a metadata field in the Info dictionary.
167    pub fn set_info(&mut self, key: &[u8], value: &str) {
168        let info_num = if let Some(ref r) = self.info_ref {
169            r.obj_num
170        } else {
171            let num = self.writer.alloc_object_num();
172            self.info_ref = Some(IndirectRef {
173                obj_num: num,
174                gen_num: 0,
175            });
176            num
177        };
178
179        // Get or create info dict
180        let info_obj = self.find_object(info_num)
181            .cloned()
182            .unwrap_or(PdfObject::Dict(PdfDict::new()));
183
184        if let PdfObject::Dict(mut info_dict) = info_obj {
185            info_dict.insert(
186                key.to_vec(),
187                PdfObject::String(value.as_bytes().to_vec()),
188            );
189            self.writer.set_object(info_num, PdfObject::Dict(info_dict));
190        }
191    }
192
193    /// Perform garbage collection: remove unreachable objects.
194    ///
195    /// Traverses all objects reachable from the catalog (and info dict),
196    /// then removes any objects that are not reachable.
197    pub fn garbage_collect(&mut self) {
198        let mut reachable = std::collections::HashSet::new();
199
200        // Mark catalog and info as roots
201        reachable.insert(self.catalog_ref.obj_num);
202        if let Some(ref info) = self.info_ref {
203            reachable.insert(info.obj_num);
204        }
205
206        // Iteratively mark all reachable objects
207        let mut work: Vec<u32> = reachable.iter().copied().collect();
208        while let Some(obj_num) = work.pop() {
209            if let Some(obj) = self.find_object(obj_num).cloned() {
210                let refs = collect_references(&obj);
211                for r in refs {
212                    if reachable.insert(r) {
213                        work.push(r);
214                    }
215                }
216            }
217        }
218
219        // Remove unreachable objects
220        self.writer.objects.retain(|(num, _)| reachable.contains(num));
221    }
222
223    /// Serialize to PDF bytes.
224    pub fn build(self) -> Result<Vec<u8>> {
225        serialize_pdf(
226            &self.writer.objects,
227            self.writer.version,
228            &self.catalog_ref,
229            self.info_ref.as_ref(),
230        )
231    }
232
233    /// Save to file.
234    pub fn save(self, path: &Path) -> Result<()> {
235        let bytes = self.build()?;
236        std::fs::write(path, bytes)?;
237        Ok(())
238    }
239
240    // --- helpers ---
241
242    fn find_pages_ref(&self) -> Result<IndirectRef> {
243        // Look up Catalog → /Pages
244        if let Some(PdfObject::Dict(catalog)) = self.find_object(self.catalog_ref.obj_num) {
245            if let Some(PdfObject::Reference(r)) = catalog.get(b"Pages") {
246                return Ok(r.clone());
247            }
248        }
249        // Fallback: guess object 2
250        Ok(IndirectRef {
251            obj_num: 2,
252            gen_num: 0,
253        })
254    }
255
256    fn find_object(&self, obj_num: u32) -> Option<&PdfObject> {
257        self.writer
258            .objects
259            .iter()
260            .find(|(n, _)| *n == obj_num)
261            .map(|(_, o)| o)
262    }
263}
264
265/// Collect all indirect reference object numbers from a PdfObject recursively.
266fn collect_references(obj: &PdfObject) -> Vec<u32> {
267    let mut refs = Vec::new();
268    collect_references_inner(obj, &mut refs);
269    refs
270}
271
272fn collect_references_inner(obj: &PdfObject, refs: &mut Vec<u32>) {
273    match obj {
274        PdfObject::Reference(r) => {
275            refs.push(r.obj_num);
276        }
277        PdfObject::Dict(d) => {
278            for (_, val) in d.iter() {
279                collect_references_inner(val, refs);
280            }
281        }
282        PdfObject::Array(arr) => {
283            for item in arr {
284                collect_references_inner(item, refs);
285            }
286        }
287        PdfObject::Stream { dict, .. } => {
288            for (_, val) in dict.iter() {
289                collect_references_inner(val, refs);
290            }
291        }
292        _ => {}
293    }
294}
295
296/// Perform an incremental save: append modified objects to the original PDF data.
297///
298/// This preserves the original bytes and appends only modified/new objects,
299/// a new xref table, and a new trailer with /Prev pointing to the old xref.
300pub fn incremental_save(original_data: &[u8], modifier: DocumentModifier) -> Result<Vec<u8>> {
301    use std::io::Write;
302
303    // Find old startxref
304    let old_startxref = crate::xref::find_startxref(original_data)?;
305
306    let mut buf = original_data.to_vec();
307
308    // Determine max object number for xref size
309    let max_obj_num = modifier
310        .writer
311        .objects
312        .iter()
313        .map(|(n, _)| *n)
314        .max()
315        .unwrap_or(0);
316    let xref_size = max_obj_num + 1;
317
318    // Write each object and track offsets
319    let mut offsets: Vec<(u32, usize)> = Vec::new();
320    for (obj_num, obj) in &modifier.writer.objects {
321        let offset = buf.len();
322        offsets.push((*obj_num, offset));
323        write!(buf, "{} 0 obj\n", obj_num)?;
324        // Use the serialize module's logic inline
325        write!(buf, "{}", obj)?;
326        write!(buf, "\nendobj\n")?;
327    }
328
329    // Write new xref table
330    let new_xref_offset = buf.len();
331    write!(buf, "xref\n")?;
332
333    // Write subsections for each modified object
334    // Sort offsets by object number
335    let mut sorted_offsets = offsets.clone();
336    sorted_offsets.sort_by_key(|(n, _)| *n);
337
338    // Write as individual subsections
339    for (obj_num, offset) in &sorted_offsets {
340        write!(buf, "{} 1\n", obj_num)?;
341        write!(buf, "{:010} {:05} n \r\n", offset, 0)?;
342    }
343
344    // Write trailer
345    let mut trailer = PdfDict::new();
346    trailer.insert(b"Size".to_vec(), PdfObject::Integer(xref_size as i64));
347    trailer.insert(
348        b"Root".to_vec(),
349        PdfObject::Reference(modifier.catalog_ref.clone()),
350    );
351    if let Some(ref info) = modifier.info_ref {
352        trailer.insert(b"Info".to_vec(), PdfObject::Reference(info.clone()));
353    }
354    trailer.insert(
355        b"Prev".to_vec(),
356        PdfObject::Integer(old_startxref as i64),
357    );
358
359    write!(buf, "trailer\n")?;
360    write!(buf, "{}", PdfObject::Dict(trailer))?;
361    write!(buf, "\n")?;
362
363    write!(buf, "startxref\n{}\n%%EOF\n", new_xref_offset)?;
364
365    Ok(buf)
366}
367
368/// Merge pages from multiple PDF documents into one.
369///
370/// Returns the merged PDF as bytes. Pages are concatenated in order:
371/// all pages from doc1, then all from doc2, etc.
372pub fn merge_documents(docs: &[&PdfDocument]) -> Result<Vec<u8>> {
373    let mut writer = PdfWriter::new();
374    let pages_obj_num = writer.alloc_object_num();
375    let pages_ref = IndirectRef {
376        obj_num: pages_obj_num,
377        gen_num: 0,
378    };
379
380    let mut all_page_refs: Vec<IndirectRef> = Vec::new();
381
382    for doc in docs.iter() {
383        let pages = collect_pages(*doc)?;
384        for page_info in &pages {
385            let page_ref = graft_page(&mut writer, *doc, page_info, &pages_ref)?;
386            all_page_refs.push(page_ref);
387        }
388    }
389
390    // Create Pages dict
391    let kids: Vec<PdfObject> = all_page_refs
392        .iter()
393        .map(|r| PdfObject::Reference(r.clone()))
394        .collect();
395    let count = kids.len() as i64;
396
397    let mut pages_dict = PdfDict::new();
398    pages_dict.insert(b"Type".to_vec(), PdfObject::Name(b"Pages".to_vec()));
399    pages_dict.insert(b"Kids".to_vec(), PdfObject::Array(kids));
400    pages_dict.insert(b"Count".to_vec(), PdfObject::Integer(count));
401    writer.set_object(pages_obj_num, PdfObject::Dict(pages_dict));
402
403    // Create Catalog
404    let mut catalog_dict = PdfDict::new();
405    catalog_dict.insert(b"Type".to_vec(), PdfObject::Name(b"Catalog".to_vec()));
406    catalog_dict.insert(b"Pages".to_vec(), PdfObject::Reference(pages_ref));
407    let catalog_ref = writer.add_object(PdfObject::Dict(catalog_dict));
408
409    serialize_pdf(&writer.objects, (1, 7), &catalog_ref, None)
410}
411
412/// Graft a single page from a source document into the writer.
413/// Copies the page dict and all referenced objects with remapped object numbers.
414fn graft_page(
415    writer: &mut PdfWriter,
416    doc: &PdfDocument,
417    page_info: &PageInfo,
418    new_pages_ref: &IndirectRef,
419) -> Result<IndirectRef> {
420    let mut remap: HashMap<u32, u32> = HashMap::new();
421
422    // Resolve the page object
423    let page_obj = doc.resolve(&page_info.page_ref)?;
424
425    // Deep-copy the page and all referenced objects
426    let new_page_obj = deep_copy_object(writer, doc, &page_obj, &mut remap)?;
427
428    // Update Parent reference to point to our new Pages
429    if let PdfObject::Dict(mut page_dict) = new_page_obj {
430        page_dict.insert(
431            b"Parent".to_vec(),
432            PdfObject::Reference(new_pages_ref.clone()),
433        );
434        Ok(writer.add_object(PdfObject::Dict(page_dict)))
435    } else {
436        Ok(writer.add_object(new_page_obj))
437    }
438}
439
440/// Deep-copy a PdfObject, resolving all references and remapping object numbers.
441fn deep_copy_object(
442    writer: &mut PdfWriter,
443    doc: &PdfDocument,
444    obj: &PdfObject,
445    remap: &mut HashMap<u32, u32>,
446) -> Result<PdfObject> {
447    match obj {
448        PdfObject::Reference(r) => {
449            // Check if already remapped
450            if let Some(&new_num) = remap.get(&r.obj_num) {
451                return Ok(PdfObject::Reference(IndirectRef {
452                    obj_num: new_num,
453                    gen_num: 0,
454                }));
455            }
456
457            // Allocate new number first (for circular reference prevention)
458            let new_num = writer.alloc_object_num();
459            remap.insert(r.obj_num, new_num);
460
461            // Resolve and deep-copy
462            let resolved = doc.resolve(r)?;
463            let copied = deep_copy_object(writer, doc, &resolved, remap)?;
464            writer.set_object(new_num, copied);
465
466            Ok(PdfObject::Reference(IndirectRef {
467                obj_num: new_num,
468                gen_num: 0,
469            }))
470        }
471        PdfObject::Dict(d) => {
472            let mut new_dict = PdfDict::new();
473            for (key, val) in d.iter() {
474                let new_val = deep_copy_object(writer, doc, val, remap)?;
475                new_dict.insert(key.clone(), new_val);
476            }
477            Ok(PdfObject::Dict(new_dict))
478        }
479        PdfObject::Array(arr) => {
480            let mut new_arr = Vec::with_capacity(arr.len());
481            for item in arr {
482                new_arr.push(deep_copy_object(writer, doc, item, remap)?);
483            }
484            Ok(PdfObject::Array(new_arr))
485        }
486        PdfObject::Stream { dict, data } => {
487            let mut new_dict = PdfDict::new();
488            for (key, val) in dict.iter() {
489                let new_val = deep_copy_object(writer, doc, val, remap)?;
490                new_dict.insert(key.clone(), new_val);
491            }
492            Ok(PdfObject::Stream {
493                dict: new_dict,
494                data: data.clone(),
495            })
496        }
497        // Primitive types: just clone
498        other => Ok(other.clone()),
499    }
500}
501
#[cfg(test)]
mod tests {
    use super::*;
    use crate::writer::document::DocumentBuilder;
    use crate::writer::page::PageBuilder;

    /// Build an in-memory PDF with `num_pages` Letter-size pages, each showing
    /// the line "`text` - Page N" in Helvetica.
    fn create_test_pdf(text: &str, num_pages: usize) -> Vec<u8> {
        let mut doc = DocumentBuilder::new();
        let font = doc.add_standard_font("Helvetica");

        for i in 0..num_pages {
            let mut page = PageBuilder::new(612.0, 792.0);
            page.add_font(&font, "Helvetica");
            page.begin_text();
            page.set_font(&font, 12.0);
            page.move_to(72.0, 720.0);
            page.show_text(&format!("{} - Page {}", text, i + 1));
            page.end_text();
            doc.add_page(page);
        }

        doc.build().unwrap()
    }

    /// Loading a document into a modifier and building it again keeps the page count.
    #[test]
    fn test_modifier_roundtrip() {
        let bytes = create_test_pdf("Hello", 2);
        let doc = PdfDocument::from_bytes(bytes).unwrap();

        let modifier = DocumentModifier::from_document(&doc).unwrap();
        let new_bytes = modifier.build().unwrap();

        let reparsed = PdfDocument::from_bytes(new_bytes).unwrap();
        let pages = collect_pages(&reparsed).unwrap();
        assert_eq!(pages.len(), 2);
    }

    /// Deleting one of three pages leaves two.
    #[test]
    fn test_delete_page() {
        let bytes = create_test_pdf("Test", 3);
        let doc = PdfDocument::from_bytes(bytes).unwrap();

        let mut modifier = DocumentModifier::from_document(&doc).unwrap();
        modifier.delete_page(1).unwrap(); // remove middle page

        let new_bytes = modifier.build().unwrap();
        let reparsed = PdfDocument::from_bytes(new_bytes).unwrap();
        let pages = collect_pages(&reparsed).unwrap();
        assert_eq!(pages.len(), 2);
    }

    /// Reordering with a full permutation keeps the page count.
    #[test]
    fn test_reorder_pages() {
        let bytes = create_test_pdf("Reorder", 3);
        let doc = PdfDocument::from_bytes(bytes).unwrap();

        let mut modifier = DocumentModifier::from_document(&doc).unwrap();
        modifier.reorder_pages(&[2, 0, 1]).unwrap(); // reverse-ish

        let new_bytes = modifier.build().unwrap();
        let reparsed = PdfDocument::from_bytes(new_bytes).unwrap();
        let pages = collect_pages(&reparsed).unwrap();
        assert_eq!(pages.len(), 3);
    }

    /// Info entries set through the modifier show up in the serialized bytes.
    #[test]
    fn test_set_info() {
        let bytes = create_test_pdf("Info", 1);
        let doc = PdfDocument::from_bytes(bytes).unwrap();

        let mut modifier = DocumentModifier::from_document(&doc).unwrap();
        modifier.set_info(b"Title", "New Title");
        modifier.set_info(b"Author", "New Author");

        let new_bytes = modifier.build().unwrap();
        let text = String::from_utf8_lossy(&new_bytes);
        assert!(text.contains("New Title"));
        assert!(text.contains("New Author"));
    }

    /// Merging a 2-page and a 3-page document yields 5 pages.
    #[test]
    fn test_merge_documents() {
        let bytes1 = create_test_pdf("Doc1", 2);
        let bytes2 = create_test_pdf("Doc2", 3);

        let doc1 = PdfDocument::from_bytes(bytes1).unwrap();
        let doc2 = PdfDocument::from_bytes(bytes2).unwrap();

        let merged = merge_documents(&[&doc1, &doc2]).unwrap();

        let reparsed = PdfDocument::from_bytes(merged).unwrap();
        let pages = collect_pages(&reparsed).unwrap();
        assert_eq!(pages.len(), 5); // 2 + 3
    }

    /// An incremental save keeps the original bytes as a prefix and appends
    /// the update, including a `/Prev` entry and a closing `%%EOF`.
    #[test]
    fn test_incremental_save() {
        let original = create_test_pdf("Original", 1);
        let original_len = original.len();

        let doc = PdfDocument::from_bytes(original.clone()).unwrap();
        let mut modifier = DocumentModifier::from_document(&doc).unwrap();
        modifier.set_info(b"Title", "Updated Title");

        let result = incremental_save(&original, modifier).unwrap();

        // The result should start with the original bytes
        assert!(result.len() > original_len);
        assert_eq!(&result[..original_len], &original[..]);

        // Should contain the new title
        let text = String::from_utf8_lossy(&result);
        assert!(text.contains("Updated Title"));

        // Should contain /Prev
        assert!(text.contains("/Prev"));

        // Should end with %%EOF
        let tail = String::from_utf8_lossy(&result[result.len().saturating_sub(50)..]);
        assert!(tail.contains("%%EOF"));
    }

    /// Orphan objects added after a baseline collection are removed by the next one.
    #[test]
    fn test_garbage_collect() {
        let bytes = create_test_pdf("GC Test", 1);
        let doc = PdfDocument::from_bytes(bytes).unwrap();
        let mut modifier = DocumentModifier::from_document(&doc).unwrap();

        // Run GC first to establish baseline (some objects from parsing may be unreachable)
        modifier.garbage_collect();
        let count_baseline = modifier.writer.objects.len();

        // Add unreachable (orphan) objects
        modifier.add_object(PdfObject::Integer(999));
        modifier.add_object(PdfObject::String(b"orphan".to_vec()));
        let count_with_orphans = modifier.writer.objects.len();
        assert_eq!(count_with_orphans, count_baseline + 2);

        // Run GC again
        modifier.garbage_collect();
        let count_after = modifier.writer.objects.len();

        // The orphan objects should be removed, back to baseline
        assert_eq!(count_after, count_baseline);
    }

    /// Two documents built the same way (same font setup) merge into a
    /// parseable two-page document.
    #[test]
    fn test_resource_conflict_merge() {
        // Both inputs are produced by the same helper, so they use the same
        // font resource setup. Each grafted page is deep-copied under new
        // object numbers, so the pages do not share a Resources dictionary.
        let bytes1 = create_test_pdf("Doc1", 1);
        let bytes2 = create_test_pdf("Doc2", 1);

        let doc1 = PdfDocument::from_bytes(bytes1).unwrap();
        let doc2 = PdfDocument::from_bytes(bytes2).unwrap();

        let merged = merge_documents(&[&doc1, &doc2]).unwrap();

        let reparsed = PdfDocument::from_bytes(merged).unwrap();
        let pages = collect_pages(&reparsed).unwrap();
        assert_eq!(pages.len(), 2);

        // Verify the merged PDF is parseable
        assert!(reparsed.catalog_ref().is_some());
    }
}