Skip to main content

justpdf_core/writer/
object_stream.rs

1use std::io::Write;
2
3use crate::error::Result;
4use crate::object::{PdfDict, PdfObject};
5use crate::writer::encode::encode_flate;
6use crate::writer::serialize::serialize_object;
7
/// Information about compressed objects for xref stream generation.
///
/// One record describes a single object that has been stored inside an
/// object stream (`ObjStm`) instead of being written as a standalone indirect
/// object. `write_xref_stream` turns each record into a type 2
/// cross-reference entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompressedObjInfo {
    /// Object number of the compressed object.
    pub obj_num: u32,
    /// Object number of the ObjStm that contains it.
    pub objstm_num: u32,
    /// Zero-based index of this object within the ObjStm.
    pub index: u32,
}
28
29/// Result of packing objects into object streams.
30pub struct PackResult {
31    /// The resulting objects (ineligible + ObjStm containers).
32    pub objects: Vec<(u32, PdfObject)>,
33    /// Info about objects compressed into ObjStms (for xref stream type 2 entries).
34    pub compressed: Vec<CompressedObjInfo>,
35}
36
37pub fn pack_object_streams(
38    objects: &[(u32, PdfObject)],
39    max_objects_per_stream: usize,
40    catalog_obj_num: u32,
41    pages_root_obj_num: Option<u32>,
42    encrypt_obj_num: Option<u32>,
43) -> Result<PackResult> {
44    let mut eligible: Vec<(u32, &PdfObject)> = Vec::new();
45    let mut ineligible: Vec<(u32, PdfObject)> = Vec::new();
46
47    for (obj_num, obj) in objects {
48        if is_eligible(*obj_num, obj, catalog_obj_num, pages_root_obj_num, encrypt_obj_num) {
49            eligible.push((*obj_num, obj));
50        } else {
51            ineligible.push((*obj_num, obj.clone()));
52        }
53    }
54
55    if eligible.is_empty() {
56        return Ok(PackResult {
57            objects: objects.to_vec(),
58            compressed: Vec::new(),
59        });
60    }
61
62    // Determine next available object number for the new object stream containers.
63    let mut next_obj_num = objects.iter().map(|(n, _)| *n).max().unwrap_or(0) + 1;
64
65    // Pack eligible objects in batches
66    let mut result = ineligible;
67    let mut compressed = Vec::new();
68
69    for chunk in eligible.chunks(max_objects_per_stream) {
70        let objstm_num = next_obj_num;
71        let objstm = build_object_stream(chunk)?;
72        result.push((objstm_num, objstm));
73
74        for (index, (obj_num, _)) in chunk.iter().enumerate() {
75            compressed.push(CompressedObjInfo {
76                obj_num: *obj_num,
77                objstm_num,
78                index: index as u32,
79            });
80        }
81
82        next_obj_num += 1;
83    }
84
85    Ok(PackResult { objects: result, compressed })
86}
87
88/// Check whether an object is eligible for packing into an object stream.
89fn is_eligible(
90    obj_num: u32,
91    obj: &PdfObject,
92    catalog_obj_num: u32,
93    pages_root_obj_num: Option<u32>,
94    encrypt_obj_num: Option<u32>,
95) -> bool {
96    // Must NOT be a stream object
97    if obj.is_stream() {
98        return false;
99    }
100
101    // Must NOT be the catalog
102    if obj_num == catalog_obj_num {
103        return false;
104    }
105
106    // Must NOT be the pages tree root
107    if pages_root_obj_num == Some(obj_num) {
108        return false;
109    }
110
111    // Must NOT be the encryption dictionary
112    if encrypt_obj_num == Some(obj_num) {
113        return false;
114    }
115
116    // Must NOT be a cross-reference stream (Type == XRef)
117    if let PdfObject::Dict(d) = obj {
118        if d.get_name(b"Type") == Some(b"XRef") {
119            return false;
120        }
121    }
122
123    // Null objects: technically eligible but not worth packing
124    if obj.is_null() {
125        return false;
126    }
127
128    true
129}
130
131/// Build a single object stream from a batch of (obj_num, object) pairs.
132///
133/// The stream content format is:
134///   obj_num1 offset1 obj_num2 offset2 ... <data1> <data2> ...
135///
136/// where offsets are relative to /First (the byte position where object data starts).
137fn build_object_stream(objects: &[(u32, &PdfObject)]) -> Result<PdfObject> {
138    let n = objects.len();
139
140    // First pass: serialize each object's data
141    let mut object_data: Vec<Vec<u8>> = Vec::with_capacity(n);
142    for (_obj_num, obj) in objects {
143        let mut buf = Vec::new();
144        serialize_object(&mut buf, obj)?;
145        object_data.push(buf);
146    }
147
148    // Compute offsets (relative to start of object data section)
149    let mut offsets: Vec<usize> = Vec::with_capacity(n);
150    let mut running_offset = 0usize;
151    for data in &object_data {
152        offsets.push(running_offset);
153        running_offset += data.len();
154        // Add a space separator between objects (except after the last)
155        running_offset += 1;
156    }
157
158    // Build the index section: "obj_num1 offset1 obj_num2 offset2 ..."
159    let mut index_section = Vec::new();
160    for (i, (obj_num, _)) in objects.iter().enumerate() {
161        if i > 0 {
162            write!(index_section, " ")?;
163        }
164        write!(index_section, "{} {}", obj_num, offsets[i])?;
165    }
166    write!(index_section, " ")?; // trailing space before data
167
168    let first = index_section.len();
169
170    // Build the full stream content: index_section + object data
171    let mut content = index_section;
172    for (i, data) in object_data.iter().enumerate() {
173        content.extend_from_slice(data);
174        if i < n - 1 {
175            content.push(b' ');
176        }
177    }
178
179    // Compress the stream content
180    let compressed = encode_flate(&content)?;
181
182    let mut dict = PdfDict::new();
183    dict.insert(b"Type".to_vec(), PdfObject::Name(b"ObjStm".to_vec()));
184    dict.insert(b"N".to_vec(), PdfObject::Integer(n as i64));
185    dict.insert(b"First".to_vec(), PdfObject::Integer(first as i64));
186    dict.insert(
187        b"Filter".to_vec(),
188        PdfObject::Name(b"FlateDecode".to_vec()),
189    );
190
191    Ok(PdfObject::Stream {
192        dict,
193        data: compressed,
194    })
195}
196
197/// Write a cross-reference stream instead of a traditional xref table.
198///
199/// This is required when object streams are used (PDF 1.5+).
200/// Returns the xref stream object and the byte offset where it was written.
201pub fn write_xref_stream(
202    buf: &mut Vec<u8>,
203    offsets: &[(u32, usize)],
204    compressed: &[CompressedObjInfo],
205    catalog_ref: &crate::object::IndirectRef,
206    info_ref: Option<&crate::object::IndirectRef>,
207    xref_stm_obj_num: u32,
208) -> Result<()> {
209    let max_obj_num = offsets
210        .iter()
211        .map(|(n, _)| *n)
212        .max()
213        .unwrap_or(0)
214        .max(xref_stm_obj_num)
215        .max(compressed.iter().map(|c| c.obj_num).max().unwrap_or(0));
216    let size = max_obj_num + 1;
217
218    // Build offset map for type 1 entries
219    let mut offset_map = std::collections::HashMap::new();
220    for (num, off) in offsets {
221        offset_map.insert(*num, *off);
222    }
223
224    // Build compressed object map for type 2 entries
225    let mut compressed_map: std::collections::HashMap<u32, (u32, u32)> =
226        std::collections::HashMap::new();
227    for info in compressed {
228        compressed_map.insert(info.obj_num, (info.objstm_num, info.index));
229    }
230
231    // Determine field widths.
232    // W = [w1 w2 w3] where:
233    //   field 1: type (1 byte: 0=free, 1=normal, 2=compressed)
234    //   field 2: offset or obj stream number
235    //   field 3: generation number or index within obj stream
236    let max_offset = offsets.iter().map(|(_, o)| *o).max().unwrap_or(0);
237    let max_objstm_num = compressed.iter().map(|c| c.objstm_num as usize).max().unwrap_or(0);
238    let w2 = bytes_needed(max_offset.max(max_objstm_num) as u64);
239    let w1 = 1u8;
240    let max_index = compressed.iter().map(|c| c.index).max().unwrap_or(0);
241    let w3 = bytes_needed(max_index.max(255) as u64);
242
243    // Build stream data
244    let entry_size = (w1 + w2 + w3) as usize;
245    let mut stream_data = Vec::with_capacity(entry_size * size as usize);
246
247    for obj_num in 0..size {
248        if obj_num == 0 {
249            // Free entry: type=0, next free=0, gen=255
250            stream_data.push(0u8);
251            write_field(&mut stream_data, 0, w2);
252            write_field(&mut stream_data, 255, w3);
253        } else if let Some(&off) = offset_map.get(&obj_num) {
254            // In-use entry: type=1, offset, gen=0
255            stream_data.push(1u8);
256            write_field(&mut stream_data, off as u64, w2);
257            write_field(&mut stream_data, 0, w3);
258        } else if let Some(&(objstm_num, index)) = compressed_map.get(&obj_num) {
259            // Compressed entry: type=2, objstm number, index within stream
260            stream_data.push(2u8);
261            write_field(&mut stream_data, objstm_num as u64, w2);
262            write_field(&mut stream_data, index as u64, w3);
263        } else if obj_num == xref_stm_obj_num {
264            // The xref stream itself: type=1, offset = current buf position
265            stream_data.push(1u8);
266            write_field(&mut stream_data, buf.len() as u64, w2);
267            write_field(&mut stream_data, 0, w3);
268        } else {
269            // Free entry
270            stream_data.push(0u8);
271            write_field(&mut stream_data, 0, w2);
272            write_field(&mut stream_data, 0, w3);
273        }
274    }
275
276    // Compress stream data
277    let compressed = encode_flate(&stream_data)?;
278
279    let mut dict = PdfDict::new();
280    dict.insert(b"Type".to_vec(), PdfObject::Name(b"XRef".to_vec()));
281    dict.insert(b"Size".to_vec(), PdfObject::Integer(size as i64));
282    dict.insert(
283        b"W".to_vec(),
284        PdfObject::Array(vec![
285            PdfObject::Integer(w1 as i64),
286            PdfObject::Integer(w2 as i64),
287            PdfObject::Integer(w3 as i64),
288        ]),
289    );
290    dict.insert(
291        b"Root".to_vec(),
292        PdfObject::Reference(catalog_ref.clone()),
293    );
294    if let Some(info) = info_ref {
295        dict.insert(b"Info".to_vec(), PdfObject::Reference(info.clone()));
296    }
297    dict.insert(
298        b"Filter".to_vec(),
299        PdfObject::Name(b"FlateDecode".to_vec()),
300    );
301
302    let xref_offset = buf.len();
303
304    // Write as an indirect object
305    write!(buf, "{} 0 obj\n", xref_stm_obj_num)?;
306    // Manually write the stream with correct /Length
307    let mut stream_dict = dict;
308    stream_dict.insert(
309        b"Length".to_vec(),
310        PdfObject::Integer(compressed.len() as i64),
311    );
312    crate::writer::serialize::serialize_dict(buf, &stream_dict)?;
313    buf.extend_from_slice(b"\nstream\r\n");
314    buf.extend_from_slice(&compressed);
315    buf.extend_from_slice(b"\r\nendstream");
316    write!(buf, "\nendobj\n")?;
317
318    // startxref
319    write!(buf, "startxref\n{}\n%%EOF\n", xref_offset)?;
320
321    Ok(())
322}
323
/// Compute the number of bytes needed to represent `val`.
///
/// Returns a value between 1 and 8: zero still occupies one byte, and values
/// above `0xFFFF_FFFF` get the extra bytes they require instead of being
/// capped at four, which would truncate them when written.
fn bytes_needed(val: u64) -> u8 {
    let significant_bits = u64::BITS - val.leading_zeros();
    // Round up to whole bytes; the result is at most 8, so the cast is lossless.
    ((significant_bits + 7) / 8).max(1) as u8
}
336
/// Write a value as big-endian in exactly `width` bytes.
///
/// When `width` is smaller than the value requires, only the low `width` bytes
/// are written. When `width` exceeds the eight bytes of a `u64`, the field is
/// left-padded with zero bytes.
fn write_field(buf: &mut Vec<u8>, val: u64, width: u8) {
    buf.extend((0..u32::from(width)).rev().map(|byte_idx| {
        // `checked_shr` yields `None` for shifts of 64 bits or more, which are
        // exactly the padding bytes beyond the value's own eight.
        val.checked_shr(8 * byte_idx)
            .map_or(0, |shifted| (shifted & 0xFF) as u8)
    }));
}
343
#[cfg(test)]
mod tests {
    use super::*;
    use crate::object::{IndirectRef, PdfDict, PdfObject};

    /// Shorthand for an empty dictionary object.
    fn empty_dict() -> PdfObject {
        PdfObject::Dict(PdfDict::new())
    }

    /// Shorthand for a small stream object with an empty dictionary.
    fn sample_stream() -> PdfObject {
        PdfObject::Stream {
            dict: PdfDict::new(),
            data: vec![1, 2, 3],
        }
    }

    /// True when `obj` is a stream whose `Type` is `ObjStm`.
    fn is_objstm(obj: &PdfObject) -> bool {
        match obj {
            PdfObject::Stream { dict, .. } => dict.get_name(b"Type") == Some(b"ObjStm"),
            _ => false,
        }
    }

    /// True when `objects` holds a standalone entry numbered `num`.
    fn has_obj(objects: &[(u32, PdfObject)], num: u32) -> bool {
        objects.iter().any(|(n, _)| *n == num)
    }

    #[test]
    fn test_eligible_objects() {
        // A plain dict, an integer and an array can all be packed.
        let candidates = [
            empty_dict(),
            PdfObject::Integer(42),
            PdfObject::Array(vec![PdfObject::Integer(1)]),
        ];
        for candidate in &candidates {
            assert!(is_eligible(10, candidate, 1, Some(2), None));
        }
    }

    #[test]
    fn test_ineligible_stream() {
        assert!(!is_eligible(10, &sample_stream(), 1, Some(2), None));
    }

    #[test]
    fn test_ineligible_catalog() {
        assert!(!is_eligible(1, &empty_dict(), 1, Some(2), None));
    }

    #[test]
    fn test_ineligible_pages_root() {
        assert!(!is_eligible(2, &empty_dict(), 1, Some(2), None));
    }

    #[test]
    fn test_ineligible_encrypt() {
        assert!(!is_eligible(5, &empty_dict(), 1, Some(2), Some(5)));
    }

    #[test]
    fn test_ineligible_xref_stream() {
        let mut xref_dict = PdfDict::new();
        xref_dict.insert(b"Type".to_vec(), PdfObject::Name(b"XRef".to_vec()));
        assert!(!is_eligible(10, &PdfObject::Dict(xref_dict), 1, Some(2), None));
    }

    #[test]
    fn test_ineligible_null() {
        assert!(!is_eligible(10, &PdfObject::Null, 1, Some(2), None));
    }

    #[test]
    fn test_pack_object_streams_structure() {
        // Catalog (1) and pages root (2) stay standalone; 3, 4 and 5 are simple objects.
        let mut catalog = PdfDict::new();
        catalog.insert(b"Type".to_vec(), PdfObject::Name(b"Catalog".to_vec()));

        let mut pages = PdfDict::new();
        pages.insert(b"Type".to_vec(), PdfObject::Name(b"Pages".to_vec()));

        let objects = vec![
            (1, PdfObject::Dict(catalog)),
            (2, PdfObject::Dict(pages)),
            (3, PdfObject::Integer(42)),
            (4, PdfObject::String(b"hello".to_vec())),
            (5, PdfObject::Array(vec![PdfObject::Integer(1), PdfObject::Integer(2)])),
        ];

        let packed = pack_object_streams(&objects, 100, 1, Some(2), None).unwrap();

        // The reserved objects are still written on their own.
        assert!(has_obj(&packed.objects, 1));
        assert!(has_obj(&packed.objects, 2));

        // The packed objects no longer appear as standalone entries.
        assert!(!has_obj(&packed.objects, 3), "object 3 should be packed");
        assert!(!has_obj(&packed.objects, 4), "object 4 should be packed");

        // A container object stream was added (new obj num = 6).
        let (_, stream_obj) = packed
            .objects
            .iter()
            .find(|(_, obj)| is_objstm(obj))
            .expect("should contain an object stream");

        match stream_obj {
            PdfObject::Stream { dict, .. } => {
                assert_eq!(dict.get_i64(b"N"), Some(3)); // 3 objects packed
                assert!(dict.get_i64(b"First").unwrap() > 0);
                assert_eq!(dict.get_name(b"Filter"), Some(b"FlateDecode".as_slice()));
            }
            _ => panic!("expected stream object"),
        }
    }

    #[test]
    fn test_pack_splits_by_max() {
        // Object 1 is the catalog; objects 2 through 6 are eligible integers.
        let mut objects = vec![(1, empty_dict())];
        for i in 2..=6 {
            objects.push((i, PdfObject::Integer(i as i64)));
        }

        let packed = pack_object_streams(&objects, 2, 1, None, None).unwrap();

        // 5 eligible objects in batches of 2 => 3 object streams
        let objstm_count = packed
            .objects
            .iter()
            .filter(|(_, obj)| is_objstm(obj))
            .count();
        assert_eq!(objstm_count, 3);
    }

    #[test]
    fn test_pack_no_eligible() {
        let objects = vec![
            (1, empty_dict()), // catalog
            (2, sample_stream()),
        ];

        let packed = pack_object_streams(&objects, 100, 1, None, None).unwrap();
        assert_eq!(packed.objects.len(), 2); // unchanged
    }

    #[test]
    fn test_build_object_stream_content() {
        use flate2::read::ZlibDecoder;
        use std::io::Read;

        let obj1 = PdfObject::Integer(42);
        let obj2 = PdfObject::String(b"test".to_vec());
        let items: Vec<(u32, &PdfObject)> = vec![(10, &obj1), (20, &obj2)];

        match build_object_stream(&items).unwrap() {
            PdfObject::Stream { dict, data } => {
                assert_eq!(dict.get_name(b"Type"), Some(b"ObjStm".as_slice()));
                assert_eq!(dict.get_i64(b"N"), Some(2));

                // Inflate the payload so its textual layout can be inspected.
                let mut decompressed = Vec::new();
                ZlibDecoder::new(&data[..])
                    .read_to_end(&mut decompressed)
                    .unwrap();
                let text = String::from_utf8_lossy(&decompressed);

                // The index comes first: "10 0 20 <offset>"
                assert!(text.starts_with("10 "), "content should start with first obj num: {}", text);
                assert!(text.contains("20 "), "content should contain second obj num");
                // The serialized objects follow the index.
                assert!(text.contains("42"), "content should contain integer 42");
                assert!(text.contains("(test)"), "content should contain string (test)");
            }
            _ => panic!("expected stream object"),
        }
    }

    #[test]
    fn test_bytes_needed() {
        let cases: &[(u64, u8)] = &[
            (0, 1),
            (255, 1),
            (256, 2),
            (65535, 2),
            (65536, 3),
            (0xFF_FFFF, 3),
            (0x100_0000, 4),
        ];
        for &(value, expected) in cases {
            assert_eq!(bytes_needed(value), expected);
        }
    }

    #[test]
    fn test_write_field() {
        // Encode a single field into a fresh buffer.
        let encode = |value: u64, width: u8| {
            let mut buf = Vec::new();
            write_field(&mut buf, value, width);
            buf
        };

        assert_eq!(encode(0x1234, 2), vec![0x12, 0x34]);
        assert_eq!(encode(42, 1), vec![42]);
        assert_eq!(encode(0xABCDEF, 3), vec![0xAB, 0xCD, 0xEF]);
    }

    #[test]
    fn test_write_xref_stream() {
        // Start from a buffer that already contains the PDF header.
        let mut buf = Vec::new();
        buf.extend_from_slice(b"%PDF-1.5\n");

        let offsets = vec![(1, 20), (2, 100)];
        let catalog_ref = IndirectRef { obj_num: 1, gen_num: 0 };

        write_xref_stream(&mut buf, &offsets, &[], &catalog_ref, None, 3).unwrap();

        let text = String::from_utf8_lossy(&buf);
        let expected_fragments = [
            "3 0 obj",
            "/Type /XRef",
            "/Root 1 0 R",
            "startxref",
            "%%EOF",
        ];
        for fragment in expected_fragments.iter() {
            assert!(text.contains(*fragment));
        }
    }
}