Skip to main content

pdf_objects/
serializer.rs

1use std::collections::BTreeMap;
2use std::fmt::Write;
3
4use crate::stream::flate_encode;
5use crate::types::{ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfString, PdfValue, XrefForm};
6
/// Maximum number of compressible objects packed into a single
/// `Type /ObjStm` container. Real-world writers typically split between
/// 50 and 100 members per stream; this value sits at the upper end of
/// that range, which yields fewer containers at the cost of a larger
/// decompressed size per stream.
///
/// NOTE(review): the previous wording said the value stays "near the
/// lower end" of the range, which does not match `100` — confirm which
/// of the two was intended.
const OBJSTM_CHUNK_SIZE: usize = 100;
13
14pub fn serialize_pdf(file: &PdfFile) -> Vec<u8> {
15    match file.xref_form {
16        XrefForm::Classic => serialize_classic(file),
17        XrefForm::Stream => serialize_with_xref_stream(file),
18    }
19}
20
21fn serialize_classic(file: &PdfFile) -> Vec<u8> {
22    let mut output = Vec::new();
23    output.extend_from_slice(
24        format!("%PDF-{}\n%\u{00FF}\u{00FF}\u{00FF}\u{00FF}\n", file.version).as_bytes(),
25    );
26
27    let mut offsets = BTreeMap::new();
28    for (object_ref, object) in &file.objects {
29        let offset = output.len();
30        offsets.insert(object_ref.object_number, offset);
31        output.extend_from_slice(
32            format!(
33                "{} {} obj\n",
34                object_ref.object_number, object_ref.generation
35            )
36            .as_bytes(),
37        );
38        match object {
39            PdfObject::Value(value) => {
40                output.extend_from_slice(serialize_value(value).as_bytes());
41                output.extend_from_slice(b"\nendobj\n");
42            }
43            PdfObject::Stream(stream) => {
44                let mut dict = stream.dict.clone();
45                dict.insert(
46                    "Length".to_string(),
47                    PdfValue::Integer(stream.data.len() as i64),
48                );
49                output.extend_from_slice(serialize_dictionary(&dict).as_bytes());
50                output.extend_from_slice(b"\nstream\n");
51                output.extend_from_slice(&stream.data);
52                if !stream.data.ends_with(b"\n") {
53                    output.push(b'\n');
54                }
55                output.extend_from_slice(b"endstream\nendobj\n");
56            }
57        }
58    }
59
60    let startxref = output.len();
61    let size = file.max_object_number + 1;
62    output.extend_from_slice(format!("xref\n0 {}\n", size).as_bytes());
63    output.extend_from_slice(b"0000000000 65535 f \n");
64    for object_number in 1..=file.max_object_number {
65        if let Some(offset) = offsets.get(&object_number).copied() {
66            output.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
67        } else {
68            output.extend_from_slice(b"0000000000 65535 f \n");
69        }
70    }
71
72    let mut trailer = file.trailer.clone();
73    trailer.insert("Size".to_string(), PdfValue::Integer(size as i64));
74    trailer.remove("Prev");
75    trailer.remove("XRefStm");
76    output.extend_from_slice(b"trailer\n");
77    output.extend_from_slice(serialize_dictionary(&trailer).as_bytes());
78    output.extend_from_slice(format!("\nstartxref\n{startxref}\n%%EOF\n").as_bytes());
79    output
80}
81
/// One row of the cross-reference stream. Mirrors the parsed
/// `XrefEntry` shape but with explicit byte offsets needed only at
/// emit time.
///
/// `build_xref_stream_data` encodes every row as three big-endian fields
/// (type, field 2, field 3); the per-variant docs below state what goes
/// into each field.
#[derive(Debug, Clone, Copy)]
enum XrefRow {
    /// No object under this number. Encoded as type `0` with fields 2 and
    /// 3 both zero.
    Free,
    /// Object written directly into the file body. Encoded as type `1`
    /// with `offset` in field 2 and `generation` in field 3.
    Direct { offset: usize, generation: u16 },
    /// Object stored inside an object stream. Encoded as type `2` with the
    /// container's object number in field 2 and the member's index within
    /// that container in field 3.
    InObjStm { stream_objnum: u32, index: u32 },
}
91
/// A freshly-built `Type /ObjStm` container. `members` records each
/// packed value's original `ObjectRef` and its index inside the stream.
struct PackedObjStm {
    /// Object number the container itself is written under (generation 0).
    container_objnum: u32,
    /// Flate-compressed stream payload: the "objnum offset" header
    /// followed by the serialized member values.
    body: Vec<u8>,
    /// Byte length of the uncompressed header, emitted as `/First`.
    first: usize,
    /// `(original reference, index within this container)` per member, in
    /// packing order.
    members: Vec<(ObjectRef, u32)>,
}
100
101fn serialize_with_xref_stream(file: &PdfFile) -> Vec<u8> {
102    // 1. Partition: PdfObject::Stream stays as a direct indirect object;
103    //    PdfObject::Value with generation 0 is eligible for ObjStm
104    //    packing; PdfObject::Value with generation != 0 also stays direct
105    //    (cannot be inside an ObjStm per ISO 32000-1 §7.5.7).
106    let mut direct: Vec<(ObjectRef, &PdfObject)> = Vec::new();
107    let mut compressible: Vec<(ObjectRef, &PdfValue)> = Vec::new();
108    for (object_ref, object) in &file.objects {
109        match object {
110            PdfObject::Value(value) if object_ref.generation == 0 => {
111                compressible.push((*object_ref, value));
112            }
113            _ => direct.push((*object_ref, object)),
114        }
115    }
116
117    // 2. Pack compressible objects into one or more ObjStm containers.
118    //    Allocate fresh object numbers from `max_object_number + 1`.
119    let mut next_objnum = file.max_object_number + 1;
120    let mut packed_streams = Vec::new();
121    for chunk in compressible.chunks(OBJSTM_CHUNK_SIZE) {
122        let pack = pack_objstm_chunk(next_objnum, chunk);
123        next_objnum += 1;
124        packed_streams.push(pack);
125    }
126
127    // 3. Allocate xref-stream object number after all ObjStm containers.
128    let xref_stream_objnum = next_objnum;
129    let xref_size = xref_stream_objnum + 1;
130
131    // 4. Emit header + direct objects + ObjStm containers, capturing
132    //    each object's byte offset.
133    let mut output = Vec::new();
134    output.extend_from_slice(
135        format!("%PDF-{}\n%\u{00FF}\u{00FF}\u{00FF}\u{00FF}\n", file.version).as_bytes(),
136    );
137
138    let mut direct_offsets: BTreeMap<u32, usize> = BTreeMap::new();
139    for (object_ref, object) in &direct {
140        let offset = output.len();
141        direct_offsets.insert(object_ref.object_number, offset);
142        write_indirect_object(&mut output, *object_ref, object);
143    }
144
145    let mut objstm_offsets: BTreeMap<u32, usize> = BTreeMap::new();
146    for pack in &packed_streams {
147        let offset = output.len();
148        objstm_offsets.insert(pack.container_objnum, offset);
149        write_objstm_container(&mut output, pack);
150    }
151
152    // 5. Build xref rows for every object number 0..xref_size.
153    let mut rows: Vec<XrefRow> = vec![XrefRow::Free; xref_size as usize];
154    for (object_ref, _) in &direct {
155        if let Some(offset) = direct_offsets.get(&object_ref.object_number).copied() {
156            rows[object_ref.object_number as usize] = XrefRow::Direct {
157                offset,
158                generation: object_ref.generation,
159            };
160        }
161    }
162    for pack in &packed_streams {
163        for (member_ref, index) in &pack.members {
164            rows[member_ref.object_number as usize] = XrefRow::InObjStm {
165                stream_objnum: pack.container_objnum,
166                index: *index,
167            };
168        }
169        if let Some(offset) = objstm_offsets.get(&pack.container_objnum).copied() {
170            rows[pack.container_objnum as usize] = XrefRow::Direct {
171                offset,
172                generation: 0,
173            };
174        }
175    }
176
177    // 6. Pick widths and serialize entry table.
178    let max_offset = direct_offsets
179        .values()
180        .chain(objstm_offsets.values())
181        .copied()
182        .max()
183        .unwrap_or(0);
184    let max_member_index = packed_streams
185        .iter()
186        .flat_map(|p| p.members.iter().map(|(_, i)| *i))
187        .max()
188        .unwrap_or(0)
189        .max(file.max_object_number);
190    let widths = xref_entry_widths(max_offset, max_member_index);
191    let xref_data = build_xref_stream_data(&rows, widths);
192
193    // 7. Build the xref-stream dict (carry trailer keys minus ones we
194    //    rewrite ourselves).
195    let mut xref_dict = file.trailer.clone();
196    for key in [
197        "Prev",
198        "XRefStm",
199        "Encrypt",
200        "Length",
201        "Filter",
202        "DecodeParms",
203        "W",
204        "Index",
205        "Type",
206    ] {
207        xref_dict.remove(key);
208    }
209    xref_dict.insert("Type".to_string(), PdfValue::Name("XRef".to_string()));
210    xref_dict.insert("Size".to_string(), PdfValue::Integer(xref_size as i64));
211    xref_dict.insert(
212        "W".to_string(),
213        PdfValue::Array(
214            widths
215                .iter()
216                .map(|w| PdfValue::Integer(i64::from(*w)))
217                .collect(),
218        ),
219    );
220    xref_dict.insert(
221        "Filter".to_string(),
222        PdfValue::Name("FlateDecode".to_string()),
223    );
224
225    // Compress xref body with Flate to match what real producers emit.
226    let compressed_xref =
227        flate_encode(&xref_data).expect("flate_encode is infallible for in-memory buffers");
228    xref_dict.insert(
229        "Length".to_string(),
230        PdfValue::Integer(compressed_xref.len() as i64),
231    );
232
233    // 8. Emit xref stream as the final object; capture its offset.
234    let startxref = output.len();
235    output.extend_from_slice(format!("{} 0 obj\n", xref_stream_objnum).as_bytes());
236    output.extend_from_slice(serialize_dictionary(&xref_dict).as_bytes());
237    output.extend_from_slice(b"\nstream\n");
238    output.extend_from_slice(&compressed_xref);
239    output.extend_from_slice(b"\nendstream\nendobj\n");
240
241    // 9. Trailer + EOF.
242    output.extend_from_slice(format!("startxref\n{startxref}\n%%EOF\n").as_bytes());
243    output
244}
245
246fn write_indirect_object(output: &mut Vec<u8>, object_ref: ObjectRef, object: &PdfObject) {
247    output.extend_from_slice(
248        format!(
249            "{} {} obj\n",
250            object_ref.object_number, object_ref.generation
251        )
252        .as_bytes(),
253    );
254    match object {
255        PdfObject::Value(value) => {
256            output.extend_from_slice(serialize_value(value).as_bytes());
257            output.extend_from_slice(b"\nendobj\n");
258        }
259        PdfObject::Stream(stream) => {
260            let mut dict = stream.dict.clone();
261            dict.insert(
262                "Length".to_string(),
263                PdfValue::Integer(stream.data.len() as i64),
264            );
265            output.extend_from_slice(serialize_dictionary(&dict).as_bytes());
266            output.extend_from_slice(b"\nstream\n");
267            output.extend_from_slice(&stream.data);
268            if !stream.data.ends_with(b"\n") {
269                output.push(b'\n');
270            }
271            output.extend_from_slice(b"endstream\nendobj\n");
272        }
273    }
274}
275
276fn write_objstm_container(output: &mut Vec<u8>, pack: &PackedObjStm) {
277    let mut dict = PdfDictionary::new();
278    dict.insert("Type".to_string(), PdfValue::Name("ObjStm".to_string()));
279    dict.insert(
280        "N".to_string(),
281        PdfValue::Integer(pack.members.len() as i64),
282    );
283    dict.insert("First".to_string(), PdfValue::Integer(pack.first as i64));
284    dict.insert(
285        "Filter".to_string(),
286        PdfValue::Name("FlateDecode".to_string()),
287    );
288    dict.insert(
289        "Length".to_string(),
290        PdfValue::Integer(pack.body.len() as i64),
291    );
292    output.extend_from_slice(format!("{} 0 obj\n", pack.container_objnum).as_bytes());
293    output.extend_from_slice(serialize_dictionary(&dict).as_bytes());
294    output.extend_from_slice(b"\nstream\n");
295    output.extend_from_slice(&pack.body);
296    if !pack.body.ends_with(b"\n") {
297        output.push(b'\n');
298    }
299    output.extend_from_slice(b"endstream\nendobj\n");
300}
301
302fn pack_objstm_chunk(container_objnum: u32, chunk: &[(ObjectRef, &PdfValue)]) -> PackedObjStm {
303    // Build the prefix "objnum1 offset1 objnum2 offset2 ..." and the
304    // body of serialized values back-to-back. The header length is the
305    // /First entry; each value's offset is its position in the body.
306    let mut header = String::new();
307    let mut body_text = String::new();
308    let mut members: Vec<(ObjectRef, u32)> = Vec::new();
309    let mut running_offset = 0usize;
310    for (index, (object_ref, value)) in chunk.iter().enumerate() {
311        write!(header, "{} {} ", object_ref.object_number, running_offset)
312            .expect("string writes should succeed");
313        let serialized = serialize_value(value);
314        body_text.push_str(&serialized);
315        body_text.push(' ');
316        running_offset += serialized.len() + 1;
317        members.push((*object_ref, index as u32));
318    }
319    let header_bytes = header.into_bytes();
320    let first = header_bytes.len();
321    let mut decompressed = header_bytes;
322    decompressed.extend_from_slice(body_text.as_bytes());
323    let body =
324        flate_encode(&decompressed).expect("flate_encode is infallible for in-memory buffers");
325    PackedObjStm {
326        container_objnum,
327        body,
328        first,
329        members,
330    }
331}
332
333fn xref_entry_widths(max_offset: usize, max_member_index: u32) -> [u8; 3] {
334    let field2 = bytes_to_fit(max_offset as u64).max(1);
335    let field3 = bytes_to_fit(u64::from(max_member_index)).max(1);
336    [1, field2, field3]
337}
338
/// Returns the number of bytes needed to store `value` as an unsigned
/// big-endian integer. Zero still occupies one byte.
fn bytes_to_fit(value: u64) -> u8 {
    // Count of significant bits: 0 for zero, 64 for `u64::MAX`.
    let significant_bits = u64::BITS - value.leading_zeros();
    significant_bits.div_ceil(8).max(1) as u8
}
351
352fn build_xref_stream_data(rows: &[XrefRow], widths: [u8; 3]) -> Vec<u8> {
353    let mut output = Vec::with_capacity(rows.len() * (widths[0] + widths[1] + widths[2]) as usize);
354    for row in rows {
355        match row {
356            XrefRow::Free => {
357                push_be(&mut output, 0, widths[0]);
358                push_be(&mut output, 0, widths[1]);
359                push_be(&mut output, 0, widths[2]);
360            }
361            XrefRow::Direct { offset, generation } => {
362                push_be(&mut output, 1, widths[0]);
363                push_be(&mut output, *offset as u64, widths[1]);
364                push_be(&mut output, u64::from(*generation), widths[2]);
365            }
366            XrefRow::InObjStm {
367                stream_objnum,
368                index,
369            } => {
370                push_be(&mut output, 2, widths[0]);
371                push_be(&mut output, u64::from(*stream_objnum), widths[1]);
372                push_be(&mut output, u64::from(*index), widths[2]);
373            }
374        }
375    }
376    output
377}
378
/// Appends the low `width` bytes of `value` to `output`, most significant
/// byte first. A `width` of zero appends nothing.
fn push_be(output: &mut Vec<u8>, value: u64, width: u8) {
    let byte_count = usize::from(width);
    // `as u8` keeps only the low eight bits of the shifted value.
    output.extend(
        (0..byte_count)
            .rev()
            .map(|position| (value >> (position * 8)) as u8),
    );
}
385
386pub fn serialize_value(value: &PdfValue) -> String {
387    match value {
388        PdfValue::Null => "null".to_string(),
389        PdfValue::Bool(value) => value.to_string(),
390        PdfValue::Integer(value) => value.to_string(),
391        PdfValue::Number(value) => {
392            if value.fract() == 0.0 {
393                format!("{:.0}", value)
394            } else {
395                let mut number = format!("{value:.6}");
396                while number.contains('.') && number.ends_with('0') {
397                    number.pop();
398                }
399                if number.ends_with('.') {
400                    number.pop();
401                }
402                number
403            }
404        }
405        PdfValue::Name(name) => {
406            let mut encoded = String::from("/");
407            for byte in name.bytes() {
408                if byte == b'#'
409                    || byte <= b' '
410                    || byte >= 0x7F
411                    || matches!(
412                        byte,
413                        b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
414                    )
415                {
416                    encoded.push_str(&format!("#{:02X}", byte));
417                } else {
418                    encoded.push(byte as char);
419                }
420            }
421            encoded
422        }
423        PdfValue::String(string) => serialize_string(string),
424        PdfValue::Array(values) => format!(
425            "[{}]",
426            values
427                .iter()
428                .map(serialize_value)
429                .collect::<Vec<_>>()
430                .join(" ")
431        ),
432        PdfValue::Dictionary(dictionary) => serialize_dictionary(dictionary),
433        PdfValue::Reference(object_ref) => {
434            format!("{} {} R", object_ref.object_number, object_ref.generation)
435        }
436    }
437}
438
439pub fn serialize_dictionary(dictionary: &PdfDictionary) -> String {
440    let mut output = String::from("<<");
441    for (key, value) in dictionary {
442        write!(output, "/{} {}", key, serialize_value(value))
443            .expect("string writes should succeed");
444        output.push(' ');
445    }
446    output.push_str(">>");
447    output
448}
449
450pub fn serialize_string(string: &PdfString) -> String {
451    let mut output = String::from("(");
452    for byte in &string.0 {
453        match byte {
454            b'(' | b')' | b'\\' => {
455                output.push('\\');
456                output.push(*byte as char);
457            }
458            b'\n' => output.push_str("\\n"),
459            b'\r' => output.push_str("\\r"),
460            b'\t' => output.push_str("\\t"),
461            0x08 => output.push_str("\\b"),
462            0x0C => output.push_str("\\f"),
463            byte if byte.is_ascii_graphic() || *byte == b' ' => output.push(*byte as char),
464            other => output.push_str(&format!("\\{:03o}", other)),
465        }
466    }
467    output.push(')');
468    output
469}
470
#[cfg(test)]
mod tests {
    use super::*;

    /// Field widths grow by one byte each time the largest value crosses
    /// an 8-bit boundary; the type field is always one byte.
    #[test]
    fn xref_entry_widths_picks_minimal_field_widths() {
        let cases: [(usize, u32, [u8; 3]); 7] = [
            // Everything fits in a single byte.
            (0, 0, [1, 1, 1]),
            (255, 250, [1, 1, 1]),
            // First value past 8 bits needs two bytes.
            (256, 0, [1, 2, 1]),
            // Last 16-bit value still fits in two bytes; the next needs three.
            (65_535, 65_535, [1, 2, 2]),
            (65_536, 65_536, [1, 3, 3]),
            // Last 24-bit value fits in three bytes; the next needs four.
            (16_777_215, 0, [1, 3, 1]),
            (16_777_216, 0, [1, 4, 1]),
        ];
        for (max_offset, max_index, expected) in cases {
            assert_eq!(
                xref_entry_widths(max_offset, max_index),
                expected,
                "widths for offset {max_offset}, index {max_index}"
            );
        }
    }

    /// Members keep their input order and receive consecutive indices.
    #[test]
    fn pack_objstm_chunk_preserves_member_indices() {
        let answer = PdfValue::Integer(42);
        let name = PdfValue::Name("Foo".to_string());
        let flag = PdfValue::Bool(true);
        let chunk: Vec<(ObjectRef, &PdfValue)> = vec![
            (ObjectRef::new(7, 0), &answer),
            (ObjectRef::new(8, 0), &name),
            (ObjectRef::new(9, 0), &flag),
        ];

        let pack = pack_objstm_chunk(100, &chunk);

        assert_eq!(pack.container_objnum, 100);
        let observed: Vec<_> = pack
            .members
            .iter()
            .map(|(member, index)| (member.object_number, *index))
            .collect();
        assert_eq!(observed, vec![(7, 0), (8, 1), (9, 2)]);
        assert!(pack.first > 0, "ObjStm header must have positive length");
    }

    /// Each row is type + field 2 + field 3, written big-endian at the
    /// requested widths.
    #[test]
    fn build_xref_stream_data_serialises_widths_big_endian() {
        let rows = [
            XrefRow::Free,
            XrefRow::Direct {
                offset: 0x1234,
                generation: 0,
            },
            XrefRow::InObjStm {
                stream_objnum: 5,
                index: 3,
            },
        ];

        let data = build_xref_stream_data(&rows, [1, 2, 1]);

        // Three rows of 1 + 2 + 1 bytes each.
        let expected: Vec<u8> = vec![
            0, 0, 0, 0, // free
            1, 0x12, 0x34, 0, // direct: offset 0x1234, generation 0
            2, 0, 5, 3, // in object stream 5, index 3
        ];
        assert_eq!(data.len(), 12);
        assert_eq!(data, expected);
    }
}
536}