Skip to main content

rpdfium_parser/
object_stream.rs

1// Derived from PDFium's cpdf_object_stream.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Object stream (ObjStm) parsing.
7//!
8//! Object streams (PDF 1.5+) pack multiple non-stream objects into a single
9//! compressed stream. The stream contains a header with object numbers and
10//! byte offsets, followed by the objects themselves.
11
12use rpdfium_core::error::PdfError;
13use rpdfium_core::{Name, ParsingMode};
14
15use crate::object::Object;
16use crate::object_parser::parse_object;
17use crate::tokenizer::{Token, Tokenizer};
18
19/// Parsed contents of an object stream: a list of (object_number, object) pairs.
20pub struct ObjectStreamContents {
21    /// Objects extracted from the stream, indexed by their position.
22    /// The tuple is (object_number, parsed_object).
23    pub objects: Vec<(u32, Object)>,
24}
25
26/// Parse the decompressed contents of an object stream.
27///
28/// The stream dict must contain:
29/// - `/N`: number of objects in the stream
30/// - `/First`: byte offset of the first object data (past the header)
31///
32/// The stream data format is:
33/// ```text
34/// objnum1 offset1 objnum2 offset2 ... objnumN offsetN
35/// <object1_data> <object2_data> ... <objectN_data>
36/// ```
37///
38/// `data` is the decompressed stream bytes.
39/// `dict` is the stream's dictionary.
40pub fn parse_object_stream(
41    data: &[u8],
42    dict: &std::collections::HashMap<Name, Object>,
43    mode: ParsingMode,
44) -> Result<ObjectStreamContents, PdfError> {
45    // Extract /N (number of objects)
46    let n = match dict.get(&Name::n()) {
47        Some(Object::Integer(n)) if *n >= 0 => *n as usize,
48        _ => {
49            return Err(PdfError::InvalidObjectStream);
50        }
51    };
52
53    // Extract /First (byte offset to first object data)
54    let first = match dict.get(&Name::first()) {
55        Some(Object::Integer(f)) if *f >= 0 => *f as usize,
56        _ => {
57            return Err(PdfError::InvalidObjectStream);
58        }
59    };
60
61    if first > data.len() {
62        return Err(PdfError::InvalidObjectStream);
63    }
64
65    // Parse the header: N pairs of (object_number, byte_offset)
66    let mut tok = Tokenizer::new(data);
67    let mut header = Vec::with_capacity(n);
68
69    for _ in 0..n {
70        let obj_num = match tok.next_token() {
71            Some(Ok(Token::Integer(num))) if num >= 0 => num as u32,
72            _ => {
73                return Err(PdfError::InvalidObjectStream);
74            }
75        };
76
77        let offset = match tok.next_token() {
78            Some(Ok(Token::Integer(off))) if off >= 0 => off as usize,
79            _ => {
80                return Err(PdfError::InvalidObjectStream);
81            }
82        };
83
84        header.push((obj_num, offset));
85    }
86
87    // Parse each object from the data section
88    let obj_data = &data[first..];
89    let mut objects = Vec::with_capacity(n);
90
91    for (obj_num, offset) in &header {
92        if *offset >= obj_data.len() {
93            if mode == ParsingMode::Lenient {
94                tracing::warn!(
95                    object_number = obj_num,
96                    offset = offset,
97                    "object stream entry offset out of bounds, skipping"
98                );
99                objects.push((*obj_num, Object::Null));
100                continue;
101            }
102            return Err(PdfError::InvalidObjectStream);
103        }
104
105        match parse_object(obj_data, *offset as u64, mode) {
106            Ok(obj) => objects.push((*obj_num, obj)),
107            Err(e) => {
108                if mode == ParsingMode::Lenient {
109                    tracing::warn!(
110                        object_number = obj_num,
111                        error = ?e,
112                        "failed to parse object in object stream, substituting Null"
113                    );
114                    objects.push((*obj_num, Object::Null));
115                } else {
116                    return Err(e);
117                }
118            }
119        }
120    }
121
122    Ok(ObjectStreamContents { objects })
123}
124
125/// Get a specific object from a parsed object stream by its index.
126pub fn get_object_from_stream(contents: &ObjectStreamContents, index: u32) -> Option<&Object> {
127    contents.objects.get(index as usize).map(|(_, obj)| obj)
128}
129
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    #[test]
    fn parse_simple_object_stream() {
        // Two objects: obj 10 = integer 42, obj 11 = boolean true
        // Header: "10 0 11 3 " (obj 10 at offset 0, obj 11 at offset 3 relative to /First)
        // Data starts at /First=10: "42 true"
        let data = b"10 0 11 3 42 true";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(2));
        dict.insert(Name::first(), Object::Integer(10));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert_eq!(contents.objects.len(), 2);

        assert_eq!(contents.objects[0].0, 10);
        assert_eq!(contents.objects[0].1.as_i64(), Some(42));

        assert_eq!(contents.objects[1].0, 11);
        assert_eq!(contents.objects[1].1.as_bool(), Some(true));
    }

    #[test]
    fn get_object_by_index() {
        let data = b"5 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        dict.insert(Name::first(), Object::Integer(4));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();

        let obj = get_object_from_stream(&contents, 0).unwrap();
        assert_eq!(obj.as_i64(), Some(42));

        assert!(get_object_from_stream(&contents, 1).is_none());
    }

    #[test]
    fn missing_n_key() {
        let data = b"";
        let mut dict = HashMap::new();
        dict.insert(Name::first(), Object::Integer(0));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn missing_first_key() {
        let data = b"";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(0));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn empty_object_stream() {
        let data = b"";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(0));
        dict.insert(Name::first(), Object::Integer(0));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert!(contents.objects.is_empty());
    }

    // -----------------------------------------------------------------------
    // Upstream-derived object stream tests (cpdf_object_stream_unittest.cpp)
    // -----------------------------------------------------------------------

    /// Upstream: StreamDictNormal — three objects: dict, array, integer.
    #[test]
    fn parse_normal_three_objects() {
        // "10 0 11 14 12 21<</Name /Foo>>[1 2 3]4"
        // /First=16, /N=3
        let data = b"10 0 11 14 12 21<</Name /Foo>>[1 2 3]4";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(3));
        dict.insert(Name::first(), Object::Integer(16));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert_eq!(contents.objects.len(), 3);

        // Object 10: dictionary
        assert_eq!(contents.objects[0].0, 10);
        assert!(contents.objects[0].1.as_dict().is_some());

        // Object 11: array
        assert_eq!(contents.objects[1].0, 11);
        assert!(contents.objects[1].1.as_array().is_some());

        // Object 12: integer
        assert_eq!(contents.objects[2].0, 12);
        assert_eq!(contents.objects[2].1.as_i64(), Some(4));
    }

    /// Upstream: StreamDictNoCount — missing /N key.
    #[test]
    fn missing_n_key_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::first(), Object::Integer(4));
        // No /N key
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictNegativeCount — negative /N.
    #[test]
    fn negative_n_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(-1));
        dict.insert(Name::first(), Object::Integer(4));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictFloatCount — float /N should be rejected.
    #[test]
    fn float_n_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Real(2.2));
        dict.insert(Name::first(), Object::Integer(4));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictNoOffset — missing /First key.
    #[test]
    fn missing_first_key_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        // No /First key
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictNegativeOffset — negative /First.
    #[test]
    fn negative_first_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        dict.insert(Name::first(), Object::Integer(-5));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictFloatOffset — float /First should be rejected.
    #[test]
    fn float_first_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        dict.insert(Name::first(), Object::Real(5.5));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictOffsetTooBig — /First beyond data length.
    #[test]
    fn first_beyond_data_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        dict.insert(Name::first(), Object::Integer(999));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Upstream: StreamDictTooFewCount — /N smaller than actual objects.
    /// Only the first /N objects should be parsed.
    #[test]
    fn too_few_count_parses_subset() {
        let data = b"10 0 11 14 12 21<</Name /Foo>>[1 2 3]4";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(2)); // Only 2 of 3
        dict.insert(Name::first(), Object::Integer(16));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert_eq!(contents.objects.len(), 2);
        assert_eq!(contents.objects[0].0, 10);
        assert_eq!(contents.objects[1].0, 11);
    }

    /// Upstream: StreamDictObjectOffsetTooBig — object offset beyond data.
    /// In strict mode, should error. In lenient mode, substitutes Null.
    #[test]
    fn object_offset_too_big_strict() {
        // Object at offset 999 (way beyond data)
        let data = b"10 0 11 999 42 true";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(2));
        dict.insert(Name::first(), Object::Integer(12));

        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        // Object 10 at offset 0 should parse fine, but object 11 at offset 999
        // should fail since it's beyond the data
        assert!(result.is_err());
    }

    /// Upstream: StreamDictObjectOffsetTooBig — lenient mode substitutes Null.
    #[test]
    fn object_offset_too_big_lenient() {
        let data = b"10 0 11 999 42 true";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(2));
        dict.insert(Name::first(), Object::Integer(12));

        let contents = parse_object_stream(data, &dict, ParsingMode::Lenient).unwrap();
        assert_eq!(contents.objects.len(), 2);
        assert_eq!(contents.objects[0].0, 10);
        assert_eq!(contents.objects[0].1.as_i64(), Some(42));
        // Object 11 should be Null (offset out of bounds)
        assert_eq!(contents.objects[1].0, 11);
        assert!(contents.objects[1].1.is_null());
    }

    /// Upstream: StreamDictGarbageObjNum — garbage in object number header.
    #[test]
    fn garbage_in_header_obj_num() {
        // "10 0 hi 14 12 21..." — "hi" is not a valid integer for 2nd obj number
        let data = b"10 0 hi 14 12 21 42 true 99";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(3));
        dict.insert(Name::first(), Object::Integer(19));

        // Should fail in strict mode because header parsing expects integer pairs
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// get_object_from_stream with index beyond bounds returns None.
    #[test]
    fn get_object_index_out_of_bounds() {
        let data = b"5 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        dict.insert(Name::first(), Object::Integer(4));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();

        assert!(get_object_from_stream(&contents, 0).is_some());
        assert!(get_object_from_stream(&contents, 1).is_none());
        assert!(get_object_from_stream(&contents, 100).is_none());
    }

    /// Object stream with dict containing string value for /N (wrong type).
    #[test]
    fn string_n_is_error() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(
            Name::n(),
            Object::String(rpdfium_core::PdfString::from_bytes(b"3".to_vec())),
        );
        dict.insert(Name::first(), Object::Integer(4));
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Object stream with zero-length data and /N=0 is valid.
    #[test]
    fn zero_objects_zero_data() {
        let data = b"";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(0));
        dict.insert(Name::first(), Object::Integer(0));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert!(contents.objects.is_empty());
    }

    /// Duplicate object numbers in header — both should parse successfully.
    #[test]
    fn duplicate_object_numbers() {
        // Two entries both claiming object number 10
        let data = b"10 0 10 3 42 true";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(2));
        dict.insert(Name::first(), Object::Integer(10));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert_eq!(contents.objects.len(), 2);
        assert_eq!(contents.objects[0].0, 10);
        assert_eq!(contents.objects[0].1.as_i64(), Some(42));
        assert_eq!(contents.objects[1].0, 10);
        assert_eq!(contents.objects[1].1.as_bool(), Some(true));
    }

    /// /N=1000 but only 2 header entries in the data — the stream is rejected
    /// rather than truncated to the entries that are present.
    #[test]
    fn very_large_n_limited_by_data() {
        // Header has only 2 entries, but /N claims 1000
        let data = b"5 0 6 3 42 true";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1000));
        dict.insert(Name::first(), Object::Integer(10));

        // The header tokenizer runs over the whole buffer, so the third
        // iteration reads on into the object data ("42 true") and does not
        // find a valid (integer, integer) pair there.
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());
    }

    /// Non-ascending offsets in header — both should parse correctly.
    #[test]
    fn unordered_offsets() {
        // Object 10 at offset 5, object 11 at offset 0 — reversed order
        let data = b"10 5 11 0 true 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(2));
        dict.insert(Name::first(), Object::Integer(10));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert_eq!(contents.objects.len(), 2);
        // Object 10 at offset 5 in obj_data ("true 42") is the integer 42
        assert_eq!(contents.objects[0].0, 10);
        assert_eq!(contents.objects[0].1.as_i64(), Some(42));
        // Object 11 at offset 0 in obj_data is the boolean true
        assert_eq!(contents.objects[1].0, 11);
        assert_eq!(contents.objects[1].1.as_bool(), Some(true));
    }

    /// /N=0 with non-empty data — should produce empty result.
    #[test]
    fn n_zero_with_data() {
        let data = b"10 0 42";
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(0));
        dict.insert(Name::first(), Object::Integer(4));

        let contents = parse_object_stream(data, &dict, ParsingMode::Strict).unwrap();
        assert!(contents.objects.is_empty());
    }

    /// /First equals data length — obj_data is empty, so all offsets are out of bounds.
    #[test]
    fn first_equals_data_length() {
        let data = b"5 0 42"; // length = 6
        let mut dict = HashMap::new();
        dict.insert(Name::n(), Object::Integer(1));
        dict.insert(Name::first(), Object::Integer(6)); // /First = data.len()

        // Object at offset 0 in empty obj_data — out of bounds
        let result = parse_object_stream(data, &dict, ParsingMode::Strict);
        assert!(result.is_err());

        // In lenient mode, should substitute Null
        let contents = parse_object_stream(data, &dict, ParsingMode::Lenient).unwrap();
        assert_eq!(contents.objects.len(), 1);
        assert!(contents.objects[0].1.is_null());
    }
}