Skip to main content

rpdfium_parser/
object_walker.rs

1// Derived from PDFium's cpdf_object_walker.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Object graph walker -- iterative BFS traversal with cycle detection.
7//!
8//! Provides a visitor pattern for traversing the PDF object graph starting
9//! from a root object. Uses BFS (breadth-first search) with a `VecDeque`
10//! work queue and `HashSet` for cycle detection. Never recurses on the
11//! call stack (WASM-safe).
12
13use std::collections::{HashMap, HashSet, VecDeque};
14
15use rpdfium_core::fx_system::MAX_OBJECT_NUMBER;
16use rpdfium_core::{Name, PdfSource};
17
18use crate::object::{Object, ObjectId};
19use crate::store::ObjectStore;
20
21/// Visitor trait for walking the PDF object graph.
22///
23/// Implement this trait to receive callbacks for each object type
24/// encountered during the walk.
25pub trait ObjectVisitor {
26    /// Called for each dictionary encountered.
27    fn visit_dict(&mut self, id: Option<ObjectId>, dict: &HashMap<Name, Object>);
28
29    /// Called for each array encountered.
30    fn visit_array(&mut self, id: Option<ObjectId>, arr: &[Object]);
31
32    /// Called for each stream encountered.
33    fn visit_stream(&mut self, id: Option<ObjectId>);
34
35    /// Called for each reference encountered.
36    fn visit_reference(&mut self, target: ObjectId);
37
38    /// Called for each primitive value (Integer, Real, Boolean, String, Name, Null).
39    fn visit_primitive(&mut self, id: Option<ObjectId>, obj: &Object);
40}
41
42/// Iterative BFS walker for the PDF object graph.
43///
44/// Traverses objects starting from a root, tracking visited objects to
45/// detect cycles. Enforces a safety limit of `MAX_OBJECT_NUMBER` visited
46/// objects to prevent denial of service.
47pub struct ObjectWalker {
48    visited: HashSet<ObjectId>,
49    max_objects: usize,
50}
51
52impl ObjectWalker {
53    /// Create a new walker with the default safety limit.
54    pub fn new() -> Self {
55        Self {
56            visited: HashSet::new(),
57            max_objects: MAX_OBJECT_NUMBER as usize,
58        }
59    }
60
61    /// Create a new walker with a custom maximum objects limit.
62    pub fn with_max_objects(max_objects: usize) -> Self {
63        Self {
64            visited: HashSet::new(),
65            max_objects,
66        }
67    }
68
69    /// Walk the object graph starting from `root_id`, calling visitor methods
70    /// for each object encountered.
71    ///
72    /// Uses iterative BFS with cycle detection. Silently skips objects that
73    /// cannot be resolved from the store.
74    pub fn walk<S: PdfSource>(
75        &mut self,
76        store: &ObjectStore<S>,
77        root_id: ObjectId,
78        visitor: &mut dyn ObjectVisitor,
79    ) {
80        let mut queue: VecDeque<ObjectId> = VecDeque::new();
81        queue.push_back(root_id);
82
83        while let Some(id) = queue.pop_front() {
84            // Cycle detection
85            if self.visited.contains(&id) {
86                continue;
87            }
88
89            // Safety limit
90            if self.visited.len() >= self.max_objects {
91                break;
92            }
93
94            self.visited.insert(id);
95
96            // Resolve the object from the store
97            let obj = match store.resolve(id) {
98                Ok(obj) => obj,
99                Err(_) => continue,
100            };
101
102            self.visit_object(obj, Some(id), visitor, &mut queue);
103        }
104    }
105
106    /// Visit a single object, dispatching to the appropriate visitor method
107    /// and enqueueing any references found.
108    fn visit_object(
109        &self,
110        obj: &Object,
111        id: Option<ObjectId>,
112        visitor: &mut dyn ObjectVisitor,
113        queue: &mut VecDeque<ObjectId>,
114    ) {
115        match obj {
116            Object::Dictionary(dict) => {
117                visitor.visit_dict(id, dict);
118                self.enqueue_dict_refs(dict, queue);
119            }
120            Object::Stream { dict, .. } => {
121                visitor.visit_stream(id);
122                visitor.visit_dict(id, dict);
123                self.enqueue_dict_refs(dict, queue);
124            }
125            Object::Array(arr) => {
126                visitor.visit_array(id, arr);
127                self.enqueue_array_refs(arr, queue);
128            }
129            Object::Reference(target) => {
130                visitor.visit_reference(*target);
131                if !self.visited.contains(target) {
132                    queue.push_back(*target);
133                }
134            }
135            _ => {
136                // Null, Boolean, Integer, Real, String, Name
137                visitor.visit_primitive(id, obj);
138            }
139        }
140    }
141
142    /// Enqueue all indirect references found in a dictionary's values.
143    fn enqueue_dict_refs(&self, dict: &HashMap<Name, Object>, queue: &mut VecDeque<ObjectId>) {
144        for value in dict.values() {
145            if let Object::Reference(target) = value {
146                if !self.visited.contains(target) {
147                    queue.push_back(*target);
148                }
149            }
150        }
151    }
152
153    /// Enqueue all indirect references found in an array's elements.
154    fn enqueue_array_refs(&self, arr: &[Object], queue: &mut VecDeque<ObjectId>) {
155        for elem in arr {
156            if let Object::Reference(target) = elem {
157                if !self.visited.contains(target) {
158                    queue.push_back(*target);
159                }
160            }
161        }
162    }
163
164    /// Returns the number of unique objects visited so far.
165    pub fn visited_count(&self) -> usize {
166        self.visited.len()
167    }
168
169    /// Returns a reference to the set of visited object IDs.
170    pub fn visited(&self) -> &HashSet<ObjectId> {
171        &self.visited
172    }
173}
174
175impl Default for ObjectWalker {
176    fn default() -> Self {
177        Self::new()
178    }
179}
180
181/// A built-in visitor that collects object type statistics.
182#[derive(Debug, Clone, Default)]
183pub struct ObjectStats {
184    /// Number of dictionaries visited.
185    pub dict_count: usize,
186    /// Number of arrays visited.
187    pub array_count: usize,
188    /// Number of streams visited.
189    pub stream_count: usize,
190    /// Number of references visited.
191    pub reference_count: usize,
192    /// Number of primitive values visited.
193    pub primitive_count: usize,
194    /// Total number of visitor callbacks.
195    pub total_visited: usize,
196}
197
198impl ObjectVisitor for ObjectStats {
199    fn visit_dict(&mut self, _id: Option<ObjectId>, _dict: &HashMap<Name, Object>) {
200        self.dict_count += 1;
201        self.total_visited += 1;
202    }
203
204    fn visit_array(&mut self, _id: Option<ObjectId>, _arr: &[Object]) {
205        self.array_count += 1;
206        self.total_visited += 1;
207    }
208
209    fn visit_stream(&mut self, _id: Option<ObjectId>) {
210        self.stream_count += 1;
211        self.total_visited += 1;
212    }
213
214    fn visit_reference(&mut self, _target: ObjectId) {
215        self.reference_count += 1;
216        self.total_visited += 1;
217    }
218
219    fn visit_primitive(&mut self, _id: Option<ObjectId>, _obj: &Object) {
220        self.primitive_count += 1;
221        self.total_visited += 1;
222    }
223}
224
225#[cfg(test)]
226mod tests {
227    use rpdfium_core::ParsingMode;
228
229    use super::*;
230
231    /// Build a minimal valid PDF for testing.
232    fn build_minimal_pdf() -> Vec<u8> {
233        let mut pdf = Vec::new();
234        pdf.extend_from_slice(b"%PDF-1.4\n");
235
236        let obj1_offset = pdf.len();
237        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
238
239        let obj2_offset = pdf.len();
240        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
241
242        let xref_offset = pdf.len();
243        pdf.extend_from_slice(b"xref\n");
244        pdf.extend_from_slice(b"0 3\n");
245        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
246        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
247        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj2_offset).as_bytes());
248        pdf.extend_from_slice(b"trailer\n");
249        pdf.extend_from_slice(b"<< /Size 3 /Root 1 0 R >>\n");
250        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
251
252        pdf
253    }
254
255    #[test]
256    fn test_walk_minimal_pdf() {
257        let pdf = build_minimal_pdf();
258        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
259
260        let mut stats = ObjectStats::default();
261        let mut walker = ObjectWalker::new();
262        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
263
264        // Object 1 (Catalog dict) and Object 2 (Pages dict) should be visited
265        assert_eq!(walker.visited_count(), 2);
266        assert!(walker.visited().contains(&ObjectId::new(1, 0)));
267        assert!(walker.visited().contains(&ObjectId::new(2, 0)));
268    }
269
270    #[test]
271    fn test_stats_collection_from_minimal_pdf() {
272        let pdf = build_minimal_pdf();
273        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
274
275        let mut stats = ObjectStats::default();
276        let mut walker = ObjectWalker::new();
277        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
278
279        // Catalog: 1 dict. Pages: 1 dict.
280        assert_eq!(stats.dict_count, 2);
281        // Pages /Kids is an empty array
282        assert_eq!(stats.array_count, 0); // Arrays inside dicts are not separately walked
283        assert_eq!(stats.stream_count, 0);
284        assert!(stats.total_visited > 0);
285    }
286
287    #[test]
288    fn test_cycle_detection() {
289        // Build a PDF where object 3 references itself via a dict value
290        let mut pdf = Vec::new();
291        pdf.extend_from_slice(b"%PDF-1.4\n");
292
293        let obj1_offset = pdf.len();
294        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
295
296        let obj2_offset = pdf.len();
297        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
298
299        let obj3_offset = pdf.len();
300        pdf.extend_from_slice(b"3 0 obj\n<< /Self 3 0 R >>\nendobj\n");
301
302        let xref_offset = pdf.len();
303        pdf.extend_from_slice(b"xref\n");
304        pdf.extend_from_slice(b"0 4\n");
305        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
306        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
307        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj2_offset).as_bytes());
308        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj3_offset).as_bytes());
309        pdf.extend_from_slice(b"trailer\n");
310        pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R >>\n");
311        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
312
313        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
314
315        let mut stats = ObjectStats::default();
316        let mut walker = ObjectWalker::new();
317        walker.walk(&store, ObjectId::new(3, 0), &mut stats);
318
319        // Should visit object 3 exactly once, then detect the cycle
320        assert_eq!(walker.visited_count(), 1);
321        assert!(walker.visited().contains(&ObjectId::new(3, 0)));
322    }
323
324    #[test]
325    fn test_max_objects_safety_limit() {
326        // Build a PDF with 3 objects but set max_objects to 1
327        let pdf = build_minimal_pdf();
328        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
329
330        let mut stats = ObjectStats::default();
331        let mut walker = ObjectWalker::with_max_objects(1);
332        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
333
334        // Should stop after visiting 1 object
335        assert_eq!(walker.visited_count(), 1);
336    }
337
338    #[test]
339    fn test_walk_with_no_references() {
340        // Build a PDF with a single object containing no references
341        let mut pdf = Vec::new();
342        pdf.extend_from_slice(b"%PDF-1.4\n");
343
344        let obj1_offset = pdf.len();
345        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
346
347        let xref_offset = pdf.len();
348        pdf.extend_from_slice(b"xref\n");
349        pdf.extend_from_slice(b"0 2\n");
350        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
351        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
352        pdf.extend_from_slice(b"trailer\n");
353        pdf.extend_from_slice(b"<< /Size 2 /Root 1 0 R >>\n");
354        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
355
356        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
357
358        let mut stats = ObjectStats::default();
359        let mut walker = ObjectWalker::new();
360        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
361
362        assert_eq!(walker.visited_count(), 1);
363        assert_eq!(stats.dict_count, 1);
364        assert_eq!(stats.reference_count, 0);
365    }
366
367    #[test]
368    fn test_object_stats_counters() {
369        let mut stats = ObjectStats::default();
370        assert_eq!(stats.dict_count, 0);
371        assert_eq!(stats.array_count, 0);
372        assert_eq!(stats.stream_count, 0);
373        assert_eq!(stats.reference_count, 0);
374        assert_eq!(stats.primitive_count, 0);
375        assert_eq!(stats.total_visited, 0);
376
377        // Manually call visitor methods
378        stats.visit_dict(None, &HashMap::new());
379        stats.visit_array(None, &[]);
380        stats.visit_stream(None);
381        stats.visit_reference(ObjectId::new(1, 0));
382        stats.visit_primitive(None, &Object::Integer(42));
383
384        assert_eq!(stats.dict_count, 1);
385        assert_eq!(stats.array_count, 1);
386        assert_eq!(stats.stream_count, 1);
387        assert_eq!(stats.reference_count, 1);
388        assert_eq!(stats.primitive_count, 1);
389        assert_eq!(stats.total_visited, 5);
390    }
391
392    #[test]
393    fn test_visit_callbacks_fire_correctly() {
394        // Track which callbacks fired and in what order
395        struct CallbackTracker {
396            events: Vec<String>,
397        }
398        impl ObjectVisitor for CallbackTracker {
399            fn visit_dict(&mut self, id: Option<ObjectId>, _dict: &HashMap<Name, Object>) {
400                self.events
401                    .push(format!("dict:{}", id.map_or(0, |i| i.number)));
402            }
403            fn visit_array(&mut self, id: Option<ObjectId>, _arr: &[Object]) {
404                self.events
405                    .push(format!("array:{}", id.map_or(0, |i| i.number)));
406            }
407            fn visit_stream(&mut self, id: Option<ObjectId>) {
408                self.events
409                    .push(format!("stream:{}", id.map_or(0, |i| i.number)));
410            }
411            fn visit_reference(&mut self, target: ObjectId) {
412                self.events.push(format!("ref:{}", target.number));
413            }
414            fn visit_primitive(&mut self, id: Option<ObjectId>, _obj: &Object) {
415                self.events
416                    .push(format!("prim:{}", id.map_or(0, |i| i.number)));
417            }
418        }
419
420        let pdf = build_minimal_pdf();
421        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
422
423        let mut tracker = CallbackTracker { events: Vec::new() };
424        let mut walker = ObjectWalker::new();
425        walker.walk(&store, ObjectId::new(1, 0), &mut tracker);
426
427        // Object 1 is a dict (Catalog) containing /Pages 2 0 R
428        assert!(tracker.events.contains(&"dict:1".to_string()));
429        // Object 2 is a dict (Pages)
430        assert!(tracker.events.contains(&"dict:2".to_string()));
431    }
432
433    #[test]
434    fn test_walk_nonexistent_root() {
435        let pdf = build_minimal_pdf();
436        let store = ObjectStore::open(pdf, ParsingMode::Lenient).unwrap();
437
438        let mut stats = ObjectStats::default();
439        let mut walker = ObjectWalker::new();
440        walker.walk(&store, ObjectId::new(999, 0), &mut stats);
441
442        // Nonexistent root should result in 0 visited objects
443        // (it gets inserted into visited set but fails to resolve)
444        assert_eq!(stats.total_visited, 0);
445    }
446
447    // -----------------------------------------------------------------------
448    // Upstream-derived object walker tests (cpdf_object_walker_unittest.cpp)
449    // -----------------------------------------------------------------------
450
451    /// Upstream: TEST(ObjectWalkerTest, Simple)
452    ///
453    /// Walking a single simple object should visit exactly that object.
454    /// In our BFS walker design, we test with store-based objects.
455    #[test]
456    fn test_walker_simple_dict() {
457        // Build a PDF with a single dict object (no references)
458        let mut pdf = Vec::new();
459        pdf.extend_from_slice(b"%PDF-1.4\n");
460        let obj1_offset = pdf.len();
461        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
462        let xref_offset = pdf.len();
463        pdf.extend_from_slice(b"xref\n0 2\n");
464        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
465        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
466        pdf.extend_from_slice(b"trailer\n<< /Size 2 /Root 1 0 R >>\n");
467        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
468
469        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
470        let mut stats = ObjectStats::default();
471        let mut walker = ObjectWalker::new();
472        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
473
474        // Single dict object, no references to follow
475        assert_eq!(walker.visited_count(), 1);
476        assert_eq!(stats.dict_count, 1);
477    }
478
479    /// Upstream: TEST(ObjectWalkerTest, CombinedObject)
480    ///
481    /// Walking a dict that contains references to other objects should visit
482    /// all reachable objects.
483    #[test]
484    fn test_walker_combined_object() {
485        let pdf = build_minimal_pdf();
486        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
487
488        let mut stats = ObjectStats::default();
489        let mut walker = ObjectWalker::new();
490        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
491
492        // Object 1 (Catalog dict with /Pages 2 0 R) → follows ref to Object 2
493        assert_eq!(walker.visited_count(), 2);
494        assert!(walker.visited().contains(&ObjectId::new(1, 0)));
495        assert!(walker.visited().contains(&ObjectId::new(2, 0)));
496        // Both are dicts
497        assert_eq!(stats.dict_count, 2);
498    }
499
500    /// Upstream: TEST(ObjectWalkerTest, GetParent)
501    ///
502    /// In the C++ implementation, GetParent returns the parent object.
503    /// In our BFS walker, parent tracking happens through the visitor pattern.
504    /// Our walker follows indirect references from dict values but does not
505    /// descend into inline arrays. Here we verify that direct dict-to-dict
506    /// reference chains are walked correctly.
507    #[test]
508    fn test_walker_get_parent_tracking() {
509        // Build a 2-level deep PDF where references are direct dict values:
510        // Obj 1 (Catalog) → /Pages 2 0 R → Obj 2 (Pages)
511        let pdf = build_minimal_pdf();
512        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
513
514        // Track visit order
515        struct OrderTracker {
516            order: Vec<u32>,
517        }
518        impl ObjectVisitor for OrderTracker {
519            fn visit_dict(&mut self, id: Option<ObjectId>, _dict: &HashMap<Name, Object>) {
520                if let Some(id) = id {
521                    self.order.push(id.number);
522                }
523            }
524            fn visit_array(&mut self, _id: Option<ObjectId>, _arr: &[Object]) {}
525            fn visit_stream(&mut self, _id: Option<ObjectId>) {}
526            fn visit_reference(&mut self, _target: ObjectId) {}
527            fn visit_primitive(&mut self, _id: Option<ObjectId>, _obj: &Object) {}
528        }
529
530        let mut tracker = OrderTracker { order: Vec::new() };
531        let mut walker = ObjectWalker::new();
532        walker.walk(&store, ObjectId::new(1, 0), &mut tracker);
533
534        // Both objects visited, Object 1 first (root), then Object 2
535        assert_eq!(walker.visited_count(), 2);
536        assert!(tracker.order.contains(&1));
537        assert!(tracker.order.contains(&2));
538    }
539
540    /// Upstream: TEST(ObjectWalkerTest, SkipWalkIntoCurrentObject)
541    ///
542    /// In the C++ walker, SkipWalkIntoCurrentObject prevents descending into
543    /// the current object's children. Our BFS walker doesn't have this
544    /// mechanism, but we test that the max_objects limit achieves similar control.
545    #[test]
546    fn test_walker_skip_via_max_objects() {
547        let pdf = build_minimal_pdf();
548        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
549
550        // Limit to 1 object — should only visit the root
551        let mut stats = ObjectStats::default();
552        let mut walker = ObjectWalker::with_max_objects(1);
553        walker.walk(&store, ObjectId::new(1, 0), &mut stats);
554
555        assert_eq!(walker.visited_count(), 1);
556        assert!(walker.visited().contains(&ObjectId::new(1, 0)));
557        assert!(!walker.visited().contains(&ObjectId::new(2, 0)));
558    }
559
560    /// Upstream: TEST(ObjectWalkerTest, DictionaryKey)
561    ///
562    /// In the C++ walker, dictionary_key() returns the key name for the current
563    /// child being iterated. We verify that dict keys are preserved when
564    /// objects are resolved from the store.
565    #[test]
566    fn test_walker_dictionary_key_tracking() {
567        let pdf = build_minimal_pdf();
568        let store = ObjectStore::open(pdf, ParsingMode::Strict).unwrap();
569
570        // Resolve the Catalog dict and verify its keys are correct
571        let catalog = store.resolve(ObjectId::new(1, 0)).unwrap();
572        let dict = catalog.as_dict().unwrap();
573
574        // The Catalog should have /Type and /Pages keys
575        assert!(dict.contains_key(&Name::r#type()));
576        assert!(dict.contains_key(&Name::pages()));
577
578        // Verify the values are accessible
579        let type_val = dict.get(&Name::r#type()).unwrap();
580        assert!(type_val.is_name());
581
582        let pages_val = dict.get(&Name::pages()).unwrap();
583        assert!(pages_val.is_reference());
584        assert_eq!(pages_val.as_reference(), Some(ObjectId::new(2, 0)));
585    }
586}