Skip to main content

pdf_syntax/
data.rs

1use crate::object::ObjectIdentifier;
2use crate::object::Stream;
3use crate::reader::ReaderContext;
4use crate::sync::HashMap;
5use crate::sync::{Arc, Mutex, MutexExt};
6use crate::util::SegmentList;
7use alloc::vec::Vec;
8use core::fmt::{Debug, Formatter};
9
/// Parsed `(object_number, absolute_byte_offset)` table for an object stream
/// (PDF 1.5 compressed `/ObjStm`).
///
/// This is a value type — once produced from the stream header it never
/// changes for the lifetime of the source `Data`. `Data` caches one table per
/// object stream and hands it out behind an `Arc`, so the table itself is
/// never copied on a cache hit.
pub(crate) type ObjectStreamOffsets = Vec<(u32, usize)>;
16
/// A container for the bytes of a PDF file.
///
/// The bytes live behind a shared, type-erased `Arc`, so cloning a `PdfData`
/// clones the handle rather than the underlying buffer. Any source that
/// implements `AsRef<[u8]>` can be wrapped via the `From` conversions.
#[derive(Clone)]
pub struct PdfData {
    // With the `std` feature the erased source must also be `Send + Sync`.
    #[cfg(feature = "std")]
    inner: Arc<dyn AsRef<[u8]> + Send + Sync>,
    // Without `std` no thread-safety bounds are placed on the source.
    #[cfg(not(feature = "std"))]
    inner: Arc<dyn AsRef<[u8]>>,
}
25
26impl Debug for PdfData {
27    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
28        write!(f, "PdfData {{ ... }}")
29    }
30}
31
32impl AsRef<[u8]> for PdfData {
33    fn as_ref(&self) -> &[u8] {
34        (*self.inner).as_ref()
35    }
36}
37
/// Wraps an already shared byte source without copying it (`std` builds).
#[cfg(feature = "std")]
impl<T> From<Arc<T>> for PdfData
where
    T: AsRef<[u8]> + Send + Sync + 'static,
{
    fn from(data: Arc<T>) -> Self {
        // Erase the concrete source type; the allocation itself is reused.
        let inner: Arc<dyn AsRef<[u8]> + Send + Sync> = data;
        PdfData { inner }
    }
}
44
45#[cfg(not(feature = "std"))]
46impl<T: AsRef<[u8]> + 'static> From<Arc<T>> for PdfData {
47    fn from(data: Arc<T>) -> Self {
48        Self { inner: data }
49    }
50}
51
52impl From<Vec<u8>> for PdfData {
53    fn from(data: Vec<u8>) -> Self {
54        Self {
55            inner: Arc::new(data),
56        }
57    }
58}
59
/// A structure for storing the data of the PDF.
// To explain further: This crate uses a zero-parse approach, meaning that objects like
// dictionaries or arrays always store the underlying data and parse objects lazily as needed,
// instead of allocating the data and storing it in an owned way. However, the problem is that
// not all data is readily available in the original data of the PDF: Objects can also be
// stored in object streams, in which case we first need to decode the stream before we can
// access the data.
//
// The purpose of `Data` is to allow us to access the original data as well as maybe decoded data
// by faking the same lifetime, so that we don't run into lifetime issues when dealing with
// PDF objects that actually stem from different data sources.
pub(crate) struct Data {
    // The original, undecoded bytes of the PDF.
    data: PdfData,
    // Decoded object-stream contents, one slot per object stream that has been
    // requested; a slot holds `None` when decoding that stream failed.
    // 32 segments are more than enough as we can't have more objects than this.
    decoded: SegmentList<Option<Vec<u8>>, 32>,
    // Maps an object stream's identifier to the index of its slot in `decoded`.
    map: Mutex<HashMap<ObjectIdentifier, usize>>,
    // QF2-B: cache of parsed `(obj_num, abs_offset)` index tables, keyed by the
    // object stream's own `ObjectIdentifier`. Lives for the lifetime of the
    // owning document and is dropped together with it — there is no
    // cross-document leakage and the underlying byte slice (`data`) is
    // immutable for the same lifetime, so cache entries can never become
    // stale.
    object_stream_offsets: Mutex<HashMap<ObjectIdentifier, Arc<ObjectStreamOffsets>>>,
}
84
85impl Debug for Data {
86    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
87        write!(f, "Data {{ ... }}")
88    }
89}
90
91impl Data {
92    /// Create a new `Data` structure.
93    pub(crate) fn new(data: PdfData) -> Self {
94        Self {
95            data,
96            decoded: SegmentList::new(),
97            map: Mutex::new(HashMap::new()),
98            object_stream_offsets: Mutex::new(HashMap::new()),
99        }
100    }
101
102    /// Get access to the original data of the PDF.
103    pub(crate) fn get(&self) -> &PdfData {
104        &self.data
105    }
106
107    /// Look up the cached parsed `(obj_num, abs_offset)` index table for the
108    /// object stream `id`, computing it via `parse` on cache miss.
109    ///
110    /// Returns `None` if `parse` returns `None` (and does **not** populate
111    /// the cache in that case, so a later attempt can retry).
112    ///
113    /// # Invariants
114    ///
115    /// - Cache scope: per-`Data` (i.e. per source byte slice). Two distinct
116    ///   `Pdf` instances do not share an offsets cache.
117    /// - Cache lifetime: bounded by the owning `Pdf` document. Dropping the
118    ///   `Pdf` (and therefore the `Data`) frees all entries.
119    /// - Mutation safety: `PdfData` wraps an immutable `Arc<dyn AsRef<[u8]>>`,
120    ///   so the byte slice cannot change underneath a cache entry. There is
121    ///   no need for explicit invalidation.
122    pub(crate) fn get_object_stream_offsets_or_init<F>(
123        &self,
124        id: ObjectIdentifier,
125        parse: F,
126    ) -> Option<Arc<ObjectStreamOffsets>>
127    where
128        F: FnOnce() -> Option<ObjectStreamOffsets>,
129    {
130        if let Some(hit) = self.object_stream_offsets.get().get(&id).cloned() {
131            return Some(hit);
132        }
133
134        // Parse outside the lock to avoid holding it during potentially
135        // expensive work.
136        let parsed = Arc::new(parse()?);
137
138        // Insert-if-absent: another thread may have populated the entry while
139        // we parsed. Whichever entry is in the map first wins, and we hand
140        // that one back so callers always observe the same parsed table.
141        let mut locked = self.object_stream_offsets.get();
142        Some(locked.entry(id).or_insert(parsed).clone())
143    }
144
145    /// Number of cached object-stream offset tables. Test-only.
146    #[cfg(test)]
147    pub(crate) fn object_stream_offsets_cache_len(&self) -> usize {
148        self.object_stream_offsets.get().len()
149    }
150
151    /// Get access to the data of a decoded object stream.
152    pub(crate) fn get_with(&self, id: ObjectIdentifier, ctx: &ReaderContext<'_>) -> Option<&[u8]> {
153        if let Some(&idx) = self.map.get().get(&id) {
154            self.decoded.get(idx)?.as_deref()
155        } else {
156            // Block scope to keep the lock short-lived.
157            let idx = {
158                let mut locked = self.map.get();
159                let idx = locked.len();
160                locked.insert(id, idx);
161                idx
162            };
163            self.decoded
164                .get_or_init(idx, || {
165                    let stream = ctx.xref().get_with::<Stream<'_>>(id, ctx)?;
166                    stream.decoded().ok()
167                })
168                .as_deref()
169        }
170    }
171}
172
#[cfg(test)]
mod tests {
    //! QF2-B — unit tests for the per-document cache of parsed ObjectStream
    //! offset tables.
    //!
    //! The cache primitive is driven directly, so no full PDF is needed.
    //! End-to-end coverage on a real `/ObjStm`-containing file is provided
    //! indirectly by the existing pdf-syntax tests (the same code path is now
    //! wired through `get_object_stream_offsets_or_init`).

    use super::*;
    use core::sync::atomic::{AtomicUsize, Ordering};

    /// Builds a `Data` over sixteen zero bytes; the offsets cache never reads
    /// them.
    fn make_data() -> Data {
        let bytes = alloc::vec![0u8; 16];
        Data::new(PdfData::from(bytes))
    }

    /// Shorthand for an identifier with object number `n`; `0` is passed as
    /// the second constructor argument (presumably the generation number).
    fn id(n: i32) -> ObjectIdentifier {
        ObjectIdentifier::new(n, 0)
    }

    #[test]
    fn qf2b_cache_miss_parses_once_and_returns_same_arc() {
        let data = make_data();
        let parse_calls = AtomicUsize::new(0);

        let first = data.get_object_stream_offsets_or_init(id(7), || {
            parse_calls.fetch_add(1, Ordering::SeqCst);
            Some(alloc::vec![(1, 10), (2, 20), (3, 30)])
        });
        let first = first.expect("first parse must succeed");

        // The key is cached by now, so this closure must never be invoked.
        let second = data.get_object_stream_offsets_or_init(id(7), || {
            parse_calls.fetch_add(1, Ordering::SeqCst);
            Some(alloc::vec![(99, 99)])
        });
        let second = second.expect("cache hit must succeed");

        let expected: ObjectStreamOffsets = alloc::vec![(1, 10), (2, 20), (3, 30)];
        assert_eq!(parse_calls.load(Ordering::SeqCst), 1, "parse called exactly once");
        assert!(Arc::ptr_eq(&first, &second), "cache returns identical Arc");
        assert_eq!(*first, expected);
        assert_eq!(data.object_stream_offsets_cache_len(), 1);
    }

    #[test]
    fn qf2b_cache_miss_with_none_does_not_poison() {
        let data = make_data();

        let failed = data.get_object_stream_offsets_or_init(id(9), || None);
        assert!(failed.is_none(), "first parse may legitimately fail");
        assert_eq!(
            data.object_stream_offsets_cache_len(),
            0,
            "failed parses must not pollute the cache"
        );

        // A failed parse leaves no entry behind, so a retry runs `parse` again.
        let recovered = data
            .get_object_stream_offsets_or_init(id(9), || Some(alloc::vec![(5, 50)]))
            .expect("retry after None must succeed");
        let expected: ObjectStreamOffsets = alloc::vec![(5, 50)];
        assert_eq!(*recovered, expected);
        assert_eq!(data.object_stream_offsets_cache_len(), 1);
    }

    #[test]
    fn qf2b_distinct_ids_are_isolated() {
        let data = make_data();

        let table_one = data
            .get_object_stream_offsets_or_init(id(1), || Some(alloc::vec![(1, 10)]))
            .unwrap();
        let table_two = data
            .get_object_stream_offsets_or_init(id(2), || Some(alloc::vec![(2, 20)]))
            .unwrap();

        let expected_one: ObjectStreamOffsets = alloc::vec![(1, 10)];
        let expected_two: ObjectStreamOffsets = alloc::vec![(2, 20)];
        assert!(!Arc::ptr_eq(&table_one, &table_two));
        assert_eq!(*table_one, expected_one);
        assert_eq!(*table_two, expected_two);
        assert_eq!(data.object_stream_offsets_cache_len(), 2);
    }

    #[test]
    fn qf2b_distinct_data_instances_do_not_share_cache() {
        // Security boundary: cache is per-`Data`, never shared across
        // documents.
        let first_doc = make_data();
        let second_doc = make_data();

        let _ = first_doc
            .get_object_stream_offsets_or_init(id(7), || Some(alloc::vec![(1, 10)]))
            .unwrap();

        // The same id on the other document must be a miss.
        let parse_calls = AtomicUsize::new(0);
        let _ = second_doc
            .get_object_stream_offsets_or_init(id(7), || {
                parse_calls.fetch_add(1, Ordering::SeqCst);
                Some(alloc::vec![(2, 20)])
            })
            .unwrap();

        assert_eq!(
            parse_calls.load(Ordering::SeqCst),
            1,
            "d2 must invoke its own parse — no cross-document cache"
        );

        // Asking the second document again must be served from its own cache.
        let cached = second_doc
            .get_object_stream_offsets_or_init(id(7), || {
                parse_calls.fetch_add(1, Ordering::SeqCst);
                Some(alloc::vec![(3, 30)])
            })
            .unwrap();
        assert_eq!(
            parse_calls.load(Ordering::SeqCst),
            1,
            "second lookup on d2 must hit the d2 cache"
        );
        let expected: ObjectStreamOffsets = alloc::vec![(2, 20)];
        assert_eq!(*cached, expected);
    }

    #[test]
    fn qf2b_cache_drops_with_data() {
        // The cache holds `Arc<Vec<...>>`; when the `Data` is dropped the
        // inner allocation must also be reclaimed (no leak into a static
        // map). We assert this by checking strong_count after drop.
        let survivor = {
            let data = make_data();
            let cached = data
                .get_object_stream_offsets_or_init(id(1), || Some(alloc::vec![(1, 10)]))
                .unwrap();
            assert!(Arc::strong_count(&cached) >= 2, "cache + caller refs");
            cached
        };
        // `data` is gone here, so the caller's handle is the only one left.
        assert_eq!(Arc::strong_count(&survivor), 1);
    }
}
312}