Skip to main content

pdf_interpret/
cache.rs

1use crate::util::hash128;
2use kurbo::Affine;
3use pdf_syntax::object::{Array, Dict, MaybeRef, Name, Null, ObjRef, Object, Stream};
4use std::any::Any;
5use std::collections::hash_map::Entry;
6use std::collections::{HashMap, VecDeque};
7use std::sync::{Arc, Mutex};
8
9type CacheMap = HashMap<u128, Option<Box<dyn Any + Send + Sync>>>;
10
11/// Maximum number of decoded images retained by the document-level image cache.
12///
13/// The cache is bounded by BOTH entry count (this constant) and total decoded
14/// bytes ([`IMAGE_CACHE_MAX_BYTES`]); an insertion evicts least-recently-used
15/// entries until both bounds hold. The count cap catches "many small images"
16/// (logos, headers/footers repeated across pages); the byte cap catches "few
17/// but huge images" that would otherwise pass the count cap while consuming
18/// hundreds of MB. The cache is per-`PdfDocument` and lives only as long as
19/// that document.
20const IMAGE_CACHE_CAPACITY: usize = 32;
21
22/// Maximum total decoded-image bytes retained by the document-level image cache.
23///
24/// 256 MiB is a conservative working-set ceiling for image-heavy documents: it
25/// comfortably holds typical repeated assets while bounding peak transient
26/// memory for pathological inputs (e.g. a handful of full-page scans). An entry
27/// larger than the whole budget is returned to the caller but not retained.
28const IMAGE_CACHE_MAX_BYTES: usize = 256 * 1024 * 1024;
29
30struct ImageCache {
31    /// Maps a cache key to the decoded value and its accounted byte weight.
32    map: HashMap<u128, (Box<dyn Any + Send + Sync>, usize)>,
33    /// Least-recently-used order; front = oldest, back = most recently used.
34    order: VecDeque<u128>,
35    /// Sum of the byte weights of all entries currently in `map`.
36    total_bytes: usize,
37}
38
39/// A cache to store decoded images and other objects to avoid parsing/decoding them repeatedly.
40#[derive(Clone)]
41pub struct Cache {
42    generic: Arc<Mutex<CacheMap>>,
43    images: Arc<Mutex<ImageCache>>,
44}
45
46impl Default for Cache {
47    fn default() -> Self {
48        Self::new()
49    }
50}
51
52impl Cache {
53    /// Create a new, empty cache.
54    pub fn new() -> Self {
55        Self {
56            generic: Arc::new(Mutex::new(HashMap::new())),
57            images: Arc::new(Mutex::new(ImageCache {
58                map: HashMap::new(),
59                order: VecDeque::new(),
60                total_bytes: 0,
61            })),
62        }
63    }
64
65    pub(crate) fn get_or_insert_with<T: Clone + Send + Sync + 'static>(
66        &self,
67        id: u128,
68        f: impl FnOnce() -> Option<T>,
69    ) -> Option<T> {
70        let mut locked = self.generic.lock().unwrap();
71
72        // We can't use `get_or_insert_with` here, because if the closure makes another access to the
73        // cache, we end up with a deadlock.
74        match locked.entry(id) {
75            Entry::Occupied(o) => o
76                .get()
77                .as_ref()
78                .and_then(|val| val.downcast_ref::<T>().cloned()),
79            Entry::Vacant(_) => {
80                drop(locked);
81                let val = f();
82                self.generic.lock().unwrap().insert(
83                    id,
84                    val.clone()
85                        .map(|val| Box::new(val) as Box<dyn Any + Send + Sync>),
86                );
87
88                val
89            }
90        }
91    }
92
93    /// Look up a decoded image by `id`, or compute it with `f` and cache it.
94    ///
95    /// `weight` returns the byte size of a freshly-computed value; it is used to
96    /// keep the cache within [`IMAGE_CACHE_MAX_BYTES`]. The cache is also bounded
97    /// to [`IMAGE_CACHE_CAPACITY`] entries. After an insertion, least-recently-
98    /// used entries are evicted until both bounds hold (a single value larger
99    /// than the whole budget is returned but not retained).
100    pub(crate) fn get_or_insert_image<T: Clone + Send + Sync + 'static>(
101        &self,
102        id: u128,
103        f: impl FnOnce() -> Option<T>,
104        weight: impl FnOnce(&T) -> usize,
105    ) -> Option<T> {
106        let mut locked = self.images.lock().unwrap();
107        if locked.map.contains_key(&id) {
108            if let Some(pos) = locked.order.iter().position(|&x| x == id) {
109                locked.order.remove(pos);
110            }
111            locked.order.push_back(id);
112            let (val, _) = locked.map.get(&id).unwrap();
113            return val.downcast_ref::<T>().cloned();
114        }
115
116        drop(locked);
117        let val = f();
118        if let Some(ref v) = val {
119            let w = weight(v);
120            let mut locked = self.images.lock().unwrap();
121            // A repeated key would otherwise leak its old weight; subtract it.
122            if let Some((_, old_w)) = locked
123                .map
124                .insert(id, (Box::new(v.clone()) as Box<dyn Any + Send + Sync>, w))
125            {
126                locked.total_bytes = locked.total_bytes.saturating_sub(old_w);
127                if let Some(pos) = locked.order.iter().position(|&x| x == id) {
128                    locked.order.remove(pos);
129                }
130            }
131            locked.total_bytes += w;
132            locked.order.push_back(id);
133
134            // Evict least-recently-used entries until both the count and the
135            // byte budget are satisfied.
136            while !locked.order.is_empty()
137                && (locked.order.len() > IMAGE_CACHE_CAPACITY
138                    || locked.total_bytes > IMAGE_CACHE_MAX_BYTES)
139            {
140                if let Some(oldest_id) = locked.order.pop_front()
141                    && let Some((_, evicted_w)) = locked.map.remove(&oldest_id)
142                {
143                    locked.total_bytes = locked.total_bytes.saturating_sub(evicted_w);
144                }
145            }
146        }
147        val
148    }
149
150    /// Current image-cache occupancy as `(entry_count, total_bytes)`.
151    ///
152    /// Exposed for measurement and tests; both values are always within
153    /// [`IMAGE_CACHE_CAPACITY`] and [`IMAGE_CACHE_MAX_BYTES`] respectively after
154    /// any completed `get_or_insert_image` call.
155    #[cfg(test)]
156    pub(crate) fn image_cache_stats(&self) -> (usize, usize) {
157        let locked = self.images.lock().unwrap();
158        (locked.map.len(), locked.total_bytes)
159    }
160}
161
162/// A trait for objects that can generate a unique cache key.
163pub trait CacheKey {
164    /// Returns the cache key for this object.
165    fn cache_key(&self) -> u128;
166}
167
168impl<T: CacheKey, U: CacheKey> CacheKey for (T, U) {
169    fn cache_key(&self) -> u128 {
170        hash128(&(self.0.cache_key(), self.1.cache_key()))
171    }
172}
173
174impl CacheKey for Dict<'_> {
175    fn cache_key(&self) -> u128 {
176        hash128(self.data())
177    }
178}
179
180impl CacheKey for Stream<'_> {
181    fn cache_key(&self) -> u128 {
182        self.dict().cache_key()
183    }
184}
185
186impl CacheKey for Null {
187    fn cache_key(&self) -> u128 {
188        hash128(self)
189    }
190}
191
192impl CacheKey for bool {
193    fn cache_key(&self) -> u128 {
194        hash128(self)
195    }
196}
197
198impl CacheKey for pdf_syntax::object::Number {
199    fn cache_key(&self) -> u128 {
200        hash128(&self.as_f64().to_bits())
201    }
202}
203
204impl CacheKey for pdf_syntax::object::String {
205    fn cache_key(&self) -> u128 {
206        hash128(self.as_ref())
207    }
208}
209
210impl CacheKey for Name {
211    fn cache_key(&self) -> u128 {
212        hash128(self)
213    }
214}
215
216impl CacheKey for Array<'_> {
217    fn cache_key(&self) -> u128 {
218        hash128(self.data())
219    }
220}
221
222impl CacheKey for Object<'_> {
223    fn cache_key(&self) -> u128 {
224        match self {
225            Object::Null(n) => n.cache_key(),
226            Object::Boolean(b) => b.cache_key(),
227            Object::Number(n) => n.cache_key(),
228            Object::String(s) => s.cache_key(),
229            Object::Name(n) => n.cache_key(),
230            Object::Dict(d) => d.cache_key(),
231            Object::Array(a) => a.cache_key(),
232            Object::Stream(s) => s.cache_key(),
233        }
234    }
235}
236
237impl CacheKey for ObjRef {
238    fn cache_key(&self) -> u128 {
239        hash128(self)
240    }
241}
242
243impl<T: CacheKey> CacheKey for MaybeRef<T> {
244    fn cache_key(&self) -> u128 {
245        match self {
246            Self::Ref(r) => r.cache_key(),
247            Self::NotRef(o) => o.cache_key(),
248        }
249    }
250}
251
252impl CacheKey for Affine {
253    fn cache_key(&self) -> u128 {
254        let c = self.as_coeffs();
255        hash128(&[
256            c[0].to_bits(),
257            c[1].to_bits(),
258            c[2].to_bits(),
259            c[3].to_bits(),
260            c[4].to_bits(),
261            c[5].to_bits(),
262        ])
263    }
264}
265
266impl CacheKey for u128 {
267    fn cache_key(&self) -> u128 {
268        hash128(self)
269    }
270}
271
272#[cfg(test)]
273mod tests {
274    use super::*;
275
276    #[test]
277    fn test_generic_cache() {
278        let cache = Cache::new();
279        let val = cache.get_or_insert_with(12345, || Some("hello".to_string()));
280        assert_eq!(val, Some("hello".to_string()));
281
282        let cached = cache.get_or_insert_with(12345, || Some("world".to_string()));
283        assert_eq!(cached, Some("hello".to_string()));
284    }
285
286    // A small weight; keeps these count-focused cases far below the byte budget.
287    fn tiny(_: &i32) -> usize {
288        4
289    }
290
291    #[test]
292    fn test_image_cache_lru_eviction() {
293        let cache = Cache::new();
294
295        // 1. Insert 32 unique items
296        for i in 0..32 {
297            let val = cache.get_or_insert_image(i as u128, || Some(i), tiny);
298            assert_eq!(val, Some(i));
299        }
300
301        // Verify all 32 items are still there
302        for i in 0..32 {
303            let cached = cache.get_or_insert_image::<i32>(i as u128, || None, tiny);
304            assert_eq!(cached, Some(i));
305        }
306
307        // 2. Access key 0 to make it recently used (brings it to the back)
308        let first_accessed = cache.get_or_insert_image::<i32>(0, || None, tiny);
309        assert_eq!(first_accessed, Some(0));
310
311        // 3. Insert a new item (key 32). This evicts the oldest entry, which is
312        //    key 1 (key 0 was just touched).
313        let val32 = cache.get_or_insert_image(32, || Some(32), tiny);
314        assert_eq!(val32, Some(32));
315
316        // Key 1 should be None (evicted)
317        let evicted1 = cache.get_or_insert_image::<i32>(1, || None, tiny);
318        assert_eq!(evicted1, None);
319
320        // Key 0 should still be Some(0) (since we touched it)
321        let kept0 = cache.get_or_insert_image::<i32>(0, || None, tiny);
322        assert_eq!(kept0, Some(0));
323    }
324
325    #[test]
326    fn test_image_cache_evicts_by_byte_budget() {
327        let cache = Cache::new();
328        let big = 100 * 1024 * 1024; // 100 MiB each; three => 300 MiB > 256 MiB.
329
330        cache.get_or_insert_image(1u128, || Some(1i32), |_| big);
331        cache.get_or_insert_image(2u128, || Some(2i32), |_| big);
332        cache.get_or_insert_image(3u128, || Some(3i32), |_| big);
333
334        // Only three entries (far under the 32 count cap), but their bytes exceed
335        // the budget, so the least-recently-used (key 1) is evicted.
336        let (count, bytes) = cache.image_cache_stats();
337        assert_eq!(count, 2, "byte budget should drop the cache to two entries");
338        assert!(
339            bytes <= IMAGE_CACHE_MAX_BYTES,
340            "total bytes must stay within budget"
341        );
342        assert_eq!(
343            cache.get_or_insert_image::<i32>(1, || None, |_| 0),
344            None,
345            "oldest entry evicted by byte budget"
346        );
347        assert_eq!(cache.get_or_insert_image::<i32>(2, || None, |_| 0), Some(2));
348        assert_eq!(cache.get_or_insert_image::<i32>(3, || None, |_| 0), Some(3));
349    }
350
351    #[test]
352    fn test_image_cache_byte_budget_respects_lru_touch() {
353        let cache = Cache::new();
354        let big = 100 * 1024 * 1024;
355
356        cache.get_or_insert_image(1u128, || Some(1i32), |_| big);
357        cache.get_or_insert_image(2u128, || Some(2i32), |_| big);
358        // Touch key 1 so it becomes most-recently-used.
359        assert_eq!(cache.get_or_insert_image::<i32>(1, || None, |_| 0), Some(1));
360        // Inserting key 3 pushes bytes over budget; the LRU victim is now key 2.
361        cache.get_or_insert_image(3u128, || Some(3i32), |_| big);
362
363        assert_eq!(
364            cache.get_or_insert_image::<i32>(2, || None, |_| 0),
365            None,
366            "least-recently-used key 2 evicted, not the touched key 1"
367        );
368        assert_eq!(cache.get_or_insert_image::<i32>(1, || None, |_| 0), Some(1));
369        assert_eq!(cache.get_or_insert_image::<i32>(3, || None, |_| 0), Some(3));
370    }
371
372    #[test]
373    fn test_image_cache_oversized_item_not_retained() {
374        let cache = Cache::new();
375        let huge = IMAGE_CACHE_MAX_BYTES + 1; // larger than the whole budget
376
377        // The value is still returned to the caller for this call...
378        assert_eq!(
379            cache.get_or_insert_image(1u128, || Some(1i32), |_| huge),
380            Some(1)
381        );
382        // ...but it is evicted immediately, so the cache retains nothing.
383        let (count, bytes) = cache.image_cache_stats();
384        assert_eq!(count, 0, "an item larger than the budget is not retained");
385        assert_eq!(bytes, 0);
386        assert_eq!(cache.get_or_insert_image::<i32>(1, || None, |_| 0), None);
387    }
388
389    #[test]
390    fn test_image_cache_stats_stay_within_both_bounds() {
391        let cache = Cache::new();
392        // 100 small (1 KiB) items: the count cap bounds entries to 32; bytes
393        // stay tiny. This is the measurement asserting both bounds hold.
394        for i in 0..100u128 {
395            cache.get_or_insert_image(i, || Some(i as i32), |_| 1024);
396        }
397        let (count, bytes) = cache.image_cache_stats();
398        assert_eq!(count, IMAGE_CACHE_CAPACITY, "count cap holds at 32 entries");
399        assert_eq!(
400            bytes,
401            IMAGE_CACHE_CAPACITY * 1024,
402            "32 retained * 1 KiB each"
403        );
404        assert!(bytes <= IMAGE_CACHE_MAX_BYTES);
405    }
406}