// sqz_engine/cache_manager.rs

1use std::path::Path;
2
3use sha2::{Digest, Sha256};
4
5use crate::error::Result;
6use crate::pipeline::{CompressionPipeline, SessionContext};
7use crate::preset::Preset;
8use crate::session_store::SessionStore;
9use crate::types::CompressedContent;
10
11/// Outcome of a cache lookup in [`CacheManager`].
12pub enum CacheResult {
13    /// Previously seen content — returns a short inline reference (~13 tokens).
14    Dedup {
15        /// Inline token of the form `§ref:<hash_prefix>§`.
16        inline_ref: String,
17        /// Approximate token cost of the reference (always 13).
18        token_cost: u32,
19    },
20    /// Content not seen before — full compression result.
21    Fresh { output: CompressedContent },
22}
23
24/// SHA-256 content-hash deduplication cache backed by [`SessionStore`].
25pub struct CacheManager {
26    store: SessionStore,
27    max_size_bytes: u64,
28}
29
30impl CacheManager {
31    pub fn new(store: SessionStore, max_size_bytes: u64) -> Self {
32        Self {
33            store,
34            max_size_bytes,
35        }
36    }
37
38    /// Compute the SHA-256 hex digest of `bytes`.
39    fn sha256_hex(bytes: &[u8]) -> String {
40        let mut hasher = Sha256::new();
41        hasher.update(bytes);
42        format!("{:x}", hasher.finalize())
43    }
44
45    /// Look up `content` in the cache.
46    ///
47    /// - On dedup: return `CacheResult::Dedup` with a compact reference token.
48    /// - On fresh: compress via `pipeline`, persist to store, return
49    ///   `CacheResult::Fresh`.
50    pub fn get_or_compress(
51        &self,
52        _path: &Path,
53        content: &[u8],
54        pipeline: &CompressionPipeline,
55    ) -> Result<CacheResult> {
56        let hash = Self::sha256_hex(content);
57
58        if self.store.get_cache_entry(&hash)?.is_some() {
59            let hash_prefix = &hash[..16];
60            let inline_ref = format!("§ref:{hash_prefix}§");
61            return Ok(CacheResult::Dedup {
62                inline_ref,
63                token_cost: 13,
64            });
65        }
66
67        let text = String::from_utf8_lossy(content).into_owned();
68        let ctx = SessionContext {
69            session_id: "cache".to_string(),
70        };
71        let preset = Preset::default();
72        let compressed = pipeline.compress(&text, &ctx, &preset)?;
73        self.store.save_cache_entry(&hash, &compressed)?;
74
75        Ok(CacheResult::Fresh { output: compressed })
76    }
77
78    /// Invalidate the cache entry for `path` if its current content is known.
79    ///
80    /// Reads the file at `path`, computes its hash, and removes the matching
81    /// entry from the store.  If the file does not exist the call is a no-op.
82    pub fn invalidate(&self, path: &Path) -> Result<()> {
83        if !path.exists() {
84            return Ok(());
85        }
86        let bytes = std::fs::read(path)?;
87        let hash = Self::sha256_hex(&bytes);
88        self.store.delete_cache_entry(&hash)?;
89        Ok(())
90    }
91
92    /// Evict least-recently-used entries until total cache size is at or below
93    /// `max_size_bytes`.
94    ///
95    /// Returns the number of bytes freed.
96    pub fn evict_lru(&self) -> Result<u64> {
97        let entries = self.store.list_cache_entries_lru()?;
98
99        // Compute current total size.
100        let total: u64 = entries.iter().map(|(_, sz)| sz).sum();
101        if total <= self.max_size_bytes {
102            return Ok(0);
103        }
104
105        let mut freed: u64 = 0;
106        let mut remaining = total;
107
108        for (hash, size) in &entries {
109            if remaining <= self.max_size_bytes {
110                break;
111            }
112            self.store.delete_cache_entry(hash)?;
113            freed += size;
114            remaining -= size;
115        }
116
117        Ok(freed)
118    }
119}
120
121// ── Tests ─────────────────────────────────────────────────────────────────────
122
123#[cfg(test)]
124mod tests {
125    use super::*;
126    use crate::preset::{
127        BudgetConfig, CollapseArraysConfig, CompressionConfig, CondenseConfig,
128        CustomTransformsConfig, ModelConfig, PresetMeta, StripNullsConfig, TerseModeConfig,
129        ToolSelectionConfig, TruncateStringsConfig,
130    };
131    use crate::session_store::SessionStore;
132
133    fn in_memory_store() -> (SessionStore, tempfile::TempDir) {
134        let dir = tempfile::tempdir().unwrap();
135        let path = dir.path().join("test.db");
136        let store = SessionStore::open_or_create(&path).unwrap();
137        (store, dir)
138    }
139
140    fn test_preset() -> Preset {
141        Preset {
142            preset: PresetMeta {
143                name: "test".into(),
144                version: "1.0".into(),
145                description: String::new(),
146            },
147            compression: CompressionConfig {
148                stages: vec![],
149                keep_fields: None,
150                strip_fields: None,
151                condense: Some(CondenseConfig {
152                    enabled: true,
153                    max_repeated_lines: 3,
154                }),
155                git_diff_fold: None,
156                strip_nulls: Some(StripNullsConfig { enabled: true }),
157                flatten: None,
158                truncate_strings: Some(TruncateStringsConfig {
159                    enabled: true,
160                    max_length: 500,
161                }),
162                collapse_arrays: Some(CollapseArraysConfig {
163                    enabled: true,
164                    max_items: 5,
165                    summary_template: "... and {remaining} more items".into(),
166                }),
167                custom_transforms: Some(CustomTransformsConfig { enabled: true }),
168            },
169            tool_selection: ToolSelectionConfig {
170                max_tools: 5,
171                similarity_threshold: 0.7,
172                default_tools: vec![],
173            },
174            budget: BudgetConfig {
175                warning_threshold: 0.70,
176                ceiling_threshold: 0.85,
177                default_window_size: 200_000,
178                agents: Default::default(),
179            },
180            terse_mode: TerseModeConfig {
181                enabled: false,
182                level: crate::preset::TerseLevel::Moderate,
183            },
184            model: ModelConfig {
185                family: "anthropic".into(),
186                primary: "claude-sonnet-4-20250514".into(),
187                local: String::new(),
188                complexity_threshold: 0.4,
189                pricing: None,
190            },
191        }
192    }
193
194    fn make_pipeline() -> CompressionPipeline {
195        CompressionPipeline::new(&test_preset())
196    }
197
198    #[test]
199    fn first_read_is_miss() {
200        let (store, _dir) = in_memory_store();
201        let cm = CacheManager::new(store, u64::MAX);
202        let pipeline = make_pipeline();
203        let content = b"hello world";
204        let result = cm
205            .get_or_compress(Path::new("file.txt"), content, &pipeline)
206            .unwrap();
207        assert!(matches!(result, CacheResult::Fresh { .. }));
208    }
209
210    #[test]
211    fn second_read_is_hit() {
212        let (store, _dir) = in_memory_store();
213        let cm = CacheManager::new(store, u64::MAX);
214        let pipeline = make_pipeline();
215        let content = b"hello world";
216        let path = Path::new("file.txt");
217
218        // First read — miss
219        cm.get_or_compress(path, content, &pipeline).unwrap();
220
221        // Second read — hit
222        let result = cm.get_or_compress(path, content, &pipeline).unwrap();
223        match result {
224            CacheResult::Dedup {
225                inline_ref,
226                token_cost,
227            } => {
228                assert!(inline_ref.starts_with("§ref:"));
229                assert!(inline_ref.ends_with('§'));
230                assert_eq!(token_cost, 13);
231            }
232            CacheResult::Fresh { .. } => panic!("expected cache hit"),
233        }
234    }
235
236    #[test]
237    fn different_content_is_miss() {
238        let (store, _dir) = in_memory_store();
239        let cm = CacheManager::new(store, u64::MAX);
240        let pipeline = make_pipeline();
241        let path = Path::new("file.txt");
242
243        cm.get_or_compress(path, b"content v1", &pipeline).unwrap();
244        let result = cm
245            .get_or_compress(path, b"content v2", &pipeline)
246            .unwrap();
247        assert!(matches!(result, CacheResult::Fresh { .. }));
248    }
249
250    #[test]
251    fn evict_lru_frees_bytes_when_over_limit() {
252        let (store, _dir) = in_memory_store();
253        // Very small limit so eviction triggers immediately.
254        let cm = CacheManager::new(store, 1);
255        let pipeline = make_pipeline();
256        let path = Path::new("f.txt");
257
258        // Populate cache with a few entries.
259        cm.get_or_compress(path, b"entry one", &pipeline).unwrap();
260        cm.get_or_compress(path, b"entry two", &pipeline).unwrap();
261        cm.get_or_compress(path, b"entry three", &pipeline).unwrap();
262
263        let freed = cm.evict_lru().unwrap();
264        assert!(freed > 0, "expected bytes to be freed");
265    }
266
267    #[test]
268    fn evict_lru_no_op_when_under_limit() {
269        let (store, _dir) = in_memory_store();
270        let cm = CacheManager::new(store, u64::MAX);
271        let pipeline = make_pipeline();
272
273        cm.get_or_compress(Path::new("f.txt"), b"data", &pipeline)
274            .unwrap();
275
276        let freed = cm.evict_lru().unwrap();
277        assert_eq!(freed, 0);
278    }
279
280    #[test]
281    fn invalidate_removes_entry() {
282        let dir = tempfile::tempdir().unwrap();
283        let file_path = dir.path().join("test.txt");
284        std::fs::write(&file_path, b"some content").unwrap();
285
286        let store_path = dir.path().join("store.db");
287        let store = SessionStore::open_or_create(&store_path).unwrap();
288        let cm = CacheManager::new(store, u64::MAX);
289        let pipeline = make_pipeline();
290
291        // Populate cache.
292        let content = std::fs::read(&file_path).unwrap();
293        cm.get_or_compress(&file_path, &content, &pipeline).unwrap();
294
295        // Verify it's a hit.
296        let hit = cm
297            .get_or_compress(&file_path, &content, &pipeline)
298            .unwrap();
299        assert!(matches!(hit, CacheResult::Dedup { .. }));
300
301        cm.invalidate(&file_path).unwrap();
302
303        let miss = cm
304            .get_or_compress(&file_path, &content, &pipeline)
305            .unwrap();
306        assert!(matches!(miss, CacheResult::Fresh { .. }));
307    }
308
309    #[test]
310    fn invalidate_nonexistent_path_is_noop() {
311        let (store, _dir) = in_memory_store();
312        let cm = CacheManager::new(store, u64::MAX);
313        // Should not error.
314        cm.invalidate(Path::new("/nonexistent/path/file.txt"))
315            .unwrap();
316    }
317
318    // ── Property-based tests ──────────────────────────────────────────────────
319
320    use proptest::prelude::*;
321
322    // ── Property 8: Cache deduplication ──────────────────────────────────────
323    // **Validates: Requirements 8.1, 8.2, 18.1, 18.2**
324    //
325    // For any file content, reading the file twice through the CacheManager
326    // (with no content change between reads) SHALL return a cache hit on the
327    // second read with a reference token of approximately 13 tokens.
328
329    proptest! {
330        /// **Validates: Requirements 8.1, 8.2, 18.1, 18.2**
331        ///
332        /// For any file content, the second read through CacheManager SHALL be
333        /// a cache hit with tokens == 13.
334        #[test]
335        fn prop_cache_deduplication(
336            content in proptest::collection::vec(any::<u8>(), 1..=1000usize),
337        ) {
338            let (store, _dir) = in_memory_store();
339            let cm = CacheManager::new(store, u64::MAX);
340            let pipeline = make_pipeline();
341            let path = Path::new("file.txt");
342
343            // First read — must be a miss.
344            let first = cm.get_or_compress(path, &content, &pipeline).unwrap();
345            prop_assert!(
346                matches!(first, CacheResult::Fresh { .. }),
347                "first read should be a cache miss"
348            );
349
350            let second = cm.get_or_compress(path, &content, &pipeline).unwrap();
351            match second {
352                CacheResult::Dedup { inline_ref, token_cost } => {
353                    prop_assert_eq!(
354                        token_cost, 13,
355                        "cache hit should report ~13 reference tokens"
356                    );
357                    prop_assert!(
358                        inline_ref.starts_with("§ref:"),
359                        "reference token should start with §ref:"
360                    );
361                    prop_assert!(
362                        inline_ref.ends_with('§'),
363                        "reference token should end with §"
364                    );
365                }
366                CacheResult::Fresh { .. } => {
367                    prop_assert!(false, "second read should be a cache hit, not a miss");
368                }
369            }
370        }
371    }
372
373    // ── Property 9: Cache invalidation on content change ─────────────────────
374    // **Validates: Requirements 8.3, 18.3**
375    //
376    // For any cached file, if the file content changes (producing a different
377    // SHA-256 hash), the CacheManager SHALL treat the next read as a cache miss
378    // and re-compress the updated content.
379
380    proptest! {
381        /// **Validates: Requirements 8.3, 18.3**
382        ///
383        /// For any two distinct byte sequences, the first read of each is a
384        /// cache miss — content change always triggers re-compression.
385        #[test]
386        fn prop_cache_invalidation_on_content_change(
387            content_a in proptest::collection::vec(any::<u8>(), 1..=500usize),
388            content_b in proptest::collection::vec(any::<u8>(), 1..=500usize),
389        ) {
390            // Only meaningful when the two contents differ (different hashes).
391            prop_assume!(content_a != content_b);
392
393            let (store, _dir) = in_memory_store();
394            let cm = CacheManager::new(store, u64::MAX);
395            let pipeline = make_pipeline();
396            let path = Path::new("file.txt");
397
398            // Cache content_a.
399            let r1 = cm.get_or_compress(path, &content_a, &pipeline).unwrap();
400            prop_assert!(
401                matches!(r1, CacheResult::Fresh { .. }),
402                "first read of content_a should be a miss"
403            );
404
405            let r2 = cm.get_or_compress(path, &content_a, &pipeline).unwrap();
406            prop_assert!(
407                matches!(r2, CacheResult::Dedup { .. }),
408                "second read of content_a should be a hit"
409            );
410
411            let r3 = cm.get_or_compress(path, &content_b, &pipeline).unwrap();
412            prop_assert!(
413                matches!(r3, CacheResult::Fresh { .. }),
414                "read with changed content should be a cache miss"
415            );
416        }
417    }
418
419    // ── Property 10: Cache LRU eviction ──────────────────────────────────────
420    // **Validates: Requirements 8.5**
421    //
422    // For any cache state where total size exceeds the configured maximum, the
423    // CacheManager SHALL evict entries in LRU order until total size is at or
424    // below the limit.
425
426    proptest! {
427        /// **Validates: Requirements 8.5**
428        ///
429        /// After evict_lru, the total remaining cache size SHALL be at or below
430        /// max_size_bytes.
431        #[test]
432        fn prop_cache_lru_eviction(
433            // Generate 2-8 distinct content entries.
434            entries in proptest::collection::vec(
435                proptest::collection::vec(any::<u8>(), 10..=200usize),
436                2..=8usize,
437            ),
438        ) {
439            // Deduplicate entries so each has a unique hash.
440            let mut unique_entries: Vec<Vec<u8>> = Vec::new();
441            for e in &entries {
442                if !unique_entries.contains(e) {
443                    unique_entries.push(e.clone());
444                }
445            }
446            prop_assume!(unique_entries.len() >= 2);
447
448            let (store, _dir) = in_memory_store();
449            // Use a very small limit (1 byte) to guarantee eviction is needed.
450            let cm = CacheManager::new(store, 1);
451            let pipeline = make_pipeline();
452            let path = Path::new("f.txt");
453
454            // Populate the cache.
455            for entry in &unique_entries {
456                cm.get_or_compress(path, entry, &pipeline).unwrap();
457            }
458
459            // Evict LRU entries.
460            let freed = cm.evict_lru().unwrap();
461
462            // Bytes freed must be > 0 since total > 1 byte.
463            prop_assert!(freed > 0, "evict_lru should free bytes when over limit");
464
465            // After eviction, total remaining size must be <= max_size_bytes (1).
466            // We verify by checking that evict_lru now returns 0 (nothing left to evict).
467            let freed_again = cm.evict_lru().unwrap();
468            prop_assert_eq!(
469                freed_again, 0,
470                "second evict_lru call should free 0 bytes (already at or below limit)"
471            );
472        }
473    }
474
475    // ── Property 34: Cache persistence across sessions ────────────────────────
476    // **Validates: Requirements 18.4**
477    //
478    // For any set of cache entries saved to the SessionStore, reloading the
479    // store (opening the same database file) SHALL produce the same cache
480    // entries, and a subsequent read with the same content hash SHALL return a
481    // cache hit.
482
483    proptest! {
484        /// **Validates: Requirements 18.4**
485        ///
486        /// Cache entries written in one CacheManager instance SHALL survive a
487        /// store close/reopen and produce cache hits in a new instance.
488        #[test]
489        fn prop_cache_persistence_across_sessions(
490            content in proptest::collection::vec(any::<u8>(), 1..=500usize),
491        ) {
492            use crate::session_store::SessionStore;
493
494            let dir = tempfile::tempdir().unwrap();
495            let db_path = dir.path().join("cache.db");
496            let path = Path::new("file.txt");
497
498            // Session 1: populate the cache.
499            {
500                let store = SessionStore::open_or_create(&db_path).unwrap();
501                let cm = CacheManager::new(store, u64::MAX);
502                let pipeline = make_pipeline();
503
504                let r = cm.get_or_compress(path, &content, &pipeline).unwrap();
505                prop_assert!(
506                    matches!(r, CacheResult::Fresh { .. }),
507                    "first read should be a miss"
508                );
509            }
510            // Store is dropped here — connection closed.
511
512            // Session 2: reopen the same database file.
513            {
514                let store = SessionStore::open_or_create(&db_path).unwrap();
515                let cm = CacheManager::new(store, u64::MAX);
516                let pipeline = make_pipeline();
517
518                // Same content should now be a hit.
519                let r = cm.get_or_compress(path, &content, &pipeline).unwrap();
520                match r {
521                    CacheResult::Dedup { token_cost, .. } => {
522                        prop_assert_eq!(
523                            token_cost, 13,
524                            "persisted cache hit should report 13 tokens"
525                        );
526                    }
527                    CacheResult::Fresh { .. } => {
528                        prop_assert!(
529                            false,
530                            "cache entry should persist across store reopen"
531                        );
532                    }
533                }
534            }
535        }
536    }
537}