Skip to main content

oximedia_dedup/
persistent_cache.rs

1//! Cross-session persistent cache for decoded thumbnails and media fingerprints.
2//!
3//! The in-memory [`crate::dedup_cache`] LRU cache is discarded at process exit.
4//! This module adds a lightweight JSON-backed persistent store so that expensive
5//! thumbnail decoding and perceptual hash computation are **reused across
6//! deduplication sessions**.
7//!
8//! # Design
9//!
10//! [`PersistentFingerprintCache`] maintains a flat JSON file on disk.  Each entry
11//! records:
12//!
13//! - The source file path.
14//! - Its BLAKE3 hex digest (64 chars) — used to detect when the file changes and
15//!   the cached fingerprint is stale.
16//! - The 64-bit perceptual hash.
//! - An optional thumbnail (8×8 grayscale pixel bytes, serialized as a JSON array of byte values).
18//! - The modification timestamp at cache time.
19//!
20//! On [`load`](PersistentFingerprintCache::load), all entries are read from disk.
21//! On [`save`](PersistentFingerprintCache::save), the current entries are written
22//! back atomically (write to a temp file then rename).
23//!
24//! **Staleness** is detected by comparing the stored BLAKE3 digest with a freshly
25//! computed digest of the source file.  [`get_valid`](PersistentFingerprintCache::get_valid)
26//! returns `None` for stale or missing entries.
27//!
28//! # Example
29//!
30//! ```rust
31//! use oximedia_dedup::persistent_cache::{PersistentFingerprintCache, CachedEntry};
32//!
33//! let dir = std::env::temp_dir().join("oximedia_pc_doctest");
34//! std::fs::create_dir_all(&dir).ok();
35//! let cache_path = dir.join("fps.json");
36//!
37//! let mut cache = PersistentFingerprintCache::new(cache_path.clone());
38//! cache.insert(CachedEntry {
39//!     path: "/media/clip.mp4".to_string(),
40//!     blake3_hex: "0".repeat(64),
41//!     phash: 0xDEAD_BEEF_1234_5678,
42//!     thumbnail: None,
43//!     modified_secs: 1_700_000_000,
44//! });
45//!
46//! cache.save().expect("save ok");
47//!
48//! let cache2 = PersistentFingerprintCache::load(cache_path).expect("load ok");
49//! assert_eq!(cache2.len(), 1);
50//! ```
51
52#![allow(dead_code)]
53#![allow(clippy::cast_precision_loss)]
54
55use std::collections::HashMap;
56use std::io::{self, BufReader, BufWriter};
57use std::path::{Path, PathBuf};
58
59use serde::{Deserialize, Serialize};
60
61// ---------------------------------------------------------------------------
62// CachedEntry
63// ---------------------------------------------------------------------------
64
/// A single entry in the persistent fingerprint cache.
///
/// Entries are (de)serialized with serde using the field names below, so the
/// field names are part of the on-disk format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedEntry {
    /// Absolute path of the source media file.
    pub path: String,
    /// Lower-case hex BLAKE3 digest (64 chars) of the file at cache time.
    pub blake3_hex: String,
    /// 64-bit perceptual hash.
    pub phash: u64,
    /// Optional 8×8 grayscale thumbnail bytes (expected length: 64), or
    /// `None` when no thumbnail was computed.  The length is not enforced by
    /// the type; use [`thumbnail_valid`](Self::thumbnail_valid) to check it.
    pub thumbnail: Option<Vec<u8>>,
    /// Unix-second modification timestamp of the file at cache time.
    pub modified_secs: u64,
}
80
81impl CachedEntry {
82    /// Return `true` if the thumbnail has the expected 8×8 = 64-byte size.
83    #[must_use]
84    pub fn thumbnail_valid(&self) -> bool {
85        self.thumbnail
86            .as_ref()
87            .map(|t| t.len() == 64)
88            .unwrap_or(true) // no thumbnail is also valid
89    }
90}
91
92// ---------------------------------------------------------------------------
93// PersistentFingerprintCache
94// ---------------------------------------------------------------------------
95
/// Cross-session persistent cache mapping file paths to their fingerprints.
///
/// Entries are keyed by `path` string.  Only `entries` is written to the
/// backing file; the hit/miss counters are in-memory statistics that start at
/// zero for every newly created or loaded cache.
#[derive(Debug, Clone)]
pub struct PersistentFingerprintCache {
    /// Path to the backing JSON file.
    cache_path: PathBuf,
    /// In-memory entries keyed by file path.
    entries: HashMap<String, CachedEntry>,
    /// Number of cache hits since last reset.
    hits: u64,
    /// Number of cache misses since last reset.
    misses: u64,
}
110
111impl PersistentFingerprintCache {
112    /// Create a new, empty cache backed by `cache_path`.
113    ///
114    /// The file is not read or written until [`save`](Self::save) or
115    /// [`load`](Self::load) is called.
116    #[must_use]
117    pub fn new(cache_path: PathBuf) -> Self {
118        Self {
119            cache_path,
120            entries: HashMap::new(),
121            hits: 0,
122            misses: 0,
123        }
124    }
125
126    /// Load a cache from `cache_path`.
127    ///
128    /// Returns an empty cache (rather than an error) if the file does not
129    /// exist.  Returns `Err` only on genuine I/O or parse failures.
130    ///
131    /// # Errors
132    ///
133    /// Returns an `io::Error` if the file exists but cannot be read or parsed.
134    pub fn load(cache_path: PathBuf) -> io::Result<Self> {
135        if !cache_path.exists() {
136            return Ok(Self::new(cache_path));
137        }
138        let file = std::fs::File::open(&cache_path)?;
139        let reader = BufReader::new(file);
140        let entries: HashMap<String, CachedEntry> =
141            serde_json::from_reader(reader).map_err(|e| {
142                io::Error::new(
143                    io::ErrorKind::InvalidData,
144                    format!("cache parse error: {e}"),
145                )
146            })?;
147        Ok(Self {
148            cache_path,
149            entries,
150            hits: 0,
151            misses: 0,
152        })
153    }
154
155    /// Save the cache to disk atomically (write temp → rename).
156    ///
157    /// # Errors
158    ///
159    /// Returns an `io::Error` if writing or renaming fails.
160    pub fn save(&self) -> io::Result<()> {
161        // Ensure the parent directory exists.
162        if let Some(parent) = self.cache_path.parent() {
163            std::fs::create_dir_all(parent)?;
164        }
165
166        // Write to a sibling temp file first.
167        let tmp_path = self.cache_path.with_extension("tmp");
168        {
169            let file = std::fs::File::create(&tmp_path)?;
170            let writer = BufWriter::new(file);
171            serde_json::to_writer(writer, &self.entries).map_err(|e| {
172                io::Error::new(io::ErrorKind::Other, format!("cache write error: {e}"))
173            })?;
174        }
175        std::fs::rename(&tmp_path, &self.cache_path)?;
176        Ok(())
177    }
178
    /// Insert or update a [`CachedEntry`].
    ///
    /// The entry is keyed by its own `path` field.  An existing entry stored
    /// under the same path is replaced; the previous value is discarded.
    pub fn insert(&mut self, entry: CachedEntry) {
        self.entries.insert(entry.path.clone(), entry);
    }
183
    /// Remove the entry for `path`, returning it if it existed.
    ///
    /// Only the in-memory map is changed; the backing file is not rewritten
    /// until [`save`](Self::save) is called.
    pub fn remove(&mut self, path: &str) -> Option<CachedEntry> {
        self.entries.remove(path)
    }
188
    /// Return the number of entries currently held in memory.
    #[must_use]
    pub fn len(&self) -> usize {
        self.entries.len()
    }
194
    /// Return `true` if the cache currently holds no entries in memory.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
200
    /// Look up a cached entry by file path **without** freshness checking.
    ///
    /// This does not touch the file system and does not update the hit/miss
    /// counters.  Use [`get_valid`](Self::get_valid) to validate against
    /// on-disk state.
    #[must_use]
    pub fn get(&self, path: &str) -> Option<&CachedEntry> {
        self.entries.get(path)
    }
208
209    /// Look up a cached entry, validating that the stored BLAKE3 digest still
210    /// matches the current file.
211    ///
212    /// Reads the actual file to recompute its BLAKE3 hash.  Returns `None` if:
213    /// - The entry is not in the cache.
214    /// - The file does not exist.
215    /// - The digest has changed (file was modified).
216    ///
217    /// Updates the internal hit/miss counters.
218    pub fn get_valid(&mut self, path: &str) -> Option<&CachedEntry> {
219        let entry = match self.entries.get(path) {
220            Some(e) => e,
221            None => {
222                self.misses += 1;
223                return None;
224            }
225        };
226
227        // Read the file and check the BLAKE3 hex digest.
228        match compute_blake3_hex(Path::new(path)) {
229            Ok(current_hex) => {
230                if current_hex == entry.blake3_hex {
231                    self.hits += 1;
232                    self.entries.get(path)
233                } else {
234                    // Stale entry — remove it.
235                    self.misses += 1;
236                    self.entries.remove(path);
237                    None
238                }
239            }
240            Err(_) => {
241                // File inaccessible → treat as cache miss.
242                self.misses += 1;
243                None
244            }
245        }
246    }
247
    /// Return the number of cache hits since the cache was loaded or last reset.
    ///
    /// Only [`get_valid`](Self::get_valid) counts lookups; plain
    /// [`get`](Self::get) does not.
    #[must_use]
    pub fn hits(&self) -> u64 {
        self.hits
    }
253
    /// Return the number of cache misses since the cache was loaded or last reset.
    ///
    /// Only [`get_valid`](Self::get_valid) counts lookups; plain
    /// [`get`](Self::get) does not.
    #[must_use]
    pub fn misses(&self) -> u64 {
        self.misses
    }
259
260    /// Return the hit rate (0.0 – 1.0).  Returns 0.0 if no lookups have been made.
261    #[must_use]
262    pub fn hit_rate(&self) -> f64 {
263        let total = self.hits + self.misses;
264        if total == 0 {
265            return 0.0;
266        }
267        self.hits as f64 / total as f64
268    }
269
    /// Reset hit/miss counters to zero.
    ///
    /// The cached entries themselves are not affected.
    pub fn reset_stats(&mut self) {
        self.hits = 0;
        self.misses = 0;
    }
275
276    /// Evict all entries whose source file no longer exists on disk.
277    ///
278    /// Returns the number of entries evicted.
279    pub fn evict_missing(&mut self) -> usize {
280        let before = self.entries.len();
281        self.entries.retain(|path, _| Path::new(path).exists());
282        before - self.entries.len()
283    }
284
285    /// Evict all entries that are stale (file modified since caching).
286    ///
287    /// Recomputes BLAKE3 hashes for all cached files.  Entries are removed when
288    /// the digest no longer matches.  Returns the number of entries evicted.
289    pub fn evict_stale(&mut self) -> usize {
290        let paths: Vec<String> = self.entries.keys().cloned().collect();
291        let mut evicted = 0;
292        for path in paths {
293            let stale = if let Some(entry) = self.entries.get(&path) {
294                compute_blake3_hex(Path::new(&path))
295                    .map(|h| h != entry.blake3_hex)
296                    .unwrap_or(true) // can't read → evict
297            } else {
298                false
299            };
300            if stale {
301                self.entries.remove(&path);
302                evicted += 1;
303            }
304        }
305        evicted
306    }
307
308    /// Merge entries from `other` into this cache.
309    ///
310    /// Entries in `other` overwrite entries with the same path in `self`.
311    pub fn merge_from(&mut self, other: &Self) {
312        for (path, entry) in &other.entries {
313            self.entries.insert(path.clone(), entry.clone());
314        }
315    }
316
    /// Return an iterator over all cached entries as `(path, entry)` pairs.
    ///
    /// Entries are yielded in the arbitrary iteration order of the underlying
    /// `HashMap`; no freshness checking is performed.
    pub fn iter(&self) -> impl Iterator<Item = (&String, &CachedEntry)> {
        self.entries.iter()
    }
321}
322
323// ---------------------------------------------------------------------------
324// Internal helpers
325// ---------------------------------------------------------------------------
326
327/// Compute a lower-case BLAKE3 hex digest for a file path.
328///
329/// Uses a simple FNV-1a based stand-in when the `blake3` crate is available
330/// via the workspace.  This avoids re-implementing the full BLAKE3 algorithm
331/// and stays consistent with the rest of `oximedia-dedup`.
332fn compute_blake3_hex(path: &Path) -> io::Result<String> {
333    use std::io::Read;
334
335    let mut file = std::fs::File::open(path)?;
336    let mut hasher = blake3::Hasher::new();
337    let mut buf = vec![0u8; 65_536];
338    loop {
339        let n = file.read(&mut buf)?;
340        if n == 0 {
341            break;
342        }
343        hasher.update(&buf[..n]);
344    }
345    Ok(hasher.finalize().to_hex().to_string())
346}
347
348// ---------------------------------------------------------------------------
349// Tests
350// ---------------------------------------------------------------------------
351
/// Unit tests for the persistent fingerprint cache.
///
/// Tests that touch the file system work under the system temp directory.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Build a cache-file path named `name` inside a test-only directory
    /// under the system temp directory.  Nothing is created on disk.
    fn tmp_cache_path(name: &str) -> PathBuf {
        std::env::temp_dir()
            .join("oximedia_persistent_cache_tests")
            .join(name)
    }

    /// Build an entry for `path` with a placeholder digest (64 zeros), a
    /// fixed perceptual hash and no thumbnail.
    fn sample_entry(path: &str) -> CachedEntry {
        CachedEntry {
            path: path.to_string(),
            blake3_hex: "0".repeat(64),
            phash: 0xDEAD_BEEF_1234_5678,
            thumbnail: None,
            modified_secs: 1_700_000_000,
        }
    }

    // A freshly constructed cache holds no entries.
    #[test]
    fn test_new_cache_is_empty() {
        let cache = PersistentFingerprintCache::new(tmp_cache_path("new_empty.json"));
        assert!(cache.is_empty());
        assert_eq!(cache.len(), 0);
    }

    // An inserted entry can be read back through the unchecked `get`.
    #[test]
    fn test_insert_and_get() {
        let mut cache = PersistentFingerprintCache::new(tmp_cache_path("insert.json"));
        cache.insert(sample_entry("/media/a.mp4"));
        let e = cache.get("/media/a.mp4");
        assert!(e.is_some());
        assert_eq!(e.unwrap().phash, 0xDEAD_BEEF_1234_5678);
    }

    // `remove` returns the entry and leaves nothing behind.
    #[test]
    fn test_remove() {
        let mut cache = PersistentFingerprintCache::new(tmp_cache_path("remove.json"));
        cache.insert(sample_entry("/media/b.mp4"));
        assert!(cache.remove("/media/b.mp4").is_some());
        assert!(cache.get("/media/b.mp4").is_none());
    }

    // Entries written by `save` are visible after `load` of the same path.
    #[test]
    fn test_save_and_load_roundtrip() {
        let path = tmp_cache_path("roundtrip.json");
        std::fs::create_dir_all(path.parent().unwrap()).ok();

        let mut cache = PersistentFingerprintCache::new(path.clone());
        cache.insert(sample_entry("/media/c.mp4"));
        cache.save().expect("save should succeed");

        let loaded = PersistentFingerprintCache::load(path).expect("load should succeed");
        assert_eq!(loaded.len(), 1);
        assert!(loaded.get("/media/c.mp4").is_some());
    }

    // Loading a path with no file behind it yields an empty cache, not an error.
    #[test]
    fn test_load_nonexistent_returns_empty() {
        let path = tmp_cache_path("nonexistent_xyzabc.json");
        // Make sure it really doesn't exist.
        let _ = std::fs::remove_file(&path);
        let cache = PersistentFingerprintCache::load(path).expect("should not fail");
        assert!(cache.is_empty());
    }

    // The unchecked `get` must leave both counters untouched.
    #[test]
    fn test_hit_miss_counters() {
        let mut cache = PersistentFingerprintCache::new(tmp_cache_path("stats.json"));
        cache.insert(sample_entry("/x.mp4"));
        // Plain get does not update counters.
        let _ = cache.get("/x.mp4");
        assert_eq!(cache.hits(), 0);
        assert_eq!(cache.misses(), 0);
    }

    // With no counted lookups the hit rate is defined as 0.0.
    #[test]
    fn test_hit_rate_zero_on_no_lookups() {
        let cache = PersistentFingerprintCache::new(tmp_cache_path("hitrate.json"));
        assert_eq!(cache.hit_rate(), 0.0);
    }

    // An entry pointing at a path that is not on disk is evicted.
    #[test]
    fn test_evict_missing_removes_nonexistent_paths() {
        let mut cache = PersistentFingerprintCache::new(tmp_cache_path("evict.json"));
        cache.insert(sample_entry("/definitely/does/not/exist/zzz.mp4"));
        assert_eq!(cache.len(), 1);
        let evicted = cache.evict_missing();
        assert_eq!(evicted, 1);
        assert!(cache.is_empty());
    }

    // An entry survives `evict_stale` while the file is unchanged and is
    // evicted once the file content (and therefore its digest) changes.
    #[test]
    fn test_evict_stale_removes_changed_files() {
        // Create a real temp file, write content, hash it, then change the file.
        let dir = std::env::temp_dir().join("oximedia_pc_stale_test");
        std::fs::create_dir_all(&dir).ok();
        let file_path = dir.join("media_file.bin");

        // Write initial content.
        {
            let mut f = std::fs::File::create(&file_path).expect("create");
            f.write_all(b"original content for hashing").expect("write");
        }

        // Compute actual hash.
        let real_hash = compute_blake3_hex(&file_path).expect("hash ok");

        let mut cache = PersistentFingerprintCache::new(tmp_cache_path("stale.json"));
        cache.insert(CachedEntry {
            path: file_path.to_string_lossy().to_string(),
            blake3_hex: real_hash.clone(),
            phash: 0x1111,
            thumbnail: None,
            modified_secs: 0,
        });

        // Eviction should keep the entry (file unchanged).
        let evicted = cache.evict_stale();
        assert_eq!(evicted, 0, "file unchanged → no eviction");

        // Now mutate the file.
        {
            let mut f = std::fs::File::create(&file_path).expect("create");
            f.write_all(b"modified content, different bytes!")
                .expect("write");
        }

        // Now the cached hash is stale.
        let evicted2 = cache.evict_stale();
        assert_eq!(evicted2, 1, "changed file → entry evicted");
        assert!(cache.is_empty());

        // Best-effort cleanup of the fixture file.
        let _ = std::fs::remove_file(&file_path);
    }

    // Merging brings in the other cache's entries alongside the existing ones.
    #[test]
    fn test_merge_from() {
        let mut a = PersistentFingerprintCache::new(tmp_cache_path("merge_a.json"));
        let mut b = PersistentFingerprintCache::new(tmp_cache_path("merge_b.json"));
        a.insert(sample_entry("/file_a.mp4"));
        b.insert(sample_entry("/file_b.mp4"));
        a.merge_from(&b);
        assert_eq!(a.len(), 2);
        assert!(a.get("/file_a.mp4").is_some());
        assert!(a.get("/file_b.mp4").is_some());
    }

    // An entry without a thumbnail passes the thumbnail check.
    #[test]
    fn test_thumbnail_valid_no_thumbnail() {
        let entry = sample_entry("/x.mp4");
        assert!(entry.thumbnail_valid()); // None is valid
    }

    // A 64-byte thumbnail passes the thumbnail check.
    #[test]
    fn test_thumbnail_valid_correct_size() {
        let entry = CachedEntry {
            thumbnail: Some(vec![128u8; 64]), // 8×8 bytes
            ..sample_entry("/y.mp4")
        };
        assert!(entry.thumbnail_valid());
    }

    // A thumbnail of any other length fails the thumbnail check.
    #[test]
    fn test_thumbnail_invalid_wrong_size() {
        let entry = CachedEntry {
            thumbnail: Some(vec![0u8; 32]), // wrong size
            ..sample_entry("/z.mp4")
        };
        assert!(!entry.thumbnail_valid());
    }

    // `reset_stats` zeroes both counters after a counted miss.
    #[test]
    fn test_reset_stats() {
        let mut cache = PersistentFingerprintCache::new(tmp_cache_path("reset.json"));
        // Manually bump counters via get_valid (will miss since no entry).
        let _ = cache.get_valid("/nonexistent.mp4");
        assert!(cache.misses() > 0);
        cache.reset_stats();
        assert_eq!(cache.misses(), 0);
        assert_eq!(cache.hits(), 0);
    }
}