Skip to main content

snapdir_core/
cache.rs

//! XDG content-addressable cache with the `cache-id` integrity-check mechanism.
//!
//! A snapdir *cache* is just a local content-addressable store (the same
//! sharded layout as a `file://` store): objects live under
//! `<cache_dir>/.objects/<h[0..3]>/<h[3..6]>/<h[6..9]>/<h[9..]>` and manifests
//! under `<cache_dir>/.manifests/<id…>`. This module mirrors the cache-side
//! integrity machinery of the Bash oracle:
//!
//! - [`check_snapshot_integrity`] mirrors `_snapdir_check_integrity` (`snapdir`
//!   ~L1691): given a snapshot id and a cache directory, assert the manifest is
//!   present locally, then verify every **file** object referenced by the
//!   manifest hashes (BLAKE3) to the checksum it is filed under. This is the
//!   "verify a cached snapshot by its id" check at the heart of
//!   `checkout`/`verify`.
//! - [`verify_cache`] mirrors `verify-cache` (`snapdir` ~L1011): enumerate every
//!   object under `.objects/*/*/*/*`, recompute its hash, and compare the actual
//!   hash to the **expected** hash encoded by the object's own sharded path (the
//!   path *is* the content address). Collect mismatches; when `purge` is set,
//!   delete the corrupt objects.
//! - [`flush_cache`] mirrors `flush-cache` (`snapdir` ~L1061): empty the cache
//!   directory, idempotent on a missing dir.
//!
//! Per the library-purity principle this module performs no terminal I/O and
//! reads no `$HOME`/`XDG`/environment for behavior. The cache directory is a
//! parameter; the CLI lane resolves `${XDG_CACHE_HOME:-$HOME/.cache}/snapdir`.
//! Hashing is in-process via the [`Hasher`] abstraction (the shipped default is
//! BLAKE3); we never shell out to `b3sum`. The sharded path layout is reused
//! from [`crate::store`] (`object_path`/`manifest_path`); it is not
//! reimplemented here.
30
31use std::path::{Path, PathBuf};
32
33use thiserror::Error;
34
35use crate::manifest::{Manifest, PathType};
36use crate::merkle::Hasher;
37use crate::store::{manifest_path, object_path, OBJECTS_DIR};
38
39/// Errors the cache integrity machinery can surface.
40#[derive(Debug, Error)]
41#[non_exhaustive]
42pub enum CacheError {
43    /// The manifest for the requested snapshot id was not present in the cache.
44    ///
45    /// Mirrors the oracle's "Manifest not found locally. Did you forget to
46    /// fetch …?" failure in `_snapdir_check_integrity`.
47    #[error("manifest not found locally for {id}. Did you forget to fetch {id} from the store?")]
48    ManifestNotFound {
49        /// The snapshot id that was looked up.
50        id: String,
51    },
52
53    /// A file object referenced by the manifest was missing from the cache.
54    #[error("object not found in cache: {checksum}")]
55    ObjectNotFound {
56        /// The object checksum (content address) that was looked up.
57        checksum: String,
58    },
59
60    /// A cached object's bytes did not hash to the address it is filed under —
61    /// the object is corrupt or tampered.
62    #[error("checksum mismatch for {expected}: cached bytes hash to {actual}")]
63    Integrity {
64        /// The checksum the object is filed under (its content address).
65        expected: String,
66        /// The checksum actually computed over the cached bytes.
67        actual: String,
68    },
69
70    /// A manifest's text could not be parsed.
71    #[error("failed to parse cached manifest: {0}")]
72    Parse(#[from] crate::manifest::ParseError),
73
74    /// An underlying filesystem failure.
75    #[error("cache I/O error: {0}")]
76    Io(#[from] std::io::Error),
77}
78
79/// Loads a cached manifest by snapshot `id` from `cache_dir`.
80///
81/// Reads `<cache_dir>/.manifests/<id…>` (the sharded manifest path) and parses
82/// it. This is the "manifest must be present locally" precondition of
83/// [`check_snapshot_integrity`], exposed on its own for callers that have only
84/// an id and a cache directory.
85///
86/// # Errors
87///
88/// - [`CacheError::ManifestNotFound`] if no manifest is filed under `id`,
89///   matching the oracle's `test -f … || { echo "…did you forget to fetch…" }`.
90/// - [`CacheError::Parse`] if the cached bytes are not a valid manifest.
91/// - [`CacheError::Io`] on any other read failure.
92pub fn load_cached_manifest(cache_dir: &Path, id: &str) -> Result<Manifest, CacheError> {
93    let path = cache_dir.join(manifest_path(id));
94    let text = match std::fs::read_to_string(&path) {
95        Ok(text) => text,
96        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
97            return Err(CacheError::ManifestNotFound { id: id.to_owned() });
98        }
99        Err(err) => return Err(CacheError::Io(err)),
100    };
101    Ok(Manifest::parse(&text)?)
102}
103
104/// Verifies a cached snapshot by its id — mirrors `_snapdir_check_integrity`.
105///
106/// First asserts the manifest for `id` is present locally (loading it from
107/// `<cache_dir>/.manifests/<id…>`), then, for every **file** entry of the
108/// manifest (directory entries — whose path ends `/` — are excluded, exactly as
109/// the oracle's `grep -v "/$"`), verifies that the cached object at its sharded
110/// path hashes via `hasher` to the checksum it is filed under (column 3 of the
111/// manifest line, i.e. the object's content address).
112///
113/// The oracle pipes `checksum  path` pairs into `b3sum --check`; this reproduces
114/// that check in-process. The first corrupt or missing object short-circuits
115/// with an error, matching `b3sum --check`'s non-zero exit.
116///
117/// # Errors
118///
119/// - [`CacheError::ManifestNotFound`] if the snapshot's manifest is absent.
120/// - [`CacheError::ObjectNotFound`] if a referenced file object is missing.
121/// - [`CacheError::Integrity`] if a cached object does not hash to its address.
122/// - [`CacheError::Parse`] / [`CacheError::Io`] on read/parse failure.
123pub fn check_snapshot_integrity(
124    cache_dir: &Path,
125    id: &str,
126    hasher: &dyn Hasher,
127) -> Result<(), CacheError> {
128    let manifest = load_cached_manifest(cache_dir, id)?;
129    check_manifest_integrity(cache_dir, &manifest, hasher)
130}
131
132/// Like [`check_snapshot_integrity`] but for an already-loaded [`Manifest`].
133///
134/// Skips the `.manifests/<id…>` lookup (the caller already holds the manifest)
135/// and verifies every file object referenced by `manifest` against its content
136/// address. Used internally by [`check_snapshot_integrity`]; exposed for callers
137/// that fetched the manifest themselves.
138///
139/// # Errors
140///
141/// - [`CacheError::ObjectNotFound`] if a referenced file object is missing.
142/// - [`CacheError::Integrity`] if a cached object does not hash to its address.
143/// - [`CacheError::Io`] on a read failure.
144pub fn check_manifest_integrity(
145    cache_dir: &Path,
146    manifest: &Manifest,
147    hasher: &dyn Hasher,
148) -> Result<(), CacheError> {
149    for entry in manifest.entries() {
150        // Directory lines are excluded from the object check (oracle:
151        // `grep -v "/$"`). Directory `D` entries always have a trailing-slash
152        // path; gate on the type, which is the structural truth behind that.
153        if entry.path_type == PathType::Directory {
154            continue;
155        }
156        let checksum = &entry.checksum;
157        let object = cache_dir.join(object_path(checksum));
158        let bytes = match std::fs::read(&object) {
159            Ok(bytes) => bytes,
160            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
161                return Err(CacheError::ObjectNotFound {
162                    checksum: checksum.clone(),
163                });
164            }
165            Err(err) => return Err(CacheError::Io(err)),
166        };
167        let actual = hasher.hash_hex(&bytes);
168        if &actual != checksum {
169            return Err(CacheError::Integrity {
170                expected: checksum.clone(),
171                actual,
172            });
173        }
174    }
175    Ok(())
176}
177
/// Result of scanning the whole cache with [`verify_cache`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CacheReport {
    /// How many objects the scan visited (one per `.objects/*/*/*/*` entry).
    pub checked: usize,
    /// Expected checksums (content addresses) of the objects whose cached
    /// bytes hashed to something else, i.e. corrupt or tampered objects.
    pub corrupt: Vec<String>,
    /// Expected checksums of the objects removed because `purge` was set.
    /// Always a subset of `corrupt`, and empty when `purge` is false.
    pub purged: Vec<String>,
}

impl CacheReport {
    /// Reports whether the scan found no corrupt objects (the case in which
    /// the oracle exits 0).
    #[must_use]
    pub fn is_clean(&self) -> bool {
        self.corrupt.is_empty()
    }
}
198
199/// Verifies every object in the cache — mirrors `snapdir verify-cache`.
200///
201/// Enumerates every object at `<cache_dir>/.objects/*/*/*/*`, recomputes its
202/// hash via `hasher`, and compares it to the **expected** checksum encoded by
203/// the object's own sharded path (the path is the content address). The
204/// expected checksum is reconstructed exactly as the oracle does
205/// (`sed 's| .*.objects/| |; s|/||g'`): concatenate the four path segments after
206/// `.objects/` with the separators removed.
207///
208/// Returns a [`CacheReport`]: how many objects were checked, which were corrupt,
209/// and — when `purge` is set — which were deleted. An absent or empty
210/// `.objects` directory is a clean pass with zero checked, matching the oracle's
211/// `test -d "${cache_dir}/.objects" || return 0`.
212///
213/// # Errors
214///
215/// - [`CacheError::Io`] on a directory-traversal or read failure (other than the
216///   `.objects` directory simply being absent, which is a clean pass).
217pub fn verify_cache(
218    cache_dir: &Path,
219    purge: bool,
220    hasher: &dyn Hasher,
221) -> Result<CacheReport, CacheError> {
222    let objects_root = cache_dir.join(OBJECTS_DIR);
223    if !objects_root.is_dir() {
224        // Oracle: `test -d "${cache_dir}"/.objects || return 0`.
225        return Ok(CacheReport::default());
226    }
227
228    let mut report = CacheReport::default();
229
230    // The oracle globs exactly `.objects/*/*/*/*` — three intermediate shard
231    // levels then the leaf file. Walk those four levels deterministically.
232    for path in collect_objects(&objects_root)? {
233        report.checked += 1;
234
235        // Reconstruct the expected checksum from the path: the four components
236        // below `.objects/` concatenated (oracle `sed` strips the separators).
237        let Some(expected) = expected_checksum_from_path(&objects_root, &path) else {
238            continue;
239        };
240
241        let bytes = std::fs::read(&path)?;
242        let actual = hasher.hash_hex(&bytes);
243
244        if actual != expected {
245            report.corrupt.push(expected.clone());
246            if purge {
247                // Oracle: `rm "${cache_dir}/$(_snapdir_get_object_rel_path …)"`.
248                std::fs::remove_file(&path)?;
249                report.purged.push(expected);
250            }
251        }
252    }
253
254    // Deterministic order regardless of filesystem readdir order.
255    report.corrupt.sort();
256    report.purged.sort();
257    Ok(report)
258}
259
260/// Collects every object at exactly `<objects_root>/*/*/*/*` (three shard levels
261/// then the leaf), mirroring the oracle's `.objects/*/*/*/*` glob.
262fn collect_objects(objects_root: &Path) -> Result<Vec<PathBuf>, CacheError> {
263    let mut out = Vec::new();
264    for l0 in read_subdirs(objects_root)? {
265        for l1 in read_subdirs(&l0)? {
266            for l2 in read_subdirs(&l1)? {
267                for entry in std::fs::read_dir(&l2)? {
268                    let path = entry?.path();
269                    if path.is_file() {
270                        out.push(path);
271                    }
272                }
273            }
274        }
275    }
276    out.sort();
277    Ok(out)
278}
279
280/// Returns the immediate subdirectories of `dir`.
281fn read_subdirs(dir: &Path) -> Result<Vec<PathBuf>, CacheError> {
282    let mut out = Vec::new();
283    for entry in std::fs::read_dir(dir)? {
284        let path = entry?.path();
285        if path.is_dir() {
286            out.push(path);
287        }
288    }
289    Ok(out)
290}
291
/// Rebuilds the content address (expected checksum) an object claims from its
/// sharded path under `objects_root`, as the oracle's
/// `sed 's| .*.objects/| |; s|/||g'` does: the path components below
/// `.objects/` are concatenated with the separators removed.
///
/// Returns `None` only when `object` does not lie under `objects_root`.
///
/// Components that are not valid UTF-8 are decoded lossily (invalid sequences
/// become U+FFFD) rather than making the whole lookup fail. A stray file with
/// such a name can never be a valid hex address, so the caller's comparison
/// against the real hash reports it as a mismatch. Failing here instead would
/// let the caller skip the file without verifying it.
fn expected_checksum_from_path(objects_root: &Path, object: &Path) -> Option<String> {
    let relative = object.strip_prefix(objects_root).ok()?;
    let checksum = relative
        .components()
        .map(|component| component.as_os_str().to_string_lossy())
        .collect::<String>();
    Some(checksum)
}
304
305/// Empties the local cache — mirrors `snapdir flush-cache`.
306///
307/// Removes the cache directory's contents (objects and manifests). The oracle
308/// does `rm -rf "${cache_dir}"`; this removes the directory's *contents* so the
309/// directory itself (which the caller may have created) survives, while still
310/// leaving the cache empty. Idempotent on a missing cache directory (a clean
311/// no-op pass).
312///
313/// # Errors
314///
315/// - [`CacheError::Io`] on a removal failure other than the directory simply
316///   being absent.
317pub fn flush_cache(cache_dir: &Path) -> Result<(), CacheError> {
318    match std::fs::read_dir(cache_dir) {
319        Ok(entries) => {
320            for entry in entries {
321                let path = entry?.path();
322                if path.is_dir() {
323                    std::fs::remove_dir_all(&path)?;
324                } else {
325                    std::fs::remove_file(&path)?;
326                }
327            }
328            Ok(())
329        }
330        // A missing cache dir is already "empty" — idempotent no-op.
331        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()),
332        Err(err) => Err(CacheError::Io(err)),
333    }
334}
335
#[cfg(test)]
mod tests {
    use super::*;
    use crate::manifest::ManifestEntry;
    use crate::merkle::Blake3Hasher;
    use std::fs;
    use std::path::{Path, PathBuf};
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Directory name the flush test expects cached manifests to live under.
    const MANIFESTS_DIR_TEST: &str = ".manifests";

    /// Scratch directory under the system temp dir that deletes itself on drop.
    ///
    /// Hand-rolled so the tests need no `tempfile` dev-dependency. The cache
    /// module itself never reads the environment; only this harness does, to
    /// build fixtures on disk.
    struct Scratch {
        path: PathBuf,
    }

    impl Scratch {
        /// Creates a fresh directory whose name combines the process id with a
        /// per-process counter, so separate `Scratch` values do not collide.
        fn new() -> Self {
            static COUNTER: AtomicU64 = AtomicU64::new(0);
            let n = COUNTER.fetch_add(1, Ordering::Relaxed);
            let pid = std::process::id();
            let path = std::env::temp_dir().join(format!("snapdir-cache-test-{pid}-{n}"));
            fs::create_dir_all(&path).expect("create scratch dir");
            Self { path }
        }

        /// Location of the scratch directory.
        fn path(&self) -> &Path {
            &self.path
        }
    }

    impl Drop for Scratch {
        fn drop(&mut self) {
            // Best-effort cleanup; a failure here must not fail the test.
            let _ = fs::remove_dir_all(&self.path);
        }
    }

    /// Stores `bytes` in the cache under their real BLAKE3 address and returns
    /// that checksum.
    fn put_object(cache_dir: &Path, bytes: &[u8]) -> String {
        let checksum = Blake3Hasher.hash_hex(bytes);
        let target = cache_dir.join(object_path(&checksum));
        fs::create_dir_all(target.parent().unwrap()).unwrap();
        fs::write(&target, bytes).unwrap();
        checksum
    }

    /// Stores the rendered text of `manifest` in the cache, filed under `id`.
    fn put_manifest(cache_dir: &Path, id: &str, manifest: &Manifest) {
        let target = cache_dir.join(manifest_path(id));
        fs::create_dir_all(target.parent().unwrap()).unwrap();
        fs::write(&target, manifest.to_string()).unwrap();
    }

    /// Populates a small clean cache: one root directory entry plus two file
    /// objects, with the manifest filed under a fixed id. Returns
    /// `(id, foo checksum, bar checksum)`.
    fn build_clean_cache(cache_dir: &Path) -> (String, String, String) {
        let foo = b"foo\n";
        let bar = b"bar\n";
        let foo_sum = put_object(cache_dir, foo);
        let bar_sum = put_object(cache_dir, bar);

        let mut manifest = Manifest::new();
        manifest.push(ManifestEntry::new(
            PathType::Directory,
            "700",
            "rootsum",
            0,
            "./",
        ));
        manifest.push(ManifestEntry::new(
            PathType::File,
            "600",
            &foo_sum,
            foo.len() as u64,
            "./foo",
        ));
        manifest.push(ManifestEntry::new(
            PathType::File,
            "600",
            &bar_sum,
            bar.len() as u64,
            "./bar",
        ));

        let id = String::from("cafef00dcafef00dcafef00dcafef00dcafef00dcafef00dcafef00dcafef00d");
        put_manifest(cache_dir, &id, &manifest);
        (id, foo_sum, bar_sum)
    }

    #[test]
    fn cache_clean_passes_integrity_and_verify() {
        let tmp = Scratch::new();
        let cache = tmp.path();
        let (id, _foo, _bar) = build_clean_cache(cache);

        check_snapshot_integrity(cache, &id, &Blake3Hasher).expect("clean cache passes");

        let report = verify_cache(cache, false, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 2, "two objects scanned");
        assert!(report.is_clean(), "no corruption: {report:?}");
        assert!(report.purged.is_empty());
    }

    #[test]
    fn cache_tampered_object_detected_by_both_checks() {
        let tmp = Scratch::new();
        let cache = tmp.path();
        let (id, foo_sum, _bar) = build_clean_cache(cache);

        // Overwrite one object in place: its path (address) stays the same
        // while its bytes change.
        let foo_path = cache.join(object_path(&foo_sum));
        fs::write(&foo_path, b"TAMPERED").unwrap();

        // Snapshot check: the object no longer hashes to the manifest checksum.
        match check_snapshot_integrity(cache, &id, &Blake3Hasher) {
            Err(CacheError::Integrity { expected, .. }) => assert_eq!(expected, foo_sum),
            other => panic!("expected Integrity error, got {other:?}"),
        }

        // Whole-cache check: the bytes no longer match the path-encoded address.
        let report = verify_cache(cache, false, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 2);
        assert_eq!(report.corrupt, vec![foo_sum.clone()]);
        assert!(report.purged.is_empty(), "no purge without flag");
        assert!(!report.is_clean());
        // Without `purge` the corrupt object must remain on disk.
        assert!(foo_path.exists());
    }

    #[test]
    fn cache_purge_removes_only_corrupt_object() {
        let tmp = Scratch::new();
        let cache = tmp.path();
        let (_id, foo_sum, bar_sum) = build_clean_cache(cache);

        let foo_path = cache.join(object_path(&foo_sum));
        let bar_path = cache.join(object_path(&bar_sum));
        fs::write(&foo_path, b"TAMPERED").unwrap();

        let report = verify_cache(cache, true, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 2);
        assert_eq!(report.corrupt, vec![foo_sum.clone()]);
        assert_eq!(report.purged, vec![foo_sum]);
        assert!(!foo_path.exists(), "corrupt object purged");
        assert!(bar_path.exists(), "clean object kept");

        // Scanning again finds only the surviving clean object.
        let rescan = verify_cache(cache, false, &Blake3Hasher).unwrap();
        assert_eq!(rescan.checked, 1);
        assert!(rescan.is_clean());
    }

    #[test]
    fn cache_missing_manifest_yields_not_found() {
        let tmp = Scratch::new();
        let id = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
        match check_snapshot_integrity(tmp.path(), id, &Blake3Hasher) {
            Err(CacheError::ManifestNotFound { id: got }) => assert_eq!(got, id),
            other => panic!("expected ManifestNotFound, got {other:?}"),
        }
    }

    #[test]
    fn cache_missing_object_yields_not_found() {
        let tmp = Scratch::new();
        let cache = tmp.path();
        let (id, foo_sum, _bar) = build_clean_cache(cache);
        // Remove one referenced object while leaving the manifest in place.
        fs::remove_file(cache.join(object_path(&foo_sum))).unwrap();
        match check_snapshot_integrity(cache, &id, &Blake3Hasher) {
            Err(CacheError::ObjectNotFound { checksum }) => assert_eq!(checksum, foo_sum),
            other => panic!("expected ObjectNotFound, got {other:?}"),
        }
    }

    #[test]
    fn cache_directory_lines_excluded_from_integrity() {
        // A manifest holding nothing but a directory entry passes even though
        // no object exists on disk: directory lines are not checked.
        let tmp = Scratch::new();
        let mut manifest = Manifest::new();
        manifest.push(ManifestEntry::new(
            PathType::Directory,
            "700",
            "deadbeef",
            0,
            "./",
        ));
        let id = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        put_manifest(tmp.path(), id, &manifest);
        check_snapshot_integrity(tmp.path(), id, &Blake3Hasher)
            .expect("directory-only manifest passes");
    }

    #[test]
    fn cache_empty_or_absent_objects_dir_is_clean_pass() {
        let tmp = Scratch::new();
        let cache = tmp.path();

        // No `.objects` directory at all.
        let report = verify_cache(cache, false, &Blake3Hasher).unwrap();
        assert_eq!(report, CacheReport::default());
        assert!(report.is_clean());
        assert_eq!(report.checked, 0);

        // `.objects` exists but holds nothing.
        fs::create_dir_all(cache.join(OBJECTS_DIR)).unwrap();
        let report = verify_cache(cache, false, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 0);
        assert!(report.is_clean());
    }

    #[test]
    fn cache_verify_reconstructs_expected_checksum_from_path() {
        // Guards the sed-equivalent path -> checksum reconstruction directly:
        // an object filed under a known address yields exactly that address.
        let tmp = Scratch::new();
        let cache = tmp.path();
        let checksum = put_object(cache, b"hello cache\n");
        let objects_root = cache.join(OBJECTS_DIR);
        let object = cache.join(object_path(&checksum));
        let got = expected_checksum_from_path(&objects_root, &object).unwrap();
        assert_eq!(got, checksum);
    }

    #[test]
    fn cache_flush_empties_objects_and_manifests() {
        let tmp = Scratch::new();
        let cache = tmp.path();
        let (_id, _foo, _bar) = build_clean_cache(cache);
        assert!(cache.join(OBJECTS_DIR).exists());
        assert!(cache.join(MANIFESTS_DIR_TEST).exists());

        flush_cache(cache).expect("flush succeeds");

        assert!(!cache.join(OBJECTS_DIR).exists());
        assert!(!cache.join(MANIFESTS_DIR_TEST).exists());
        // The cache directory itself is kept, now empty.
        assert!(cache.is_dir());
        assert_eq!(fs::read_dir(cache).unwrap().count(), 0);
    }

    #[test]
    fn cache_flush_is_idempotent_on_missing_dir() {
        let tmp = Scratch::new();
        let missing = tmp.path().join("does-not-exist");
        flush_cache(&missing).expect("flush on missing dir is a no-op");
    }
}