snapdir_core/cache.rs
1//! XDG content-addressable cache with the `cache-id` integrity-check mechanism.
2//!
3//! A snapdir *cache* is just a local content-addressable store (the same
4//! sharded layout as a `file://` store): objects live under
5//! `<cache_dir>/.objects/<h[0..3]>/<h[3..6]>/<h[6..9]>/<h[9..]>` and manifests
6//! under `<cache_dir>/.manifests/<id…>`. This module mirrors the cache-side
7//! integrity machinery of the Bash oracle:
8//!
9//! - [`check_snapshot_integrity`] mirrors `_snapdir_check_integrity` (`snapdir`
10//! ~L1691): given a snapshot id and a cache directory, assert the manifest is
11//! present locally, then verify every **file** object referenced by the
12//! manifest hashes (BLAKE3) to the checksum it is filed under. This is the
13//! "verify a cached snapshot by its id" check at the heart of
14//! `checkout`/`verify`.
15//! - [`verify_cache`] mirrors `verify-cache` (`snapdir` ~L1011): enumerate every
16//! object under `.objects/*/*/*/*`, recompute its hash, and compare the actual
17//! hash to the **expected** hash encoded by the object's own sharded path (the
18//! path *is* the content address). Collect mismatches; when `purge` is set,
19//! delete the corrupt objects.
20//! - [`flush_cache`] mirrors `flush-cache` (`snapdir` ~L1061): empty the cache
21//! directory, idempotent on a missing dir.
22//!
23//! Per the library-purity principle this module performs no terminal I/O and
24//! reads no `$HOME`/`XDG`/environment for behavior. The cache directory is a
25//! parameter; the CLI lane resolves `${XDG_CACHE_HOME:-$HOME/.cache}/snapdir`.
26//! Hashing is in-process via the [`Hasher`] abstraction (the shipped default is
27//! BLAKE3); we never shell out to `b3sum`. The sharded path layout is reused
28//! from [`crate::store`] (`object_path`/`manifest_path`); it is not
29//! reimplemented here.
30
31use std::path::{Path, PathBuf};
32
33use thiserror::Error;
34
35use crate::manifest::{Manifest, PathType};
36use crate::merkle::Hasher;
37use crate::store::{manifest_path, object_path, OBJECTS_DIR};
38
/// Errors the cache integrity machinery can surface.
///
/// The enum is `#[non_exhaustive]`, so matches written outside this crate need
/// a wildcard arm. `Parse` and `Io` carry `#[from]` conversions, which is what
/// lets the functions in this module propagate those failures with `?`.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum CacheError {
    /// The manifest for the requested snapshot id was not present in the cache.
    ///
    /// Returned by [`load_cached_manifest`] when reading the manifest file
    /// fails with `NotFound`. Mirrors the oracle's "Manifest not found locally.
    /// Did you forget to fetch …?" failure in `_snapdir_check_integrity`.
    #[error("manifest not found locally for {id}. Did you forget to fetch {id} from the store?")]
    ManifestNotFound {
        /// The snapshot id that was looked up.
        id: String,
    },

    /// A file object referenced by the manifest was missing from the cache.
    ///
    /// Returned by [`check_manifest_integrity`] when reading the object at its
    /// sharded path fails with `NotFound`.
    #[error("object not found in cache: {checksum}")]
    ObjectNotFound {
        /// The object checksum (content address) that was looked up.
        checksum: String,
    },

    /// A cached object's bytes did not hash to the address it is filed under —
    /// the object is corrupt or tampered.
    ///
    /// Returned by [`check_manifest_integrity`]; [`verify_cache`] reports the
    /// same condition through [`CacheReport::corrupt`] instead of an error.
    #[error("checksum mismatch for {expected}: cached bytes hash to {actual}")]
    Integrity {
        /// The checksum the object is filed under (its content address).
        expected: String,
        /// The checksum actually computed over the cached bytes.
        actual: String,
    },

    /// A manifest's text could not be parsed.
    #[error("failed to parse cached manifest: {0}")]
    Parse(#[from] crate::manifest::ParseError),

    /// An underlying filesystem failure (any I/O error that is not mapped to
    /// one of the more specific "not found" variants above).
    #[error("cache I/O error: {0}")]
    Io(#[from] std::io::Error),
}
78
79/// Loads a cached manifest by snapshot `id` from `cache_dir`.
80///
81/// Reads `<cache_dir>/.manifests/<id…>` (the sharded manifest path) and parses
82/// it. This is the "manifest must be present locally" precondition of
83/// [`check_snapshot_integrity`], exposed on its own for callers that have only
84/// an id and a cache directory.
85///
86/// # Errors
87///
88/// - [`CacheError::ManifestNotFound`] if no manifest is filed under `id`,
89/// matching the oracle's `test -f … || { echo "…did you forget to fetch…" }`.
90/// - [`CacheError::Parse`] if the cached bytes are not a valid manifest.
91/// - [`CacheError::Io`] on any other read failure.
92pub fn load_cached_manifest(cache_dir: &Path, id: &str) -> Result<Manifest, CacheError> {
93 let path = cache_dir.join(manifest_path(id));
94 let text = match std::fs::read_to_string(&path) {
95 Ok(text) => text,
96 Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
97 return Err(CacheError::ManifestNotFound { id: id.to_owned() });
98 }
99 Err(err) => return Err(CacheError::Io(err)),
100 };
101 Ok(Manifest::parse(&text)?)
102}
103
104/// Verifies a cached snapshot by its id — mirrors `_snapdir_check_integrity`.
105///
106/// First asserts the manifest for `id` is present locally (loading it from
107/// `<cache_dir>/.manifests/<id…>`), then, for every **file** entry of the
108/// manifest (directory entries — whose path ends `/` — are excluded, exactly as
109/// the oracle's `grep -v "/$"`), verifies that the cached object at its sharded
110/// path hashes via `hasher` to the checksum it is filed under (column 3 of the
111/// manifest line, i.e. the object's content address).
112///
113/// The oracle pipes `checksum path` pairs into `b3sum --check`; this reproduces
114/// that check in-process. The first corrupt or missing object short-circuits
115/// with an error, matching `b3sum --check`'s non-zero exit.
116///
117/// # Errors
118///
119/// - [`CacheError::ManifestNotFound`] if the snapshot's manifest is absent.
120/// - [`CacheError::ObjectNotFound`] if a referenced file object is missing.
121/// - [`CacheError::Integrity`] if a cached object does not hash to its address.
122/// - [`CacheError::Parse`] / [`CacheError::Io`] on read/parse failure.
123pub fn check_snapshot_integrity(
124 cache_dir: &Path,
125 id: &str,
126 hasher: &dyn Hasher,
127) -> Result<(), CacheError> {
128 let manifest = load_cached_manifest(cache_dir, id)?;
129 check_manifest_integrity(cache_dir, &manifest, hasher)
130}
131
132/// Like [`check_snapshot_integrity`] but for an already-loaded [`Manifest`].
133///
134/// Skips the `.manifests/<id…>` lookup (the caller already holds the manifest)
135/// and verifies every file object referenced by `manifest` against its content
136/// address. Used internally by [`check_snapshot_integrity`]; exposed for callers
137/// that fetched the manifest themselves.
138///
139/// # Errors
140///
141/// - [`CacheError::ObjectNotFound`] if a referenced file object is missing.
142/// - [`CacheError::Integrity`] if a cached object does not hash to its address.
143/// - [`CacheError::Io`] on a read failure.
144pub fn check_manifest_integrity(
145 cache_dir: &Path,
146 manifest: &Manifest,
147 hasher: &dyn Hasher,
148) -> Result<(), CacheError> {
149 for entry in manifest.entries() {
150 // Directory lines are excluded from the object check (oracle:
151 // `grep -v "/$"`). Directory `D` entries always have a trailing-slash
152 // path; gate on the type, which is the structural truth behind that.
153 if entry.path_type == PathType::Directory {
154 continue;
155 }
156 let checksum = &entry.checksum;
157 let object = cache_dir.join(object_path(checksum));
158 let bytes = match std::fs::read(&object) {
159 Ok(bytes) => bytes,
160 Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
161 return Err(CacheError::ObjectNotFound {
162 checksum: checksum.clone(),
163 });
164 }
165 Err(err) => return Err(CacheError::Io(err)),
166 };
167 let actual = hasher.hash_hex(&bytes);
168 if &actual != checksum {
169 return Err(CacheError::Integrity {
170 expected: checksum.clone(),
171 actual,
172 });
173 }
174 }
175 Ok(())
176}
177
/// Outcome of a whole-cache scan by [`verify_cache`].
///
/// The `Default` value (zero checked, both lists empty) is what
/// [`verify_cache`] returns when the `.objects` directory is absent.
/// [`verify_cache`] sorts both lists before returning, so their order does not
/// depend on directory-listing order.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CacheReport {
    /// Number of objects scanned (every `.objects/*/*/*/*` entry).
    pub checked: usize,
    /// Content addresses (expected checksums) whose cached bytes did not hash
    /// back to the address — i.e. corrupt or tampered objects.
    pub corrupt: Vec<String>,
    /// Content addresses that were deleted because `purge` was set (a subset of
    /// `corrupt`; empty when `purge` is false).
    pub purged: Vec<String>,
}
190
191impl CacheReport {
192 /// Returns `true` when no corruption was detected (the oracle exits 0).
193 #[must_use]
194 pub fn is_clean(&self) -> bool {
195 self.corrupt.is_empty()
196 }
197}
198
199/// Verifies every object in the cache — mirrors `snapdir verify-cache`.
200///
201/// Enumerates every object at `<cache_dir>/.objects/*/*/*/*`, recomputes its
202/// hash via `hasher`, and compares it to the **expected** checksum encoded by
203/// the object's own sharded path (the path is the content address). The
204/// expected checksum is reconstructed exactly as the oracle does
205/// (`sed 's| .*.objects/| |; s|/||g'`): concatenate the four path segments after
206/// `.objects/` with the separators removed.
207///
208/// Returns a [`CacheReport`]: how many objects were checked, which were corrupt,
209/// and — when `purge` is set — which were deleted. An absent or empty
210/// `.objects` directory is a clean pass with zero checked, matching the oracle's
211/// `test -d "${cache_dir}/.objects" || return 0`.
212///
213/// # Errors
214///
215/// - [`CacheError::Io`] on a directory-traversal or read failure (other than the
216/// `.objects` directory simply being absent, which is a clean pass).
217pub fn verify_cache(
218 cache_dir: &Path,
219 purge: bool,
220 hasher: &dyn Hasher,
221) -> Result<CacheReport, CacheError> {
222 let objects_root = cache_dir.join(OBJECTS_DIR);
223 if !objects_root.is_dir() {
224 // Oracle: `test -d "${cache_dir}"/.objects || return 0`.
225 return Ok(CacheReport::default());
226 }
227
228 let mut report = CacheReport::default();
229
230 // The oracle globs exactly `.objects/*/*/*/*` — three intermediate shard
231 // levels then the leaf file. Walk those four levels deterministically.
232 for path in collect_objects(&objects_root)? {
233 report.checked += 1;
234
235 // Reconstruct the expected checksum from the path: the four components
236 // below `.objects/` concatenated (oracle `sed` strips the separators).
237 let Some(expected) = expected_checksum_from_path(&objects_root, &path) else {
238 continue;
239 };
240
241 let bytes = std::fs::read(&path)?;
242 let actual = hasher.hash_hex(&bytes);
243
244 if actual != expected {
245 report.corrupt.push(expected.clone());
246 if purge {
247 // Oracle: `rm "${cache_dir}/$(_snapdir_get_object_rel_path …)"`.
248 std::fs::remove_file(&path)?;
249 report.purged.push(expected);
250 }
251 }
252 }
253
254 // Deterministic order regardless of filesystem readdir order.
255 report.corrupt.sort();
256 report.purged.sort();
257 Ok(report)
258}
259
260/// Collects every object at exactly `<objects_root>/*/*/*/*` (three shard levels
261/// then the leaf), mirroring the oracle's `.objects/*/*/*/*` glob.
262fn collect_objects(objects_root: &Path) -> Result<Vec<PathBuf>, CacheError> {
263 let mut out = Vec::new();
264 for l0 in read_subdirs(objects_root)? {
265 for l1 in read_subdirs(&l0)? {
266 for l2 in read_subdirs(&l1)? {
267 for entry in std::fs::read_dir(&l2)? {
268 let path = entry?.path();
269 if path.is_file() {
270 out.push(path);
271 }
272 }
273 }
274 }
275 }
276 out.sort();
277 Ok(out)
278}
279
280/// Returns the immediate subdirectories of `dir`.
281fn read_subdirs(dir: &Path) -> Result<Vec<PathBuf>, CacheError> {
282 let mut out = Vec::new();
283 for entry in std::fs::read_dir(dir)? {
284 let path = entry?.path();
285 if path.is_dir() {
286 out.push(path);
287 }
288 }
289 Ok(out)
290}
291
/// Derives an object's content address (its expected checksum) from where it
/// sits under `objects_root`.
///
/// Equivalent to the oracle's `sed 's| .*.objects/| |; s|/||g'`: the path
/// components below `.objects/` are glued together with the separators
/// dropped.
///
/// Returns `None` when `object` is not located under `objects_root`, or when
/// any of its components is not valid UTF-8.
fn expected_checksum_from_path(objects_root: &Path, object: &Path) -> Option<String> {
    object
        .strip_prefix(objects_root)
        .ok()?
        .components()
        .map(|segment| segment.as_os_str().to_str())
        .collect()
}
304
305/// Empties the local cache — mirrors `snapdir flush-cache`.
306///
307/// Removes the cache directory's contents (objects and manifests). The oracle
308/// does `rm -rf "${cache_dir}"`; this removes the directory's *contents* so the
309/// directory itself (which the caller may have created) survives, while still
310/// leaving the cache empty. Idempotent on a missing cache directory (a clean
311/// no-op pass).
312///
313/// # Errors
314///
315/// - [`CacheError::Io`] on a removal failure other than the directory simply
316/// being absent.
317pub fn flush_cache(cache_dir: &Path) -> Result<(), CacheError> {
318 match std::fs::read_dir(cache_dir) {
319 Ok(entries) => {
320 for entry in entries {
321 let path = entry?.path();
322 if path.is_dir() {
323 std::fs::remove_dir_all(&path)?;
324 } else {
325 std::fs::remove_file(&path)?;
326 }
327 }
328 Ok(())
329 }
330 // A missing cache dir is already "empty" — idempotent no-op.
331 Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()),
332 Err(err) => Err(CacheError::Io(err)),
333 }
334}
335
#[cfg(test)]
mod tests {
    use super::*;
    use crate::manifest::ManifestEntry;
    use crate::merkle::Blake3Hasher;
    use std::fs;
    use std::path::{Path, PathBuf};
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Name of the manifests directory inside a cache, as the fixtures below
    /// lay it out on disk.
    const MANIFESTS_DIR_TEST: &str = ".manifests";

    /// A self-cleaning scratch directory under the system temp dir. Mirrors the
    /// helper in `walk.rs`, deliberately avoiding a `tempfile` dev-dependency:
    /// the cache module is library-pure and never reads the environment itself —
    /// only this test harness builds fixtures on disk.
    struct Scratch {
        path: PathBuf,
    }

    impl Scratch {
        /// Creates a fresh, empty directory named after the process id and a
        /// per-process counter.
        fn new() -> Self {
            static COUNTER: AtomicU64 = AtomicU64::new(0);
            let n = COUNTER.fetch_add(1, Ordering::Relaxed);
            let pid = std::process::id();
            let base = std::env::temp_dir();
            let path = base.join(format!("snapdir-cache-test-{pid}-{n}"));
            // `Drop` does not run when a test process is killed, so a previous
            // run with a recycled pid may have left fixtures behind. Start clean.
            let _ = fs::remove_dir_all(&path);
            fs::create_dir_all(&path).expect("create scratch dir");
            Scratch { path }
        }

        /// The scratch directory's path.
        fn path(&self) -> &Path {
            &self.path
        }
    }

    impl Drop for Scratch {
        fn drop(&mut self) {
            // Best-effort cleanup; a failure here must not fail the test.
            let _ = fs::remove_dir_all(&self.path);
        }
    }

    /// Writes `bytes` to the cache as an object filed under its real BLAKE3
    /// address, returning that checksum.
    fn put_object(cache_dir: &Path, bytes: &[u8]) -> String {
        let checksum = Blake3Hasher.hash_hex(bytes);
        let path = cache_dir.join(object_path(&checksum));
        fs::create_dir_all(path.parent().unwrap()).unwrap();
        fs::write(&path, bytes).unwrap();
        checksum
    }

    /// Writes `manifest`'s text form to the cache, filed under `id`.
    fn put_manifest(cache_dir: &Path, id: &str, manifest: &Manifest) {
        let path = cache_dir.join(manifest_path(id));
        fs::create_dir_all(path.parent().unwrap()).unwrap();
        fs::write(&path, manifest.to_string()).unwrap();
    }

    /// Builds a small clean cache: a root dir entry + two file objects, with a
    /// manifest filed under `id`. Returns `(id, foo checksum, bar checksum)`.
    fn build_clean_cache(cache_dir: &Path) -> (String, String, String) {
        let foo = b"foo\n";
        let bar = b"bar\n";
        let foo_sum = put_object(cache_dir, foo);
        let bar_sum = put_object(cache_dir, bar);

        let mut manifest = Manifest::new();
        manifest.push(ManifestEntry::new(
            PathType::Directory,
            "700",
            "rootsum",
            0,
            "./",
        ));
        manifest.push(ManifestEntry::new(
            PathType::File,
            "600",
            &foo_sum,
            foo.len() as u64,
            "./foo",
        ));
        manifest.push(ManifestEntry::new(
            PathType::File,
            "600",
            &bar_sum,
            bar.len() as u64,
            "./bar",
        ));

        let id = "cafef00dcafef00dcafef00dcafef00dcafef00dcafef00dcafef00dcafef00d".to_string();
        put_manifest(cache_dir, &id, &manifest);
        (id, foo_sum, bar_sum)
    }

    #[test]
    fn cache_clean_passes_integrity_and_verify() {
        let tmp = Scratch::new();
        let (id, _foo, _bar) = build_clean_cache(tmp.path());

        check_snapshot_integrity(tmp.path(), &id, &Blake3Hasher).expect("clean cache passes");

        let report = verify_cache(tmp.path(), false, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 2, "two objects scanned");
        assert!(report.is_clean(), "no corruption: {report:?}");
        assert!(report.purged.is_empty());
    }

    #[test]
    fn cache_tampered_object_detected_by_both_checks() {
        let tmp = Scratch::new();
        let (id, foo_sum, _bar) = build_clean_cache(tmp.path());

        // Tamper with one object's bytes in place (path/address unchanged).
        let foo_path = tmp.path().join(object_path(&foo_sum));
        fs::write(&foo_path, b"TAMPERED").unwrap();

        // check_snapshot_integrity: the file object no longer hashes to its
        // manifest checksum.
        match check_snapshot_integrity(tmp.path(), &id, &Blake3Hasher) {
            Err(CacheError::Integrity { expected, .. }) => assert_eq!(expected, foo_sum),
            other => panic!("expected Integrity error, got {other:?}"),
        }

        // verify_cache: the object's bytes no longer match its path-encoded
        // address.
        let report = verify_cache(tmp.path(), false, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 2);
        assert_eq!(report.corrupt, vec![foo_sum.clone()]);
        assert!(report.purged.is_empty(), "no purge without flag");
        assert!(!report.is_clean());
        // The corrupt object is still on disk (not purged).
        assert!(foo_path.exists());
    }

    #[test]
    fn cache_purge_removes_only_corrupt_object() {
        let tmp = Scratch::new();
        let (_id, foo_sum, bar_sum) = build_clean_cache(tmp.path());

        let foo_path = tmp.path().join(object_path(&foo_sum));
        let bar_path = tmp.path().join(object_path(&bar_sum));
        fs::write(&foo_path, b"TAMPERED").unwrap();

        let report = verify_cache(tmp.path(), true, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 2);
        assert_eq!(report.corrupt, vec![foo_sum.clone()]);
        assert_eq!(report.purged, vec![foo_sum]);
        assert!(!foo_path.exists(), "corrupt object purged");
        assert!(bar_path.exists(), "clean object kept");

        // A re-scan now sees only the surviving clean object and passes.
        let rescan = verify_cache(tmp.path(), false, &Blake3Hasher).unwrap();
        assert_eq!(rescan.checked, 1);
        assert!(rescan.is_clean());
    }

    #[test]
    fn cache_missing_manifest_yields_not_found() {
        let tmp = Scratch::new();
        let id = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
        match check_snapshot_integrity(tmp.path(), id, &Blake3Hasher) {
            Err(CacheError::ManifestNotFound { id: got }) => assert_eq!(got, id),
            other => panic!("expected ManifestNotFound, got {other:?}"),
        }
    }

    #[test]
    fn cache_missing_object_yields_not_found() {
        let tmp = Scratch::new();
        let (id, foo_sum, _bar) = build_clean_cache(tmp.path());
        // Delete one referenced object but keep the manifest.
        fs::remove_file(tmp.path().join(object_path(&foo_sum))).unwrap();
        match check_snapshot_integrity(tmp.path(), &id, &Blake3Hasher) {
            Err(CacheError::ObjectNotFound { checksum }) => assert_eq!(checksum, foo_sum),
            other => panic!("expected ObjectNotFound, got {other:?}"),
        }
    }

    #[test]
    fn cache_directory_lines_excluded_from_integrity() {
        // A manifest whose only entry is a directory (no file objects on disk)
        // still passes integrity — directory lines are excluded.
        let tmp = Scratch::new();
        let mut manifest = Manifest::new();
        manifest.push(ManifestEntry::new(
            PathType::Directory,
            "700",
            "deadbeef",
            0,
            "./",
        ));
        let id = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        put_manifest(tmp.path(), id, &manifest);
        check_snapshot_integrity(tmp.path(), id, &Blake3Hasher)
            .expect("directory-only manifest passes");
    }

    #[test]
    fn cache_empty_or_absent_objects_dir_is_clean_pass() {
        // Absent .objects entirely.
        let tmp = Scratch::new();
        let report = verify_cache(tmp.path(), false, &Blake3Hasher).unwrap();
        assert_eq!(report, CacheReport::default());
        assert!(report.is_clean());
        assert_eq!(report.checked, 0);

        // Present-but-empty .objects.
        fs::create_dir_all(tmp.path().join(OBJECTS_DIR)).unwrap();
        let report = verify_cache(tmp.path(), false, &Blake3Hasher).unwrap();
        assert_eq!(report.checked, 0);
        assert!(report.is_clean());
    }

    #[test]
    fn cache_verify_reconstructs_expected_checksum_from_path() {
        // Directly guard the sed-equivalent path->checksum reconstruction: an
        // object filed under a known address reconstructs exactly that address.
        let tmp = Scratch::new();
        let checksum = put_object(tmp.path(), b"hello cache\n");
        let objects_root = tmp.path().join(OBJECTS_DIR);
        let object = tmp.path().join(object_path(&checksum));
        let got = expected_checksum_from_path(&objects_root, &object).unwrap();
        assert_eq!(got, checksum);
    }

    #[test]
    fn cache_flush_empties_objects_and_manifests() {
        let tmp = Scratch::new();
        let (_id, _foo, _bar) = build_clean_cache(tmp.path());
        assert!(tmp.path().join(OBJECTS_DIR).exists());
        assert!(tmp.path().join(MANIFESTS_DIR_TEST).exists());

        flush_cache(tmp.path()).expect("flush succeeds");

        assert!(!tmp.path().join(OBJECTS_DIR).exists());
        assert!(!tmp.path().join(MANIFESTS_DIR_TEST).exists());
        // The cache dir itself survives and is empty.
        assert!(tmp.path().is_dir());
        assert_eq!(fs::read_dir(tmp.path()).unwrap().count(), 0);
    }

    #[test]
    fn cache_flush_is_idempotent_on_missing_dir() {
        let tmp = Scratch::new();
        let missing = tmp.path().join("does-not-exist");
        flush_cache(&missing).expect("flush on missing dir is a no-op");
    }
}