Skip to main content

snapdir_core/
merkle.rs

1//! Directory checksum (merkle) computation over manifest entries.
2//!
3//! snapdir derives a directory's checksum from the checksums of its **direct
4//! children** — not from the directory's own bytes. The oracle
5//! (`snapdir-manifest`) computes it with:
6//!
7//! ```sh
8//! dir_checksums="$(echo "$dir_manifest" | cut -d' ' -f3 | sort -u | tr -d '\n')"
9//! dir_checksum="$(echo -n "$dir_checksums" | _snapdir_manifest_checksum)"
10//! ```
11//!
12//! that is: take the **CHECKSUM field** (column 3) of each direct child entry,
13//! **sort** them lexicographically, **dedup** (`sort -u`), **concatenate with
14//! no separator**, then **re-hash** the resulting byte string with the same
15//! checksum function (BLAKE3 `--no-names` by default).
16//!
17//! The directory checksum is **not** the snapshot id. The root directory
18//! checksum is just the CHECKSUM field of the `D ./` line. The **snapshot id**
19//! is a distinct value: BLAKE3 of the full manifest text (with `#`-comment
20//! lines removed, including the trailing newline `echo` adds). See
21//! [`snapshot_id`].
22//!
23//! Edge cases, confirmed against the oracle:
24//!
25//! - An **empty directory** has no children, so the concatenation is the empty
26//!   string and its checksum is `blake3("")` =
27//!   `af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262`.
28//! - Identical child checksums collapse under `sort -u`: a directory holding
29//!   two empty files (both `af1349b9…`) hashes the single deduped value
30//!   `af1349b9…`, yielding `dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b`.
31//!
32//! Per the library-purity principle this module performs no terminal I/O and
33//! reads no environment; hashing is in-process via the [`blake3`] crate. We
34//! never shell out to `b3sum`. The [`Hasher`] trait leaves room for the
35//! `--checksum-bin` (md5/sha256) abstraction to slot in later without changing
36//! the merkle algorithm.
37
38use crate::manifest::Manifest;
39
40/// A checksum function over an in-memory byte string.
41///
42/// The merkle rule is independent of the concrete hash: it sorts, dedups and
43/// concatenates child checksum *strings* and hands the bytes to a `Hasher`.
44/// The shipped default is in-process BLAKE3 ([`Blake3Hasher`]); the
45/// `--checksum-bin` matrix (md5/sha256) can add further implementations later.
46pub trait Hasher {
47    /// Returns the lowercase hex checksum of `bytes`.
48    fn hash_hex(&self, bytes: &[u8]) -> String;
49}
50
51/// In-process BLAKE3 hasher, equivalent to the oracle's default
52/// `b3sum --no-names`.
53///
54/// This is the shipped default. It hashes the input bytes and renders the
55/// 32-byte digest as lowercase hex, matching `b3sum --no-names` exactly.
56#[derive(Debug, Clone, Copy, Default)]
57pub struct Blake3Hasher;
58
59impl Blake3Hasher {
60    /// Creates a new BLAKE3 hasher.
61    #[must_use]
62    pub const fn new() -> Self {
63        Self
64    }
65}
66
67impl Hasher for Blake3Hasher {
68    fn hash_hex(&self, bytes: &[u8]) -> String {
69        blake3::hash(bytes).to_hex().to_string()
70    }
71}
72
73/// Keyed (derived-key) BLAKE3 hasher, equivalent to the oracle's
74/// `b3sum --derive-key="<context>" --no-names`.
75///
76/// The oracle switches to keyed mode whenever the `SNAPDIR_MANIFEST_CONTEXT`
77/// environment variable is non-empty (and only for `b3sum`; see
78/// `_snapdir_manifest_define_checksum_fn`). Per the library-purity principle,
79/// `snapdir-core` does **not** read that environment variable: the CLI lane
80/// reads it and constructs this hasher with the context string passed in as a
81/// parameter.
82///
83/// The digest is `blake3::derive_key(context, input)` rendered as lowercase
84/// hex, byte-for-byte identical to `b3sum --derive-key=<context> --no-names`.
85#[derive(Debug, Clone)]
86pub struct Blake3KeyedHasher {
87    context: String,
88}
89
90impl Blake3KeyedHasher {
91    /// Creates a keyed BLAKE3 hasher deriving its key from `context`.
92    ///
93    /// `context` is the `SNAPDIR_MANIFEST_CONTEXT` value the CLI lane reads
94    /// from the environment; core never reads it itself.
95    #[must_use]
96    pub fn new(context: impl Into<String>) -> Self {
97        Self {
98            context: context.into(),
99        }
100    }
101}
102
103impl Hasher for Blake3KeyedHasher {
104    fn hash_hex(&self, bytes: &[u8]) -> String {
105        // `blake3::Hasher::new_derive_key(context)` then update+finalize is the
106        // streaming equivalent of `blake3::derive_key(context, bytes)`; both
107        // match `b3sum --derive-key=<context> --no-names`.
108        let mut hasher = blake3::Hasher::new_derive_key(&self.context);
109        hasher.update(bytes);
110        hasher.finalize().to_hex().to_string()
111    }
112}
113
114/// In-process MD5 hasher, equivalent to the oracle's
115/// `md5sum | cut -d' ' -f1`.
116///
117/// The oracle parses non-`b3sum` checksum binaries with `cut -d' ' -f1`, i.e.
118/// it keeps only the leading lowercase-hex digest and drops the filename. This
119/// reproduces that digest in-process via the pure-Rust [`md5`](md_5) crate; we
120/// never shell out to `md5sum`.
121#[derive(Debug, Clone, Copy, Default)]
122pub struct Md5Hasher;
123
124impl Md5Hasher {
125    /// Creates a new MD5 hasher.
126    #[must_use]
127    pub const fn new() -> Self {
128        Self
129    }
130}
131
132impl Hasher for Md5Hasher {
133    fn hash_hex(&self, bytes: &[u8]) -> String {
134        use md5::{Digest, Md5};
135        let digest = Md5::digest(bytes);
136        hex_lower(&digest)
137    }
138}
139
140/// In-process SHA-256 hasher, equivalent to the oracle's
141/// `sha256sum | cut -d' ' -f1`.
142///
143/// As with [`Md5Hasher`], this reproduces the leading lowercase-hex digest the
144/// oracle keeps after `cut -d' ' -f1`, in-process via the pure-Rust [`sha2`]
145/// crate; we never shell out to `sha256sum`.
146#[derive(Debug, Clone, Copy, Default)]
147pub struct Sha256Hasher;
148
149impl Sha256Hasher {
150    /// Creates a new SHA-256 hasher.
151    #[must_use]
152    pub const fn new() -> Self {
153        Self
154    }
155}
156
157impl Hasher for Sha256Hasher {
158    fn hash_hex(&self, bytes: &[u8]) -> String {
159        use sha2::{Digest, Sha256};
160        let digest = Sha256::digest(bytes);
161        hex_lower(&digest)
162    }
163}
164
165/// Renders raw digest bytes as a lowercase hex string.
166fn hex_lower(bytes: &[u8]) -> String {
167    use core::fmt::Write as _;
168    let mut out = String::with_capacity(bytes.len() * 2);
169    for byte in bytes {
170        // Infallible: writing to a String never errors.
171        let _ = write!(out, "{byte:02x}");
172    }
173    out
174}
175
176/// Computes a directory checksum from the checksums of its direct children.
177///
178/// Implements the oracle rule exactly: each child checksum string is **sorted**
179/// (lexicographic, byte-wise on the hex strings), **deduplicated** (`sort -u`),
180/// **concatenated with no separator**, and the resulting byte string is hashed
181/// with `hasher`.
182///
183/// `child_checksums` is the CHECKSUM field (column 3) of each direct child
184/// manifest line. Passing an empty iterator yields the hash of the empty
185/// string (an empty directory).
186///
187/// This is **not** the snapshot id. Computing it over the root directory's
188/// direct children yields the CHECKSUM field of the `D ./` line, not the id
189/// `snapdir id` reports. For the snapshot id, use [`snapshot_id`].
190pub fn directory_checksum<'a, I, H>(child_checksums: I, hasher: &H) -> String
191where
192    I: IntoIterator<Item = &'a str>,
193    H: Hasher,
194{
195    // `sort -u`: collect into a sorted, deduplicated set keyed on the hex
196    // string bytes (BTreeSet orders by Ord, which for &str is byte-wise — the
197    // same ordering `sort` uses in the C locale the oracle runs under).
198    let unique: std::collections::BTreeSet<&str> = child_checksums.into_iter().collect();
199
200    // `tr -d '\n'`: concatenate with no separator.
201    let mut concatenated = String::new();
202    for checksum in unique {
203        concatenated.push_str(checksum);
204    }
205
206    hasher.hash_hex(concatenated.as_bytes())
207}
208
209/// Computes the **snapshot id** of a manifest.
210///
211/// This is the value `snapdir id` reports, and the id under which a snapshot is
212/// stored. It is **distinct** from the root [`directory_checksum`]: the oracle
213/// (`snapdir` lines 259, 762, 436, 776) derives it as
214///
215/// ```sh
216/// snapshot_id="$(echo "$manifest" | grep -v '^#' | b3sum --no-names -)"
217/// ```
218///
219/// i.e. BLAKE3 over the **full manifest text** with `#`-comment lines removed.
220/// Because the oracle pipes the text through `echo`, the hashed bytes carry a
221/// single **trailing newline** after the last manifest line; reproducing the
222/// golden ids requires that newline.
223///
224/// [`Manifest`]'s [`Display`] already renders entries in `sort -k5` order with
225/// no trailing newline and excludes comments on parse, so this renders the
226/// manifest, appends the `echo` newline, and hashes the result with `hasher`.
227///
228/// [`Display`]: std::fmt::Display
229#[must_use]
230pub fn snapshot_id<H>(manifest: &Manifest, hasher: &H) -> String
231where
232    H: Hasher,
233{
234    let mut text = manifest.to_string();
235    // The oracle's `echo "$manifest"` appends a single trailing newline before
236    // the bytes reach `b3sum`. The golden ids only reproduce with it.
237    text.push('\n');
238    hasher.hash_hex(text.as_bytes())
239}
240
241#[cfg(test)]
242mod tests {
243    use super::*;
244
245    /// `blake3("")` — the empty-input / empty-directory checksum, as emitted by
246    /// `snapdir-manifest` for a truly empty directory (`D 700 af1349b9… 0 ./`).
247    const EMPTY_BLAKE3: &str = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";
248
249    /// The guide fixture root id: a directory containing two empty files, both
250    /// `af1349b9…`. After `sort -u` they collapse to one, so the directory
251    /// checksum is `blake3("af1349b9…")`.
252    /// (`utils/qa-fixtures/expected-guide-commands.txt` line 4.)
253    const TWO_EMPTY_FILES_ROOT_ID: &str =
254        "dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b";
255
256    #[test]
257    fn blake3_hasher_matches_b3sum_no_names_for_empty_input() {
258        let hasher = Blake3Hasher::new();
259        assert_eq!(hasher.hash_hex(b""), EMPTY_BLAKE3);
260    }
261
262    #[test]
263    fn empty_directory_checksum_is_hash_of_empty_string() {
264        // No children -> concatenation is "" -> blake3("").
265        let hasher = Blake3Hasher::new();
266        let no_children: [&str; 0] = [];
267        assert_eq!(directory_checksum(no_children, &hasher), EMPTY_BLAKE3);
268    }
269
270    #[test]
271    fn directory_checksum_matches_guide_fixture_empty_files_root() {
272        // The guide root dir holds two empty files (foo.txt, bar.txt), both
273        // hashing to af1349b9…. `sort -u` dedups to a single value, and the
274        // directory checksum is blake3 of that single value. (This is the
275        // `D ./` CHECKSUM field, not the snapshot id.)
276        let hasher = Blake3Hasher::new();
277        let children = [EMPTY_BLAKE3, EMPTY_BLAKE3];
278        assert_eq!(
279            directory_checksum(children, &hasher),
280            TWO_EMPTY_FILES_ROOT_ID
281        );
282    }
283
284    #[test]
285    fn directory_checksum_sorts_dedups_and_concatenates_in_order() {
286        // Synthetic multi-child case verifying the exact sort+dedup+concat
287        // pipeline independent of the hash: feed unsorted, duplicated child
288        // checksums and confirm the hashed input equals the sorted-unique
289        // concatenation.
290        let hasher = Blake3Hasher::new();
291
292        // Deliberately out of order, with a duplicate of "bbb".
293        let children = ["ccc", "aaa", "bbb", "bbb"];
294        let got = directory_checksum(children, &hasher);
295
296        // Expected: sort -> [aaa, bbb, ccc], dedup (no-op here beyond the dup),
297        // concat -> "aaabbbccc", then blake3 of those bytes.
298        let expected = blake3::hash(b"aaabbbccc").to_hex().to_string();
299        assert_eq!(got, expected);
300    }
301
302    #[test]
303    fn directory_checksum_dedup_collapses_identical_children() {
304        // All children identical -> a single value remains after `sort -u`.
305        let hasher = Blake3Hasher::new();
306        let children = ["zz", "zz", "zz"];
307        let got = directory_checksum(children, &hasher);
308        let expected = blake3::hash(b"zz").to_hex().to_string();
309        assert_eq!(got, expected);
310    }
311
312    #[test]
313    fn directory_checksum_is_order_independent_of_input_ordering() {
314        // The merkle rule sorts, so input ordering must not affect the result.
315        let hasher = Blake3Hasher::new();
316        let forward = directory_checksum(["a1", "b2", "c3"], &hasher);
317        let reverse = directory_checksum(["c3", "b2", "a1"], &hasher);
318        assert_eq!(forward, reverse);
319    }
320
321    #[test]
322    fn directory_checksum_is_the_root_d_line_field() {
323        // The root directory checksum is the CHECKSUM field of the `D ./` line
324        // — sort -u + concat + re-hash of the root's direct children. It is NOT
325        // the snapshot id (see the snapshot_id_* tests for that distinction).
326        let hasher = Blake3Hasher::new();
327        let root_children = [EMPTY_BLAKE3, EMPTY_BLAKE3];
328        let root_d_line_checksum = directory_checksum(root_children, &hasher);
329        assert_eq!(root_d_line_checksum, TWO_EMPTY_FILES_ROOT_ID);
330    }
331
332    /// The guide's empty-files manifest (two duplicate empty files), copied
333    /// verbatim from `utils/qa-fixtures/expected-guide-commands.txt`.
334    const EMPTY_FILES_MANIFEST: &str = "\
335D 700 dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b 0 ./
336F 600 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./bar.txt
337F 600 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./foo.txt";
338
339    /// The guide's modified manifest (after `echo "foo" > foo.txt`).
340    const MODIFIED_MANIFEST: &str = "\
341D 700 4a0732cfb45ebe9d8d572fc4c77b759384bed029911e35f8859430b889427d4d 4 ./
342F 600 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./bar.txt
343F 600 49dc870df1de7fd60794cebce449f5ccdae575affaa67a24b62acb03e039db92 4 ./foo.txt";
344
345    /// Golden snapshot ids (`snapdir id`) == BLAKE3 of the whole manifest text
346    /// with `#`-comment lines removed, trailing `echo` newline included.
347    const EMPTY_FILES_SNAPSHOT_ID: &str =
348        "c678a299380893769bd7795628b96147229b410a9d5a5b7cae563bcae3c27857";
349    const MODIFIED_SNAPSHOT_ID: &str =
350        "8af03a1bec09b1838d2c4f56c6940ed35ccdad1064243d2d775e8347ba82b9be";
351
352    #[test]
353    fn snapshot_id_reproduces_empty_files_golden_id() {
354        let hasher = Blake3Hasher::new();
355        let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
356        assert_eq!(snapshot_id(&manifest, &hasher), EMPTY_FILES_SNAPSHOT_ID);
357    }
358
359    #[test]
360    fn snapshot_id_reproduces_modified_golden_id() {
361        let hasher = Blake3Hasher::new();
362        let manifest = Manifest::parse(MODIFIED_MANIFEST).expect("parses");
363        assert_eq!(snapshot_id(&manifest, &hasher), MODIFIED_SNAPSHOT_ID);
364    }
365
366    #[test]
367    fn snapshot_id_requires_the_trailing_newline() {
368        // Guard the exact byte handling: the oracle's `echo` appends a trailing
369        // newline, so hashing the manifest text WITHOUT it must NOT match the
370        // golden id (and snapshot_id, which adds it, must).
371        let hasher = Blake3Hasher::new();
372        let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
373        let without_newline = hasher.hash_hex(manifest.to_string().as_bytes());
374        assert_ne!(without_newline, EMPTY_FILES_SNAPSHOT_ID);
375        assert_eq!(snapshot_id(&manifest, &hasher), EMPTY_FILES_SNAPSHOT_ID);
376    }
377
378    #[test]
379    fn snapshot_id_ignores_comment_lines() {
380        // `#`-comment lines are stripped on parse, so a manifest with comments
381        // yields the same snapshot id as one without.
382        let hasher = Blake3Hasher::new();
383        let with_comments = format!("# generated by snapdir\n{EMPTY_FILES_MANIFEST}\n# eof");
384        let manifest = Manifest::parse(&with_comments).expect("parses");
385        assert_eq!(snapshot_id(&manifest, &hasher), EMPTY_FILES_SNAPSHOT_ID);
386    }
387
388    // --- --checksum-bin abstraction: md5sum / sha256sum -------------------
389
390    #[test]
391    fn golden_md5_known_vectors() {
392        // Standard MD5 test vectors, lowercase hex (what the oracle keeps after
393        // `md5sum | cut -d' ' -f1`).
394        let hasher = Md5Hasher::new();
395        assert_eq!(hasher.hash_hex(b""), "d41d8cd98f00b204e9800998ecf8427e");
396        // md5("abc")
397        assert_eq!(hasher.hash_hex(b"abc"), "900150983cd24fb0d6963f7d28e17f72");
398        // md5("foo\n") — matches `printf 'foo\n' | md5sum` (the guide's foo.txt).
399        assert_eq!(
400            hasher.hash_hex(b"foo\n"),
401            "d3b07384d113edec49eaa6238ad5ff00"
402        );
403    }
404
405    #[test]
406    fn golden_md5_works_with_directory_checksum_and_snapshot_id() {
407        // The merkle rule and snapshot id are hash-agnostic: they must run
408        // unchanged with the MD5 hasher.
409        let hasher = Md5Hasher::new();
410        // directory_checksum = md5(sorted-unique-concat of children).
411        let dir = directory_checksum(["ccc", "aaa", "bbb", "bbb"], &hasher);
412        assert_eq!(dir, hasher.hash_hex(b"aaabbbccc"));
413
414        // snapshot_id = md5(manifest text + trailing newline).
415        let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
416        let text = format!("{manifest}\n");
417        assert_eq!(
418            snapshot_id(&manifest, &hasher),
419            hasher.hash_hex(text.as_bytes())
420        );
421    }
422
423    #[test]
424    fn golden_sha256_known_vectors() {
425        // Standard SHA-256 test vectors, lowercase hex (what the oracle keeps
426        // after `sha256sum | cut -d' ' -f1`).
427        let hasher = Sha256Hasher::new();
428        assert_eq!(
429            hasher.hash_hex(b""),
430            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
431        );
432        // sha256("abc")
433        assert_eq!(
434            hasher.hash_hex(b"abc"),
435            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
436        );
437        // sha256("foo\n") — matches `printf 'foo\n' | sha256sum`.
438        assert_eq!(
439            hasher.hash_hex(b"foo\n"),
440            "b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c"
441        );
442    }
443
444    #[test]
445    fn golden_sha256_works_with_directory_checksum_and_snapshot_id() {
446        let hasher = Sha256Hasher::new();
447        let dir = directory_checksum(["ccc", "aaa", "bbb", "bbb"], &hasher);
448        assert_eq!(dir, hasher.hash_hex(b"aaabbbccc"));
449
450        let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
451        let text = format!("{manifest}\n");
452        assert_eq!(
453            snapshot_id(&manifest, &hasher),
454            hasher.hash_hex(text.as_bytes())
455        );
456    }
457
458    // --- keyed mode (SNAPDIR_MANIFEST_CONTEXT) ----------------------------
459
460    #[test]
461    fn keyed_context_matches_blake3_derive_key_and_differs_from_unkeyed() {
462        // The oracle's keyed mode is `b3sum --derive-key=<ctx> --no-names`,
463        // which is exactly `blake3::derive_key(ctx, input)`.
464        let context = "snapdir 2026 test context";
465        let input = b"the quick brown fox";
466
467        let keyed = Blake3KeyedHasher::new(context);
468        let expected = blake3::derive_key(context, input);
469        let expected_hex = blake3::Hash::from(expected).to_hex().to_string();
470        assert_eq!(keyed.hash_hex(input), expected_hex);
471
472        // Keyed digest must differ from the unkeyed default.
473        let unkeyed = Blake3Hasher::new();
474        assert_ne!(keyed.hash_hex(input), unkeyed.hash_hex(input));
475
476        // Different contexts produce different digests for the same input.
477        let other = Blake3KeyedHasher::new("a different context");
478        assert_ne!(keyed.hash_hex(input), other.hash_hex(input));
479    }
480
481    #[test]
482    fn keyed_context_drives_directory_checksum_and_snapshot_id() {
483        // Keyed hashing slots into the merkle rule / snapshot id like any other
484        // Hasher; core never reads the env var — the context is a parameter.
485        let context = "interop matrix context";
486        let keyed = Blake3KeyedHasher::new(context);
487
488        let dir = directory_checksum(["b", "a"], &keyed);
489        assert_eq!(dir, keyed.hash_hex(b"ab"));
490
491        let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
492        let id = snapshot_id(&manifest, &keyed);
493        let unkeyed_id = snapshot_id(&manifest, &Blake3Hasher::new());
494        assert_ne!(id, unkeyed_id);
495    }
496
497    #[test]
498    fn snapshot_id_differs_from_root_directory_checksum() {
499        // The keystone distinction: the snapshot id hashes the whole manifest
500        // text; it is NOT the root directory checksum for the same tree.
501        let hasher = Blake3Hasher::new();
502        let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
503        let id = snapshot_id(&manifest, &hasher);
504
505        let root_children: Vec<&str> = manifest
506            .entries()
507            .iter()
508            .filter(|e| e.path != "./")
509            .map(|e| e.checksum.as_str())
510            .collect();
511        let root_dir_checksum = directory_checksum(root_children, &hasher);
512
513        assert_eq!(root_dir_checksum, TWO_EMPTY_FILES_ROOT_ID);
514        assert_ne!(id, root_dir_checksum);
515        assert_eq!(id, EMPTY_FILES_SNAPSHOT_ID);
516    }
517}