snapdir_core/merkle.rs
1//! Directory checksum (merkle) computation over manifest entries.
2//!
3//! snapdir derives a directory's checksum from the checksums of its **direct
4//! children** — not from the directory's own bytes. The oracle
5//! (`snapdir-manifest`) computes it with:
6//!
7//! ```sh
8//! dir_checksums="$(echo "$dir_manifest" | cut -d' ' -f3 | sort -u | tr -d '\n')"
9//! dir_checksum="$(echo -n "$dir_checksums" | _snapdir_manifest_checksum)"
10//! ```
11//!
12//! that is: take the **CHECKSUM field** (column 3) of each direct child entry,
13//! **sort** them lexicographically, **dedup** (`sort -u`), **concatenate with
14//! no separator**, then **re-hash** the resulting byte string with the same
15//! checksum function (BLAKE3 `--no-names` by default).
16//!
17//! The directory checksum is **not** the snapshot id. The root directory
18//! checksum is just the CHECKSUM field of the `D ./` line. The **snapshot id**
19//! is a distinct value: BLAKE3 of the full manifest text (with `#`-comment
20//! lines removed, including the trailing newline `echo` adds). See
21//! [`snapshot_id`].
22//!
23//! Edge cases, confirmed against the oracle:
24//!
25//! - An **empty directory** has no children, so the concatenation is the empty
26//! string and its checksum is `blake3("")` =
27//! `af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262`.
28//! - Identical child checksums collapse under `sort -u`: a directory holding
29//! two empty files (both `af1349b9…`) hashes the single deduped value
30//! `af1349b9…`, yielding `dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b`.
31//!
32//! Per the library-purity principle this module performs no terminal I/O and
33//! reads no environment; hashing is in-process via the [`blake3`] crate. We
34//! never shell out to `b3sum`. The [`Hasher`] trait leaves room for the
35//! `--checksum-bin` (md5/sha256) abstraction to slot in later without changing
36//! the merkle algorithm.
37
38use crate::manifest::Manifest;
39
40/// A checksum function over an in-memory byte string.
41///
42/// The merkle rule is independent of the concrete hash: it sorts, dedups and
43/// concatenates child checksum *strings* and hands the bytes to a `Hasher`.
44/// The shipped default is in-process BLAKE3 ([`Blake3Hasher`]); the
45/// `--checksum-bin` matrix (md5/sha256) can add further implementations later.
46pub trait Hasher {
47 /// Returns the lowercase hex checksum of `bytes`.
48 fn hash_hex(&self, bytes: &[u8]) -> String;
49}
50
51/// In-process BLAKE3 hasher, equivalent to the oracle's default
52/// `b3sum --no-names`.
53///
54/// This is the shipped default. It hashes the input bytes and renders the
55/// 32-byte digest as lowercase hex, matching `b3sum --no-names` exactly.
56#[derive(Debug, Clone, Copy, Default)]
57pub struct Blake3Hasher;
58
59impl Blake3Hasher {
60 /// Creates a new BLAKE3 hasher.
61 #[must_use]
62 pub const fn new() -> Self {
63 Self
64 }
65}
66
67impl Hasher for Blake3Hasher {
68 fn hash_hex(&self, bytes: &[u8]) -> String {
69 blake3::hash(bytes).to_hex().to_string()
70 }
71}
72
73/// Keyed (derived-key) BLAKE3 hasher, equivalent to the oracle's
74/// `b3sum --derive-key="<context>" --no-names`.
75///
76/// The oracle switches to keyed mode whenever the `SNAPDIR_MANIFEST_CONTEXT`
77/// environment variable is non-empty (and only for `b3sum`; see
78/// `_snapdir_manifest_define_checksum_fn`). Per the library-purity principle,
79/// `snapdir-core` does **not** read that environment variable: the CLI lane
80/// reads it and constructs this hasher with the context string passed in as a
81/// parameter.
82///
83/// The digest is `blake3::derive_key(context, input)` rendered as lowercase
84/// hex, byte-for-byte identical to `b3sum --derive-key=<context> --no-names`.
85#[derive(Debug, Clone)]
86pub struct Blake3KeyedHasher {
87 context: String,
88}
89
90impl Blake3KeyedHasher {
91 /// Creates a keyed BLAKE3 hasher deriving its key from `context`.
92 ///
93 /// `context` is the `SNAPDIR_MANIFEST_CONTEXT` value the CLI lane reads
94 /// from the environment; core never reads it itself.
95 #[must_use]
96 pub fn new(context: impl Into<String>) -> Self {
97 Self {
98 context: context.into(),
99 }
100 }
101}
102
103impl Hasher for Blake3KeyedHasher {
104 fn hash_hex(&self, bytes: &[u8]) -> String {
105 // `blake3::Hasher::new_derive_key(context)` then update+finalize is the
106 // streaming equivalent of `blake3::derive_key(context, bytes)`; both
107 // match `b3sum --derive-key=<context> --no-names`.
108 let mut hasher = blake3::Hasher::new_derive_key(&self.context);
109 hasher.update(bytes);
110 hasher.finalize().to_hex().to_string()
111 }
112}
113
114/// In-process MD5 hasher, equivalent to the oracle's
115/// `md5sum | cut -d' ' -f1`.
116///
117/// The oracle parses non-`b3sum` checksum binaries with `cut -d' ' -f1`, i.e.
118/// it keeps only the leading lowercase-hex digest and drops the filename. This
119/// reproduces that digest in-process via the pure-Rust [`md5`](md_5) crate; we
120/// never shell out to `md5sum`.
121#[derive(Debug, Clone, Copy, Default)]
122pub struct Md5Hasher;
123
124impl Md5Hasher {
125 /// Creates a new MD5 hasher.
126 #[must_use]
127 pub const fn new() -> Self {
128 Self
129 }
130}
131
132impl Hasher for Md5Hasher {
133 fn hash_hex(&self, bytes: &[u8]) -> String {
134 use md5::{Digest, Md5};
135 let digest = Md5::digest(bytes);
136 hex_lower(&digest)
137 }
138}
139
140/// In-process SHA-256 hasher, equivalent to the oracle's
141/// `sha256sum | cut -d' ' -f1`.
142///
143/// As with [`Md5Hasher`], this reproduces the leading lowercase-hex digest the
144/// oracle keeps after `cut -d' ' -f1`, in-process via the pure-Rust [`sha2`]
145/// crate; we never shell out to `sha256sum`.
146#[derive(Debug, Clone, Copy, Default)]
147pub struct Sha256Hasher;
148
149impl Sha256Hasher {
150 /// Creates a new SHA-256 hasher.
151 #[must_use]
152 pub const fn new() -> Self {
153 Self
154 }
155}
156
157impl Hasher for Sha256Hasher {
158 fn hash_hex(&self, bytes: &[u8]) -> String {
159 use sha2::{Digest, Sha256};
160 let digest = Sha256::digest(bytes);
161 hex_lower(&digest)
162 }
163}
164
165/// Renders raw digest bytes as a lowercase hex string.
166fn hex_lower(bytes: &[u8]) -> String {
167 use core::fmt::Write as _;
168 let mut out = String::with_capacity(bytes.len() * 2);
169 for byte in bytes {
170 // Infallible: writing to a String never errors.
171 let _ = write!(out, "{byte:02x}");
172 }
173 out
174}
175
176/// Computes a directory checksum from the checksums of its direct children.
177///
178/// Implements the oracle rule exactly: each child checksum string is **sorted**
179/// (lexicographic, byte-wise on the hex strings), **deduplicated** (`sort -u`),
180/// **concatenated with no separator**, and the resulting byte string is hashed
181/// with `hasher`.
182///
183/// `child_checksums` is the CHECKSUM field (column 3) of each direct child
184/// manifest line. Passing an empty iterator yields the hash of the empty
185/// string (an empty directory).
186///
187/// This is **not** the snapshot id. Computing it over the root directory's
188/// direct children yields the CHECKSUM field of the `D ./` line, not the id
189/// `snapdir id` reports. For the snapshot id, use [`snapshot_id`].
190pub fn directory_checksum<'a, I, H>(child_checksums: I, hasher: &H) -> String
191where
192 I: IntoIterator<Item = &'a str>,
193 H: Hasher,
194{
195 // `sort -u`: collect into a sorted, deduplicated set keyed on the hex
196 // string bytes (BTreeSet orders by Ord, which for &str is byte-wise — the
197 // same ordering `sort` uses in the C locale the oracle runs under).
198 let unique: std::collections::BTreeSet<&str> = child_checksums.into_iter().collect();
199
200 // `tr -d '\n'`: concatenate with no separator.
201 let mut concatenated = String::new();
202 for checksum in unique {
203 concatenated.push_str(checksum);
204 }
205
206 hasher.hash_hex(concatenated.as_bytes())
207}
208
209/// Computes the **snapshot id** of a manifest.
210///
211/// This is the value `snapdir id` reports, and the id under which a snapshot is
212/// stored. It is **distinct** from the root [`directory_checksum`]: the oracle
213/// (`snapdir` lines 259, 762, 436, 776) derives it as
214///
215/// ```sh
216/// snapshot_id="$(echo "$manifest" | grep -v '^#' | b3sum --no-names -)"
217/// ```
218///
219/// i.e. BLAKE3 over the **full manifest text** with `#`-comment lines removed.
220/// Because the oracle pipes the text through `echo`, the hashed bytes carry a
221/// single **trailing newline** after the last manifest line; reproducing the
222/// golden ids requires that newline.
223///
224/// [`Manifest`]'s [`Display`] already renders entries in `sort -k5` order with
225/// no trailing newline and excludes comments on parse, so this renders the
226/// manifest, appends the `echo` newline, and hashes the result with `hasher`.
227///
228/// [`Display`]: std::fmt::Display
229#[must_use]
230pub fn snapshot_id<H>(manifest: &Manifest, hasher: &H) -> String
231where
232 H: Hasher,
233{
234 let mut text = manifest.to_string();
235 // The oracle's `echo "$manifest"` appends a single trailing newline before
236 // the bytes reach `b3sum`. The golden ids only reproduce with it.
237 text.push('\n');
238 hasher.hash_hex(text.as_bytes())
239}
240
241#[cfg(test)]
242mod tests {
243 use super::*;
244
245 /// `blake3("")` — the empty-input / empty-directory checksum, as emitted by
246 /// `snapdir-manifest` for a truly empty directory (`D 700 af1349b9… 0 ./`).
247 const EMPTY_BLAKE3: &str = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";
248
249 /// The guide fixture root id: a directory containing two empty files, both
250 /// `af1349b9…`. After `sort -u` they collapse to one, so the directory
251 /// checksum is `blake3("af1349b9…")`.
252 /// (`utils/qa-fixtures/expected-guide-commands.txt` line 4.)
253 const TWO_EMPTY_FILES_ROOT_ID: &str =
254 "dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b";
255
256 #[test]
257 fn blake3_hasher_matches_b3sum_no_names_for_empty_input() {
258 let hasher = Blake3Hasher::new();
259 assert_eq!(hasher.hash_hex(b""), EMPTY_BLAKE3);
260 }
261
262 #[test]
263 fn empty_directory_checksum_is_hash_of_empty_string() {
264 // No children -> concatenation is "" -> blake3("").
265 let hasher = Blake3Hasher::new();
266 let no_children: [&str; 0] = [];
267 assert_eq!(directory_checksum(no_children, &hasher), EMPTY_BLAKE3);
268 }
269
270 #[test]
271 fn directory_checksum_matches_guide_fixture_empty_files_root() {
272 // The guide root dir holds two empty files (foo.txt, bar.txt), both
273 // hashing to af1349b9…. `sort -u` dedups to a single value, and the
274 // directory checksum is blake3 of that single value. (This is the
275 // `D ./` CHECKSUM field, not the snapshot id.)
276 let hasher = Blake3Hasher::new();
277 let children = [EMPTY_BLAKE3, EMPTY_BLAKE3];
278 assert_eq!(
279 directory_checksum(children, &hasher),
280 TWO_EMPTY_FILES_ROOT_ID
281 );
282 }
283
284 #[test]
285 fn directory_checksum_sorts_dedups_and_concatenates_in_order() {
286 // Synthetic multi-child case verifying the exact sort+dedup+concat
287 // pipeline independent of the hash: feed unsorted, duplicated child
288 // checksums and confirm the hashed input equals the sorted-unique
289 // concatenation.
290 let hasher = Blake3Hasher::new();
291
292 // Deliberately out of order, with a duplicate of "bbb".
293 let children = ["ccc", "aaa", "bbb", "bbb"];
294 let got = directory_checksum(children, &hasher);
295
296 // Expected: sort -> [aaa, bbb, ccc], dedup (no-op here beyond the dup),
297 // concat -> "aaabbbccc", then blake3 of those bytes.
298 let expected = blake3::hash(b"aaabbbccc").to_hex().to_string();
299 assert_eq!(got, expected);
300 }
301
302 #[test]
303 fn directory_checksum_dedup_collapses_identical_children() {
304 // All children identical -> a single value remains after `sort -u`.
305 let hasher = Blake3Hasher::new();
306 let children = ["zz", "zz", "zz"];
307 let got = directory_checksum(children, &hasher);
308 let expected = blake3::hash(b"zz").to_hex().to_string();
309 assert_eq!(got, expected);
310 }
311
312 #[test]
313 fn directory_checksum_is_order_independent_of_input_ordering() {
314 // The merkle rule sorts, so input ordering must not affect the result.
315 let hasher = Blake3Hasher::new();
316 let forward = directory_checksum(["a1", "b2", "c3"], &hasher);
317 let reverse = directory_checksum(["c3", "b2", "a1"], &hasher);
318 assert_eq!(forward, reverse);
319 }
320
321 #[test]
322 fn directory_checksum_is_the_root_d_line_field() {
323 // The root directory checksum is the CHECKSUM field of the `D ./` line
324 // — sort -u + concat + re-hash of the root's direct children. It is NOT
325 // the snapshot id (see the snapshot_id_* tests for that distinction).
326 let hasher = Blake3Hasher::new();
327 let root_children = [EMPTY_BLAKE3, EMPTY_BLAKE3];
328 let root_d_line_checksum = directory_checksum(root_children, &hasher);
329 assert_eq!(root_d_line_checksum, TWO_EMPTY_FILES_ROOT_ID);
330 }
331
332 /// The guide's empty-files manifest (two duplicate empty files), copied
333 /// verbatim from `utils/qa-fixtures/expected-guide-commands.txt`.
334 const EMPTY_FILES_MANIFEST: &str = "\
335D 700 dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b 0 ./
336F 600 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./bar.txt
337F 600 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./foo.txt";
338
339 /// The guide's modified manifest (after `echo "foo" > foo.txt`).
340 const MODIFIED_MANIFEST: &str = "\
341D 700 4a0732cfb45ebe9d8d572fc4c77b759384bed029911e35f8859430b889427d4d 4 ./
342F 600 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./bar.txt
343F 600 49dc870df1de7fd60794cebce449f5ccdae575affaa67a24b62acb03e039db92 4 ./foo.txt";
344
345 /// Golden snapshot ids (`snapdir id`) == BLAKE3 of the whole manifest text
346 /// with `#`-comment lines removed, trailing `echo` newline included.
347 const EMPTY_FILES_SNAPSHOT_ID: &str =
348 "c678a299380893769bd7795628b96147229b410a9d5a5b7cae563bcae3c27857";
349 const MODIFIED_SNAPSHOT_ID: &str =
350 "8af03a1bec09b1838d2c4f56c6940ed35ccdad1064243d2d775e8347ba82b9be";
351
352 #[test]
353 fn snapshot_id_reproduces_empty_files_golden_id() {
354 let hasher = Blake3Hasher::new();
355 let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
356 assert_eq!(snapshot_id(&manifest, &hasher), EMPTY_FILES_SNAPSHOT_ID);
357 }
358
359 #[test]
360 fn snapshot_id_reproduces_modified_golden_id() {
361 let hasher = Blake3Hasher::new();
362 let manifest = Manifest::parse(MODIFIED_MANIFEST).expect("parses");
363 assert_eq!(snapshot_id(&manifest, &hasher), MODIFIED_SNAPSHOT_ID);
364 }
365
366 #[test]
367 fn snapshot_id_requires_the_trailing_newline() {
368 // Guard the exact byte handling: the oracle's `echo` appends a trailing
369 // newline, so hashing the manifest text WITHOUT it must NOT match the
370 // golden id (and snapshot_id, which adds it, must).
371 let hasher = Blake3Hasher::new();
372 let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
373 let without_newline = hasher.hash_hex(manifest.to_string().as_bytes());
374 assert_ne!(without_newline, EMPTY_FILES_SNAPSHOT_ID);
375 assert_eq!(snapshot_id(&manifest, &hasher), EMPTY_FILES_SNAPSHOT_ID);
376 }
377
378 #[test]
379 fn snapshot_id_ignores_comment_lines() {
380 // `#`-comment lines are stripped on parse, so a manifest with comments
381 // yields the same snapshot id as one without.
382 let hasher = Blake3Hasher::new();
383 let with_comments = format!("# generated by snapdir\n{EMPTY_FILES_MANIFEST}\n# eof");
384 let manifest = Manifest::parse(&with_comments).expect("parses");
385 assert_eq!(snapshot_id(&manifest, &hasher), EMPTY_FILES_SNAPSHOT_ID);
386 }
387
388 // --- --checksum-bin abstraction: md5sum / sha256sum -------------------
389
390 #[test]
391 fn golden_md5_known_vectors() {
392 // Standard MD5 test vectors, lowercase hex (what the oracle keeps after
393 // `md5sum | cut -d' ' -f1`).
394 let hasher = Md5Hasher::new();
395 assert_eq!(hasher.hash_hex(b""), "d41d8cd98f00b204e9800998ecf8427e");
396 // md5("abc")
397 assert_eq!(hasher.hash_hex(b"abc"), "900150983cd24fb0d6963f7d28e17f72");
398 // md5("foo\n") — matches `printf 'foo\n' | md5sum` (the guide's foo.txt).
399 assert_eq!(
400 hasher.hash_hex(b"foo\n"),
401 "d3b07384d113edec49eaa6238ad5ff00"
402 );
403 }
404
405 #[test]
406 fn golden_md5_works_with_directory_checksum_and_snapshot_id() {
407 // The merkle rule and snapshot id are hash-agnostic: they must run
408 // unchanged with the MD5 hasher.
409 let hasher = Md5Hasher::new();
410 // directory_checksum = md5(sorted-unique-concat of children).
411 let dir = directory_checksum(["ccc", "aaa", "bbb", "bbb"], &hasher);
412 assert_eq!(dir, hasher.hash_hex(b"aaabbbccc"));
413
414 // snapshot_id = md5(manifest text + trailing newline).
415 let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
416 let text = format!("{manifest}\n");
417 assert_eq!(
418 snapshot_id(&manifest, &hasher),
419 hasher.hash_hex(text.as_bytes())
420 );
421 }
422
423 #[test]
424 fn golden_sha256_known_vectors() {
425 // Standard SHA-256 test vectors, lowercase hex (what the oracle keeps
426 // after `sha256sum | cut -d' ' -f1`).
427 let hasher = Sha256Hasher::new();
428 assert_eq!(
429 hasher.hash_hex(b""),
430 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
431 );
432 // sha256("abc")
433 assert_eq!(
434 hasher.hash_hex(b"abc"),
435 "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
436 );
437 // sha256("foo\n") — matches `printf 'foo\n' | sha256sum`.
438 assert_eq!(
439 hasher.hash_hex(b"foo\n"),
440 "b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c"
441 );
442 }
443
444 #[test]
445 fn golden_sha256_works_with_directory_checksum_and_snapshot_id() {
446 let hasher = Sha256Hasher::new();
447 let dir = directory_checksum(["ccc", "aaa", "bbb", "bbb"], &hasher);
448 assert_eq!(dir, hasher.hash_hex(b"aaabbbccc"));
449
450 let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
451 let text = format!("{manifest}\n");
452 assert_eq!(
453 snapshot_id(&manifest, &hasher),
454 hasher.hash_hex(text.as_bytes())
455 );
456 }
457
458 // --- keyed mode (SNAPDIR_MANIFEST_CONTEXT) ----------------------------
459
460 #[test]
461 fn keyed_context_matches_blake3_derive_key_and_differs_from_unkeyed() {
462 // The oracle's keyed mode is `b3sum --derive-key=<ctx> --no-names`,
463 // which is exactly `blake3::derive_key(ctx, input)`.
464 let context = "snapdir 2026 test context";
465 let input = b"the quick brown fox";
466
467 let keyed = Blake3KeyedHasher::new(context);
468 let expected = blake3::derive_key(context, input);
469 let expected_hex = blake3::Hash::from(expected).to_hex().to_string();
470 assert_eq!(keyed.hash_hex(input), expected_hex);
471
472 // Keyed digest must differ from the unkeyed default.
473 let unkeyed = Blake3Hasher::new();
474 assert_ne!(keyed.hash_hex(input), unkeyed.hash_hex(input));
475
476 // Different contexts produce different digests for the same input.
477 let other = Blake3KeyedHasher::new("a different context");
478 assert_ne!(keyed.hash_hex(input), other.hash_hex(input));
479 }
480
481 #[test]
482 fn keyed_context_drives_directory_checksum_and_snapshot_id() {
483 // Keyed hashing slots into the merkle rule / snapshot id like any other
484 // Hasher; core never reads the env var — the context is a parameter.
485 let context = "interop matrix context";
486 let keyed = Blake3KeyedHasher::new(context);
487
488 let dir = directory_checksum(["b", "a"], &keyed);
489 assert_eq!(dir, keyed.hash_hex(b"ab"));
490
491 let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
492 let id = snapshot_id(&manifest, &keyed);
493 let unkeyed_id = snapshot_id(&manifest, &Blake3Hasher::new());
494 assert_ne!(id, unkeyed_id);
495 }
496
497 #[test]
498 fn snapshot_id_differs_from_root_directory_checksum() {
499 // The keystone distinction: the snapshot id hashes the whole manifest
500 // text; it is NOT the root directory checksum for the same tree.
501 let hasher = Blake3Hasher::new();
502 let manifest = Manifest::parse(EMPTY_FILES_MANIFEST).expect("parses");
503 let id = snapshot_id(&manifest, &hasher);
504
505 let root_children: Vec<&str> = manifest
506 .entries()
507 .iter()
508 .filter(|e| e.path != "./")
509 .map(|e| e.checksum.as_str())
510 .collect();
511 let root_dir_checksum = directory_checksum(root_children, &hasher);
512
513 assert_eq!(root_dir_checksum, TWO_EMPTY_FILES_ROOT_ID);
514 assert_ne!(id, root_dir_checksum);
515 assert_eq!(id, EMPTY_FILES_SNAPSHOT_ID);
516 }
517}