Skip to main content

snapdir_core/
walk.rs

1//! In-process filesystem walk producing a frozen-format [`Manifest`].
2//!
3//! This module reproduces the original `snapdir-manifest generate` behavior in
4//! pure Rust, consuming the frozen [`manifest`](crate::manifest),
5//! [`merkle`](crate::merkle) and [`excludes`](crate::excludes) APIs without
6//! changing any of them. It walks a directory tree and emits one
7//! [`ManifestEntry`] per file (`F`) and directory (`D`), computing per-file
8//! content checksums with a [`Hasher`] and per-directory checksums/sizes with
9//! [`directory_checksum`].
10//!
11//! ## Behaviors matched against the oracle
12//!
13//! - **Traversal** mirrors `find`/`find -L`: every directory becomes a `D`
14//!   entry (path ending `/`) and every regular file directly inside it becomes
15//!   an `F` entry. Directories are recorded even when empty.
//! - **Symlinks** are *followed by default* ([`FollowMode::Follow`], the
//!   oracle's `find -L`): a symlink to a directory is reported as a directory
//!   and descended into, a symlink to a file as a file. A followed symlink
//!   takes its type and content checksum from the target, while its
//!   PERMISSIONS (and, for files, SIZE) come from the symlink's own `lstat`,
//!   because the oracle's `stat` does not follow links.
//!   [`FollowMode::NoFollow`] (plain `find`) drops symlinks entirely — they
//!   appear as neither `D` nor `F`.
21//! - **Permissions** are the octal mode bits, matching `stat -f '%A'` (macOS)
22//!   / `stat -c '%a'` (Linux): the low 12 bits of `st_mode` rendered in octal
23//!   with no leading zero (e.g. `755`, `644`, `700`).
24//! - **File size** is the content byte length (`%z` / `%s`). **Directory size**
25//!   is the *sum of its direct members' sizes* (files and subdirectories),
26//!   excluding the directory's own `stat` size — matching the oracle's
27//!   `_snapdir_manifest_sum_lines` over the direct children.
28//! - **Excludes** are applied via [`ExcludeMatcher`] against the *absolute*
29//!   path of each candidate directory and file, mirroring the oracle's
30//!   `find … | grep -E -v "$EXCLUDE"` (the filter runs before the relative
31//!   `./` rewrite). A `%system%` expansion forces [`FollowMode::NoFollow`];
32//!   the caller resolves that via [`expand_excludes`](crate::excludes::expand_excludes).
33//! - **Paths** are absolute under [`PathMode::Absolute`], or rewritten to a
34//!   leading `./` under [`PathMode::Relative`] (the oracle's
35//!   `sed -E "s| \.?${root_dir}| .|"`). Directory paths always end with `/`.
36//! - **Ordering** is `sort -k5` (byte-wise on the path), delegated to
37//!   [`Manifest`]'s own sort.
38//!
39//! Per the library-purity principle this module reads the filesystem at the
40//! *given* root path (that is its job) but reads no `$HOME`/config/environment
41//! for behavior: the root, options, excludes and hasher all arrive as
42//! parameters, and errors surface as the typed [`WalkError`].
43
44use std::collections::BTreeMap;
45use std::io;
46use std::os::unix::fs::PermissionsExt;
47use std::path::{Path, PathBuf};
48
49use thiserror::Error;
50
51use crate::excludes::{ExcludeMatcher, FollowMode};
52use crate::manifest::{Manifest, ManifestEntry, PathType};
53use crate::merkle::Hasher;
54
/// Selects how the PATH column of each manifest entry is rendered.
///
/// This corresponds to the oracle's `--absolute` flag. Without the flag,
/// paths are rewritten relative to the walk root and prefixed with `./`
/// ([`Relative`](PathMode::Relative), the default); with it, the full
/// absolute path is kept ([`Absolute`](PathMode::Absolute)).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum PathMode {
    /// Default: paths are rewritten relative to the root with a leading `./`.
    #[default]
    Relative,
    /// `--absolute`: paths are emitted as full absolute paths.
    Absolute,
}
68
69/// Options controlling a [`walk`].
70///
71/// All inputs are parameters: this struct carries the symlink-follow setting,
72/// the relative/absolute path mode, and the optional compiled exclude matcher.
73/// The root path and [`Hasher`] are passed to [`walk`] directly.
74#[derive(Debug, Clone, Default)]
75pub struct WalkOptions {
76    /// Whether to follow symlinks ([`FollowMode::Follow`] by default).
77    pub follow: FollowMode,
78    /// Whether to emit absolute or relative (`./`) paths.
79    pub path_mode: PathMode,
80    /// An optional compiled exclude matcher. When `Some`, any directory or
81    /// file whose absolute path matches is dropped (`grep -E -v`).
82    pub exclude: Option<ExcludeMatcher>,
83}
84
85/// Errors raised while walking the filesystem.
86#[derive(Debug, Error)]
87pub enum WalkError {
88    /// The root path is not absolute. The walk needs an absolute root so it can
89    /// rewrite relative paths exactly as the oracle does (it `readlink`s the
90    /// argument to an absolute path first); the CLI lane resolves the user's
91    /// argument before calling [`walk`].
92    #[error("walk root must be an absolute path, got {0:?}")]
93    RootNotAbsolute(PathBuf),
94
95    /// The root path does not resolve to a directory.
96    #[error("walk root is not a directory: {0:?}")]
97    RootNotDirectory(PathBuf),
98
99    /// An I/O error occurred while reading the tree at `path`.
100    #[error("i/o error while walking {path:?}: {source}")]
101    Io {
102        /// The path being read when the error occurred.
103        path: PathBuf,
104        /// The underlying I/O error.
105        #[source]
106        source: io::Error,
107    },
108
109    /// A path could not be rendered as UTF-8. The frozen manifest format is
110    /// UTF-8 text; non-UTF-8 paths cannot be represented.
111    #[error("path is not valid UTF-8: {0:?}")]
112    NonUtf8Path(PathBuf),
113}
114
115impl WalkError {
116    fn io(path: impl Into<PathBuf>, source: io::Error) -> Self {
117        WalkError::Io {
118            path: path.into(),
119            source,
120        }
121    }
122}
123
/// Formats a file mode as the octal permission string used in the manifest.
///
/// Only the low 12 bits of `mode` are kept (permission bits plus
/// setuid/setgid/sticky) and they are printed in octal without a leading
/// zero, e.g. `755`, `644`, `4755`. This matches `stat -f '%A'` (macOS) and
/// `stat -c '%a'` (Linux).
fn octal_permissions(mode: u32) -> String {
    let permission_bits = mode & 0o7777;
    format!("{permission_bits:o}")
}
130
131/// Returns a path as `&str`, or a [`WalkError::NonUtf8Path`].
132fn path_str(path: &Path) -> Result<&str, WalkError> {
133    path.to_str()
134        .ok_or_else(|| WalkError::NonUtf8Path(path.to_path_buf()))
135}
136
/// One regular file found during discovery, stored with its absolute path
/// (the relative/absolute rewrite happens later, at emit time).
struct FileRecord {
    /// Absolute path of the file.
    abs_path: String,
    /// Octal permission string for the PERMISSIONS column.
    permissions: String,
    /// Hex content checksum for the CHECKSUM column.
    checksum: String,
    /// Size in bytes for the SIZE column.
    size: u64,
}
145
146/// A discovered directory, holding its absolute path and (filled during the
147/// post-order pass) its computed checksum and member-size total.
148struct DirRecord {
149    /// Absolute path of the directory (no trailing slash, except root `/`).
150    abs_path: String,
151    permissions: String,
152    /// Absolute paths of direct child directories, in discovery order.
153    child_dirs: Vec<String>,
154    /// Direct child files.
155    files: Vec<FileRecord>,
156}
157
158/// Walks the directory tree rooted at `root`, producing a [`Manifest`] that
159/// matches the original `snapdir-manifest` output byte-for-byte for the same
160/// tree and checksum function.
161///
162/// `root` must be an **absolute** path to a directory (the CLI lane resolves
163/// the user's argument first, mirroring the oracle's `readlink`). `hasher`
164/// supplies the content/merkle checksum function (BLAKE3 by default; the
165/// `--checksum-bin` matrix swaps in [`Md5Hasher`](crate::merkle::Md5Hasher) /
166/// [`Sha256Hasher`](crate::merkle::Sha256Hasher) / keyed BLAKE3). `options`
167/// carries the follow mode, path mode and optional exclude matcher.
168///
169/// # Errors
170///
171/// Returns [`WalkError`] if `root` is not absolute, is not a directory, holds a
172/// non-UTF-8 path, or if an I/O error occurs while reading the tree.
173pub fn walk<H: Hasher>(
174    root: &Path,
175    options: &WalkOptions,
176    hasher: &H,
177) -> Result<Manifest, WalkError> {
178    if !root.is_absolute() {
179        return Err(WalkError::RootNotAbsolute(root.to_path_buf()));
180    }
181
182    // Resolve the root's metadata following symlinks (the oracle always works
183    // on the resolved root directory).
184    let root_meta = std::fs::metadata(root).map_err(|e| WalkError::io(root, e))?;
185    if !root_meta.is_dir() {
186        return Err(WalkError::RootNotDirectory(root.to_path_buf()));
187    }
188    // The oracle's `stat -f '%A'` / `stat -c '%a'` does NOT follow symlinks, so
189    // a directory's PERMISSIONS column always comes from its own `lstat`. For
190    // the root we `lstat` it directly (it is normally a real directory; if it
191    // is itself a symlink the user passed, its own perms still apply).
192    let root_lstat = std::fs::symlink_metadata(root).map_err(|e| WalkError::io(root, e))?;
193    let root_permissions = octal_permissions(root_lstat.permissions().mode());
194
195    let root_str = path_str(root)?.to_owned();
196
197    // Discover every directory (depth-first, following symlinks per `follow`),
198    // recording its direct files and direct child directories. We collect into
199    // an ordered map keyed by absolute path so the post-order pass can compute
200    // directory checksums bottom-up.
201    let mut dirs: BTreeMap<String, DirRecord> = BTreeMap::new();
202    discover_dir(
203        root,
204        &root_str,
205        root_permissions,
206        options,
207        hasher,
208        &mut dirs,
209    )?;
210
211    // Compute each directory's checksum + member-size bottom-up. `dirs` is keyed
212    // by path in a BTreeMap (lexicographic), so a child path always sorts after
213    // its parent prefix; processing in reverse key order guarantees children are
214    // finalized before their parents. We memoize finalized (checksum, size).
215    let keys: Vec<String> = dirs.keys().cloned().collect();
216    let mut finalized: BTreeMap<String, (String, u64)> = BTreeMap::new();
217    for key in keys.iter().rev() {
218        let record = &dirs[key];
219
220        // Direct children's checksums (files + subdirs) for the merkle rule,
221        // and their sizes for the member-size sum.
222        let mut child_checksums: Vec<String> = Vec::new();
223        let mut member_size: u64 = 0;
224        for file in &record.files {
225            child_checksums.push(file.checksum.clone());
226            member_size += file.size;
227        }
228        for child in &record.child_dirs {
229            let (csum, size) = finalized
230                .get(child)
231                .expect("child dir finalized before parent (reverse key order)");
232            child_checksums.push(csum.clone());
233            member_size += size;
234        }
235
236        let checksum =
237            crate::merkle::directory_checksum(child_checksums.iter().map(String::as_str), hasher);
238        finalized.insert(key.clone(), (checksum, member_size));
239    }
240
241    // Emit manifest entries. Files first, then their directory, in any order —
242    // the Manifest sorts by path (`sort -k5`) on Display.
243    let mut manifest = Manifest::new();
244    for (key, record) in &dirs {
245        let (checksum, size) = &finalized[key];
246        let dir_path = render_dir_path(key, &root_str, options.path_mode);
247        manifest.push(ManifestEntry::new(
248            PathType::Directory,
249            record.permissions.clone(),
250            checksum.clone(),
251            *size,
252            dir_path,
253        ));
254        for file in &record.files {
255            let file_path = rewrite_path(&file.abs_path, &root_str, options.path_mode);
256            manifest.push(ManifestEntry::new(
257                PathType::File,
258                file.permissions.clone(),
259                file.checksum.clone(),
260                file.size,
261                file_path,
262            ));
263        }
264    }
265    manifest.sort();
266    Ok(manifest)
267}
268
269/// Recursively discovers the directory at `abs_path` (already known to be a
270/// directory), recording its direct files and child directories, then recurses
271/// into each child directory.
272fn discover_dir<H: Hasher>(
273    dir: &Path,
274    abs_path: &str,
275    permissions: String,
276    options: &WalkOptions,
277    hasher: &H,
278    dirs: &mut BTreeMap<String, DirRecord>,
279) -> Result<(), WalkError> {
280    // `permissions` is the directory's own `lstat` octal mode (a symlinked
281    // directory keeps the symlink's perms, matching the oracle's non-following
282    // `stat -f '%A'` / `stat -c '%a'`).
283    let mut record = DirRecord {
284        abs_path: abs_path.to_owned(),
285        permissions,
286        child_dirs: Vec::new(),
287        files: Vec::new(),
288    };
289
290    let read_dir = std::fs::read_dir(dir).map_err(|e| WalkError::io(dir, e))?;
291    for entry in read_dir {
292        let entry = entry.map_err(|e| WalkError::io(dir, e))?;
293        let entry_path = entry.path();
294        let entry_abs = path_str(&entry_path)?.to_owned();
295
296        // Excludes run on the absolute path (`grep -E -v` over `find` output),
297        // before any relative rewrite. A matching path is dropped for both the
298        // directory listing and the file listing.
299        if let Some(matcher) = &options.exclude {
300            if matcher.is_excluded(&entry_abs) {
301                continue;
302            }
303        }
304
305        // `symlink_metadata` does not traverse the final symlink, so we can
306        // detect symlinks and honor the follow mode like plain `find` vs
307        // `find -L`.
308        let link_meta = entry
309            .metadata()
310            .or_else(|_| std::fs::symlink_metadata(&entry_path))
311            .map_err(|e| WalkError::io(&entry_path, e))?;
312        let is_symlink = link_meta.file_type().is_symlink();
313
314        if is_symlink && !options.follow.follows_symlinks() {
315            // Plain `find` lists a symlink as type `l`; it is neither a `-type d`
316            // nor a `-type f`, so it never enters the manifest under no-follow.
317            continue;
318        }
319
320        // Resolve the (possibly symlinked) target's metadata. Following symlinks
321        // (`find -L`) makes a symlink-to-dir a directory and a symlink-to-file a
322        // file, inheriting the target's type/perms/size/checksum.
323        let target_meta = match std::fs::metadata(&entry_path) {
324            Ok(m) => m,
325            Err(e) => {
326                // A broken symlink (or a symlink loop on some platforms) cannot
327                // be stat'd through. `find -L` likewise cannot classify it as a
328                // file or directory, so it is omitted. Surface real I/O errors
329                // on non-symlink entries.
330                if is_symlink && (e.kind() == io::ErrorKind::NotFound || is_loop_error(&e)) {
331                    continue;
332                }
333                return Err(WalkError::io(&entry_path, e));
334            }
335        };
336        let file_type = target_meta.file_type();
337
338        // PERMISSIONS (and, for files, SIZE) come from the entry's own `lstat`,
339        // because the oracle's `stat` is non-following: a symlinked entry keeps
340        // the symlink's perms/size while its CHECKSUM is read through the link
341        // (b3sum/md5sum/sha256sum all follow symlinks). For a real (non-symlink)
342        // entry `lstat` == `stat`, so this is identical there.
343        let own_permissions = octal_permissions(link_meta.permissions().mode());
344
345        if file_type.is_dir() {
346            record.child_dirs.push(entry_abs.clone());
347            discover_dir(
348                &entry_path,
349                &entry_abs,
350                own_permissions,
351                options,
352                hasher,
353                dirs,
354            )?;
355        } else if file_type.is_file() {
356            // Read content through the link for the checksum; take SIZE from the
357            // entry's own `lstat` (for a symlink that is the target-path length,
358            // matching the oracle's `%z` / `%s` on the un-dereferenced symlink).
359            let bytes = std::fs::read(&entry_path).map_err(|e| WalkError::io(&entry_path, e))?;
360            let checksum = hasher.hash_hex(&bytes);
361            record.files.push(FileRecord {
362                abs_path: entry_abs,
363                permissions: own_permissions,
364                checksum,
365                size: link_meta.len(),
366            });
367        }
368        // Anything else (sockets, fifos, devices) is neither `-type d` nor
369        // `-type f`, so it is skipped — matching `find`.
370    }
371
372    dirs.insert(record.abs_path.clone(), record);
373    Ok(())
374}
375
376/// Detects a symlink-loop I/O error (`ELOOP`) so the walk can skip it the way
377/// `find -L` halts on / omits a self-referential symlink.
378fn is_loop_error(error: &io::Error) -> bool {
379    error.raw_os_error() == Some(libc_eloop())
380}
381
/// Returns the raw `ELOOP` errno for the compilation target.
///
/// `ELOOP` is 40 on Linux and 62 on macOS/BSD. The value is hard-coded per
/// target so the crate does not need a `libc` dependency. Android is grouped
/// with Linux because it is a Linux kernel but has its own `target_os`
/// value; without that it would wrongly get the BSD number.
///
/// NOTE(review): every other target falls back to the BSD value, and some
/// Linux architectures may use a different errno table — confirm before
/// relying on loop detection on those targets.
const fn libc_eloop() -> i32 {
    if cfg!(any(target_os = "linux", target_os = "android")) {
        40
    } else {
        62
    }
}
394
395/// Renders a directory's path for the manifest: always trailing-`/`, and either
396/// absolute or rewritten to a leading `./` relative to `root`.
397fn render_dir_path(abs_path: &str, root: &str, mode: PathMode) -> String {
398    let rewritten = rewrite_path(abs_path, root, mode);
399    // Directory paths always end with `/`. The root rewrites to "." -> "./";
400    // a nested dir "./a" -> "./a/". Absolute "/abs/a" -> "/abs/a/".
401    if rewritten.ends_with('/') {
402        rewritten
403    } else {
404        format!("{rewritten}/")
405    }
406}
407
408/// Applies the oracle's relative rewrite `sed -E "s| \.?${root_dir}| .|"`:
409/// the leading `root` prefix of an absolute path becomes `.`. In absolute mode
410/// the path is returned unchanged.
411fn rewrite_path(abs_path: &str, root: &str, mode: PathMode) -> String {
412    match mode {
413        PathMode::Absolute => abs_path.to_owned(),
414        PathMode::Relative => {
415            if abs_path == root {
416                // The root directory itself becomes ".".
417                ".".to_owned()
418            } else if let Some(rest) = abs_path.strip_prefix(root) {
419                // rest starts with '/': "/a/aa/f1" -> "./a/aa/f1".
420                format!(".{rest}")
421            } else {
422                // Defensive: not under root (should not happen). Leave as-is.
423                abs_path.to_owned()
424            }
425        }
426    }
427}
428
429#[cfg(test)]
430mod tests {
431    //! Pure-Rust walk tests.
432    //!
433    //! Originally these shelled out to the legacy Bash oracle
434    //! (the `snapdir-manifest` script) and asserted byte-identity. The oracle
435    //! has since been deleted from the branch, so each case is now pinned
436    //! against an
437    //! **embedded golden manifest constant** (or, where a column is
438    //! platform-dependent, a structural assertion). The golden bytes were
439    //! captured once from this very `walk` implementation over fixtures with
440    //! **explicit, fixed permissions** (dirs `0o700`/`0o755`, files `0o600`),
441    //! which makes the `TYPE PERMS CHECKSUM SIZE PATH` output fully
442    //! deterministic. The content/size/checksum/merkle columns were
443    //! cross-checked against the recorded oracle vectors in
444    //! `crates/snapdir-core/tests/compat_golden.rs` (e.g. the empty-file
445    //! `af1349b9…` checksum and the `./a/aa/aaa/` merkle `8aed4caf…`).
446    //!
447    //! Symlink rows (`./a_link/`, `./r1f_link`) carry the symlink's *own* lstat
448    //! permissions, which differ across platforms (macOS reports `755`, Linux
449    //! `777`), so those tests assert structure (presence/absence + materialized
450    //! subtree) rather than a byte-exact perm column.
451    use super::*;
452    use crate::merkle::Blake3Hasher;
453    use std::fs;
454    use std::os::unix::fs::PermissionsExt;
455    use std::path::PathBuf;
456    use std::sync::atomic::{AtomicU64, Ordering};
457
458    /// A self-cleaning scratch directory under the system temp dir. Avoids a
459    /// `tempfile` dev-dependency; the walk is library-pure and never reads the
460    /// environment itself — only this test harness builds fixtures on disk. The
461    /// root is chmod'd to a fixed `0o755` so the root `D` line's perm column is
462    /// deterministic across umasks.
463    struct Scratch {
464        path: PathBuf,
465    }
466
467    impl Scratch {
468        fn new(tag: &str) -> Self {
469            static COUNTER: AtomicU64 = AtomicU64::new(0);
470            let n = COUNTER.fetch_add(1, Ordering::Relaxed);
471            let pid = std::process::id();
472            // Resolve through canonicalize so macOS's /var -> /private/var (and
473            // any other symlinked temp prefix) is already normalized.
474            let base = std::env::temp_dir()
475                .canonicalize()
476                .expect("temp dir canonicalizes");
477            let path = base.join(format!("snapdir-walk-test-{tag}-{pid}-{n}"));
478            let _ = fs::remove_dir_all(&path);
479            fs::create_dir_all(&path).expect("create scratch dir");
480            fs::set_permissions(&path, fs::Permissions::from_mode(0o755))
481                .expect("chmod scratch root");
482            Scratch { path }
483        }
484
485        fn root(&self) -> &Path {
486            &self.path
487        }
488    }
489
490    impl Drop for Scratch {
491        fn drop(&mut self) {
492            let _ = fs::remove_dir_all(&self.path);
493        }
494    }
495
496    /// Writes a file (creating parents) with a fixed `0o600` mode so the `F`
497    /// line's perm column is deterministic.
498    fn write_file(path: &Path, contents: &[u8]) {
499        if let Some(parent) = path.parent() {
500            fs::create_dir_all(parent).expect("create parent dir");
501        }
502        fs::write(path, contents).expect("write file");
503        fs::set_permissions(path, fs::Permissions::from_mode(0o600)).expect("chmod file");
504    }
505
506    /// Recursively chmods `root` and every descendant directory to `mode`, so
507    /// every `D` line's perm column is pinned (independent of the process umask).
508    fn chmod_dirs(root: &Path, mode: u32) {
509        fs::set_permissions(root, fs::Permissions::from_mode(mode)).expect("chmod dir");
510        for entry in fs::read_dir(root).expect("read_dir").flatten() {
511            let ft = entry.file_type().expect("file_type");
512            // `is_dir()` here is lstat-based via DirEntry::file_type, so a
513            // symlink-to-dir is NOT recursed into (its own perms stay as-is).
514            if ft.is_dir() {
515                chmod_dirs(&entry.path(), mode);
516            }
517        }
518    }
519
520    /// Builds a [`WalkOptions`] for the given follow/path/exclude combination.
521    fn opts(follow: FollowMode, path_mode: PathMode, exclude: Option<&str>) -> WalkOptions {
522        WalkOptions {
523            follow,
524            path_mode,
525            exclude: exclude.map(|p| ExcludeMatcher::new(p).expect("valid exclude regex")),
526        }
527    }
528
529    /// Runs the walk and returns its `Display` manifest text (no trailing
530    /// newline — `Manifest`'s `Display` does not emit one).
531    fn manifest_text(root: &Path, options: &WalkOptions) -> String {
532        walk(root, options, &Blake3Hasher::new())
533            .expect("walk")
534            .to_string()
535    }
536
537    // -- Empty-string / empty-file checksum reused from the oracle vectors -----
538    // (matches compat_golden.rs::EMPTY_FILE_B3).
539    const EMPTY_B3: &str = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";
540
541    #[test]
542    fn walk_root_must_be_absolute() {
543        let err = walk(
544            Path::new("relative/path"),
545            &WalkOptions::default(),
546            &Blake3Hasher::new(),
547        )
548        .unwrap_err();
549        assert!(matches!(err, WalkError::RootNotAbsolute(_)));
550    }
551
552    #[test]
553    fn walk_empty_directory_golden() {
554        // An empty directory: a single `D` line whose checksum is the merkle of
555        // zero children == blake3("") and whose size is 0. Root chmod'd to 755.
556        let scratch = Scratch::new("empty-dir");
557        let expected = format!("D 755 {EMPTY_B3} 0 ./");
558        assert_eq!(
559            manifest_text(scratch.root(), &WalkOptions::default()),
560            expected
561        );
562    }
563
564    #[test]
565    fn walk_single_empty_file_golden() {
566        // Root `D` line (its merkle == single empty-file child) plus the `F`
567        // line for the empty file. Both content checksums are blake3("").
568        let scratch = Scratch::new("empty-file");
569        write_file(&scratch.root().join("empty.txt"), b"");
570        let expected = format!(
571            "D 755 dba5865c0d91b17958e4d2cac98c338f85cbbda07b71a020ab16c391b5e7af4b 0 ./\n\
572             F 600 {EMPTY_B3} 0 ./empty.txt"
573        );
574        assert_eq!(
575            manifest_text(scratch.root(), &WalkOptions::default()),
576            expected
577        );
578    }
579
580    /// The deep guide tree under [`PathMode::Relative`]. Dirs are `0o700`, files
581    /// `0o600`; every checksum/merkle value matches the recorded oracle vectors
582    /// (cf. `compat_golden.rs::MULTILEVEL_MANIFEST` — same `./a/…`/`./b/…`/`./c/…`
583    /// subtree). The extra empty `./d/` dir carries the blake3("") merkle.
584    const NESTED_RELATIVE_GOLDEN: &str = "\
585D 700 3f938f681dcbd616d00d42f704d525c05e7ed2746888c35c8214127c632587c3 43 ./
586D 700 ed23cfd2037d23cf8c6b67497425e7a06d5e40ea2bd8e43fc434006022dafe86 21 ./a/
587F 600 3c9cb8b8c8f3588f8e59e18d284330b0a951be644fbef2b9784b56e15d1c6096 4 ./a/a1f
588D 700 ee795476bff6c1816b4c7558a74ee0b44ec600c3cde6b02564508f67d536a656 17 ./a/aa/
589F 600 a2951028421deef48d1ba185f4c497c2d986f1dd76079baf2f5eb8479f132b5a 5 ./a/aa/aa1f
590D 700 8aed4caf45b22aa4c8a195945136e3a01f77864e91fabe2d9272feeee87ae334 12 ./a/aa/aaa/
591F 600 5cfee4fb4074748633b4ccbddb6b184a9b5e2f5ce74df6d2803f5fea0392a197 6 ./a/aa/aaa/aaa1f
592F 600 3791f11a017feedffd24c2656e18d5c4ca9d6c404c8f40ccc511b6351c8575a6 6 ./a/aa/aaa/aaa2f
593D 700 9a8b0e35c000df69893648b91d15cc30ab88ae5a40af48228caf5fa443dafc9b 12 ./b/
594D 700 d41c2090167e6f546a510f0da98d8a8355d6bd2b61666644604c73b3a8f5b5d9 12 ./b/bb/
595D 700 3b9023fa454aa22466feeb8cbf55a2c764dd79de0e93c9a793e8b54caec227da 12 ./b/bb/bbb/
596F 600 8d18b7f3aabbef192a524fa2549d1d36b48c9030d234c9bdf87caa267fb09933 6 ./b/bb/bbb/bbb1f
597F 600 2e16e172b6e337325f271d4eae00bc1ea20e41609ef78665710cada1477005cc 6 ./b/bb/bbb/bbb2f
598D 700 15eb2657c1e6f5a24023c10429bb6f1b7d81b2cc2057eedee2192fbf3e7b892c 6 ./c/
599D 700 e711f4e76ae9b3e25ad9a32b5f115cc9a81e55a428c552aa0bcab8543967f51a 6 ./c/cc/
600D 700 31a1955d5a65328f31014650cf79b5c0c3d9b82de19352ade8d299cc22f6ec40 6 ./c/cc/ccc/
601F 600 24f0cf3553e0dac0ce8aead4279e0fc368899e89ef776999d0d7e812b5ca0f3b 6 ./c/cc/ccc/ccc1f
602D 700 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 0 ./d/
603F 600 27a55588c59999fd686667c4b186af08161b95c287216f0cde723f0e191d1974 4 ./r1f";
604
605    fn build_nested(root: &Path) {
606        write_file(&root.join("a/aa/aaa/aaa1f"), b"aaa1f\n");
607        write_file(&root.join("a/aa/aaa/aaa2f"), b"aaa2f\n");
608        write_file(&root.join("a/aa/aa1f"), b"aa1f\n");
609        write_file(&root.join("a/a1f"), b"a1f\n");
610        write_file(&root.join("r1f"), b"r1f\n");
611        write_file(&root.join("b/bb/bbb/bbb1f"), b"bbb1f\n");
612        write_file(&root.join("b/bb/bbb/bbb2f"), b"bbb2f\n");
613        write_file(&root.join("c/cc/ccc/ccc1f"), b"ccc1f\n");
614        // Empty subdirectory with no files.
615        fs::create_dir_all(root.join("d")).unwrap();
616        chmod_dirs(root, 0o700);
617    }
618
619    #[test]
620    fn walk_nested_tree_relative_golden() {
621        let scratch = Scratch::new("nested-rel");
622        build_nested(scratch.root());
623        assert_eq!(
624            manifest_text(
625                scratch.root(),
626                &opts(FollowMode::Follow, PathMode::Relative, None)
627            ),
628            NESTED_RELATIVE_GOLDEN
629        );
630    }
631
632    #[test]
633    fn walk_nested_tree_absolute_golden() {
634        // Under PathMode::Absolute every PATH column is the scratch root prefix
635        // + the relative tail; the TYPE/PERMS/CHECKSUM/SIZE columns are
636        // identical to the relative golden. We reconstruct the expected text by
637        // rewriting the relative golden's `./` prefix to the absolute root,
638        // proving the only difference is the path rendering.
639        let scratch = Scratch::new("nested-abs");
640        let r = scratch.root();
641        build_nested(r);
642        let root_str = r.to_str().unwrap();
643        let expected: String = NESTED_RELATIVE_GOLDEN
644            .lines()
645            .map(|line| {
646                // Replace the leading "./" of the PATH (last field) with the
647                // absolute root. The path is everything after the 4th space.
648                let (head, path) = line.rsplit_once(' ').unwrap();
649                let abs_path = if path == "./" {
650                    format!("{root_str}/")
651                } else {
652                    format!("{root_str}/{}", path.strip_prefix("./").unwrap())
653                };
654                format!("{head} {abs_path}")
655            })
656            .collect::<Vec<_>>()
657            .join("\n");
658        assert_eq!(
659            manifest_text(r, &opts(FollowMode::Follow, PathMode::Absolute, None)),
660            expected
661        );
662    }
663
664    #[test]
665    fn walk_directory_size_is_sum_of_members_golden() {
666        // Cross-check dir-size summation: each `D` line's SIZE is the sum of its
667        // members (recursively), independent of the directory's own stat size.
668        let scratch = Scratch::new("dir-size");
669        let r = scratch.root();
670        write_file(&r.join("f1"), b"hello"); // 5
671        write_file(&r.join("sub/f2"), b"world!!"); // 7
672        write_file(&r.join("sub/f3"), b"x"); // 1
673        chmod_dirs(r, 0o700);
674
675        let expected = "\
676D 700 5681c72cfd0ddea4f54683365bc4082b92147bf33976875653133cc4aed0f96a 13 ./
677F 600 ea8f163db38682925e4491c5e58d4bb3506ef8c14eb78a86e908c5624a67200f 5 ./f1
678D 700 2ac73ec4f4ec2ef21ebfba467be499a58aef80a34d7001d68bdeb14cb58a954d 8 ./sub/
679F 600 8bafa24d36bc2aa6edc0d041e763cb59ebadb71b6e63ab4ac9314de95e9a0de7 7 ./sub/f2
680F 600 3ae7d805f6789a6402acb70ad4096a85a56bf6804eaf25c0493ac697548d30b5 1 ./sub/f3";
681        let manifest = walk(r, &WalkOptions::default(), &Blake3Hasher::new()).expect("walk");
682        assert_eq!(manifest.to_string(), expected);
683
684        // Structural cross-check of the summation rule independent of the bytes.
685        let root_dir = manifest.entries().iter().find(|e| e.path == "./").unwrap();
686        let sub_dir = manifest
687            .entries()
688            .iter()
689            .find(|e| e.path == "./sub/")
690            .unwrap();
691        assert_eq!(sub_dir.size, 8, "sub = f2(7) + f3(1)");
692        assert_eq!(root_dir.size, 13, "root = f1(5) + sub(8)");
693    }
694
695    /// Builds the symlink fixture: a real `a/` subtree plus a dir-symlink
696    /// `a_link -> a` and a file-symlink `r1f_link -> r1f`. Real dirs chmod'd to
697    /// `0o700`; files `0o600`. The symlinks' own perms are left platform-default
698    /// (NOT chmod'd) — hence the structural (not byte-golden) assertions below.
699    fn build_symlinks(root: &Path) {
700        write_file(&root.join("a/aa/f1"), b"hello");
701        write_file(&root.join("a/f2"), b"world!!");
702        write_file(&root.join("r1f"), b"r");
703        std::os::unix::fs::symlink("a", root.join("a_link")).expect("symlink dir");
704        std::os::unix::fs::symlink("r1f", root.join("r1f_link")).expect("symlink file");
705        chmod_dirs(root, 0o700);
706    }
707
708    #[test]
709    fn walk_symlink_followed_by_default() {
710        let scratch = Scratch::new("symlink-follow");
711        let r = scratch.root();
712        build_symlinks(r);
713
714        let manifest = manifest_text(r, &opts(FollowMode::Follow, PathMode::Relative, None));
715
716        // The dir symlink is followed: it materializes as its own `D ./a_link/`
717        // row whose CHECKSUM equals the real `./a/` directory's merkle, plus the
718        // full target subtree mirrored under ./a_link/.
719        let a_dir_b3 = "0c862ed8e62262f84e7fc0fe4a6c566adec4a85ef22f8a46b7ad4c9344146701";
720        assert!(
721            manifest
722                .lines()
723                .any(|l| l.starts_with("D ") && l.contains(a_dir_b3) && l.ends_with(" ./a/")),
724            "real ./a/ dir present with its merkle: {manifest}"
725        );
726        assert!(
727            manifest
728                .lines()
729                .any(|l| l.starts_with("D ") && l.contains(a_dir_b3) && l.ends_with(" ./a_link/")),
730            "followed symlink dir ./a_link/ mirrors ./a/'s merkle: {manifest}"
731        );
732        // Mirrored target subtree entries (content checksums are deterministic).
733        assert!(manifest.lines().any(|l| l.ends_with(" ./a_link/aa/")));
734        assert!(manifest.lines().any(|l| {
735            l.starts_with("F ")
736                && l.contains("ea8f163db38682925e4491c5e58d4bb3506ef8c14eb78a86e908c5624a67200f")
737                && l.ends_with(" ./a_link/aa/f1")
738        }));
739        // The file symlink is followed: it appears as an `F` row pointing at the
740        // target's content (blake3("r")), ending in ./r1f_link.
741        let r1f_b3 = "b2dea48d667b2821a9bcf69eded39a2458a1d8165ca7fcac64c3557b69a7ea08";
742        assert!(
743            manifest
744                .lines()
745                .any(|l| l.starts_with("F ") && l.contains(r1f_b3) && l.ends_with(" ./r1f_link")),
746            "followed symlink file ./r1f_link present: {manifest}"
747        );
748        assert!(
749            manifest
750                .lines()
751                .any(|l| l.starts_with("F ") && l.contains(r1f_b3) && l.ends_with(" ./r1f")),
752            "real ./r1f present: {manifest}"
753        );
754    }
755
756    #[test]
757    fn walk_no_follow_drops_symlinks() {
758        let scratch = Scratch::new("symlink-nofollow");
759        let r = scratch.root();
760        build_symlinks(r);
761
762        // With --no-follow the symlinks are dropped entirely; the manifest is a
763        // byte-exact golden over only the real entries (no `_link` rows). Note
764        // the root `D` SIZE is 13 (= sum of real members), not the 28 of the
765        // followed case (which double-counts via a_link/).
766        let expected = "\
767D 700 61a8f1898844a17eeed84d34c2e3b5fd9c7fef136dba5f7036ae70294595a085 13 ./
768D 700 0c862ed8e62262f84e7fc0fe4a6c566adec4a85ef22f8a46b7ad4c9344146701 12 ./a/
769D 700 6cd17c61c7e42c50586ee5f3f54dbc4f809f71073fc176ed2ae865103dd33625 5 ./a/aa/
770F 600 ea8f163db38682925e4491c5e58d4bb3506ef8c14eb78a86e908c5624a67200f 5 ./a/aa/f1
771F 600 8bafa24d36bc2aa6edc0d041e763cb59ebadb71b6e63ab4ac9314de95e9a0de7 7 ./a/f2
772F 600 b2dea48d667b2821a9bcf69eded39a2458a1d8165ca7fcac64c3557b69a7ea08 1 ./r1f";
773        let manifest = manifest_text(r, &opts(FollowMode::NoFollow, PathMode::Relative, None));
774        assert_eq!(manifest, expected);
775        assert!(!manifest.contains("_link"), "no-follow drops all symlinks");
776    }
777
778    #[test]
779    fn walk_exclude_regex_golden() {
780        let scratch = Scratch::new("exclude-regex");
781        let r = scratch.root();
782        write_file(&r.join("keep/k"), b"x");
783        write_file(&r.join("drop/d"), b"y");
784        write_file(&r.join("top.txt"), b"top");
785        chmod_dirs(r, 0o700);
786
787        // The matcher runs against the ABSOLUTE find path, so the exclude is
788        // anchored at the absolute root + "/drop". `drop/` is dropped entirely;
789        // `keep/` and `top.txt` remain (byte-exact golden over the survivors).
790        let abs = r.to_str().unwrap();
791        let pattern = format!("{abs}/drop");
792        let manifest = manifest_text(
793            r,
794            &opts(FollowMode::Follow, PathMode::Relative, Some(&pattern)),
795        );
796        let expected = "\
797D 700 b6f1055a5f14fdd55fa831ff6d2e2f433c7ca7fa2cc43e63a8cd0a4542d3010a 4 ./
798D 700 b9030f201b43e2a72e62951476c0bcfafe3b020ece221d2254d8610ea9e88fb5 1 ./keep/
799F 600 3ae7d805f6789a6402acb70ad4096a85a56bf6804eaf25c0493ac697548d30b5 1 ./keep/k
800F 600 ef854702aa94ba4f60c67d731671c9e0e49a031be6ce475489e91f7a33cb5243 3 ./top.txt";
801        assert_eq!(manifest, expected);
802        assert!(!manifest.contains("drop"), "drop/ excluded");
803    }
804
805    #[test]
806    fn walk_exclude_common_golden() {
807        let scratch = Scratch::new("exclude-common");
808        let r = scratch.root();
809        write_file(&r.join("src/main.rs"), b"fn main() {}\n");
810        write_file(&r.join(".git/objects/secret"), b"secret");
811        write_file(&r.join("node_modules/pkg/index.js"), b"//js\n");
812        chmod_dirs(r, 0o700);
813
814        // %common% expands to the regex that drops .git, node_modules, etc.
815        // (the CLI lane uses the same expansion; core never reads the env).
816        let expanded = crate::excludes::expand_excludes(
817            "%common%",
818            "/nonexistent/.cache/",
819            "/nonexistent/cache",
820        );
821        let pattern = expanded.pattern.expect("non-empty");
822        let manifest = manifest_text(
823            r,
824            &opts(FollowMode::Follow, PathMode::Relative, Some(&pattern)),
825        );
826        // Only ./src survives — byte-exact golden over the survivors.
827        let expected = "\
828D 700 ad5409ad5f97a26c908382b379b23971ee143e6bcd29a7d663175936d2cd4e94 13 ./
829D 700 069cd5e102d7dd39faa7093b5b2d784c32e19b01f829a902c14aa10b7182debc 13 ./src/
830F 600 2d1ebfa706ba230165250f744796a92accba5e1b6fa357983b65319da33f8e93 13 ./src/main.rs";
831        assert_eq!(manifest, expected);
832        assert!(!manifest.contains(".git"), "%common% excludes .git");
833        assert!(
834            !manifest.contains("node_modules"),
835            "%common% excludes node_modules"
836        );
837    }
838
839    #[test]
840    fn walk_snapshot_id_is_blake3_of_manifest_text() {
841        // The snapshot id is BLAKE3 of the manifest text + a trailing newline
842        // (comment lines stripped). Cross-check the public derivation against an
843        // explicit recomputation over the walk's own output.
844        let scratch = Scratch::new("snapshot-id");
845        let r = scratch.root();
846        write_file(&r.join("a/f1"), b"hello\n");
847        write_file(&r.join("b/f2"), b"world\n");
848        chmod_dirs(r, 0o700);
849        let hasher = Blake3Hasher::new();
850        let manifest = walk(r, &WalkOptions::default(), &hasher).expect("walk");
851        let id = crate::merkle::snapshot_id(&manifest, &hasher);
852
853        let mut bytes = manifest.to_string().into_bytes();
854        bytes.push(b'\n');
855        let expected = hasher.hash_hex(&bytes);
856        assert_eq!(
857            id, expected,
858            "snapshot id == blake3(manifest_text + \"\\n\")"
859        );
860        assert_eq!(id.len(), 64, "id is 64 lowercase hex chars");
861        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
862    }
863}