Skip to main content

pf_world/
fs.rs

1// SPDX-License-Identifier: MIT
2//! Filesystem layer: walk + content-address + restore.
3
4use pf_core::cas::BlobStore;
5use pf_core::digest::Digest256;
6
7use rayon::prelude::*;
8use serde::{Deserialize, Serialize};
9use std::path::{Path, PathBuf};
10use std::sync::Arc;
11
12/// One entry in the captured FS tree manifest.
13#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
14pub struct FsTreeEntry {
15    /// Path **relative to the captured root** (forward-slash separated).
16    pub path: String,
17    /// `mode` stored as 4 octal digits (e.g. `"0644"`); we keep it as a
18    /// string to preserve the leading zero through JSON.
19    pub mode: String,
20    /// File size in bytes (post-decompression). Symlinks: target byte length.
21    pub size: u64,
22    /// File kind.
23    pub kind: FsEntryKind,
24    /// Content digest. For symlinks, the digest of the target string. For
25    /// directories, [`None`] (the directory is implied by its children).
26    #[serde(default, skip_serializing_if = "Option::is_none")]
27    pub blob: Option<Digest256>,
28    /// Symlink target (only for symlinks).
29    #[serde(default, skip_serializing_if = "Option::is_none")]
30    pub link_target: Option<String>,
31}
32
33/// File kind for [`FsTreeEntry`].
34#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
35#[serde(rename_all = "snake_case")]
36pub enum FsEntryKind {
37    /// Regular file.
38    File,
39    /// Directory (no content; presence implies the dir).
40    Dir,
41    /// Symbolic link.
42    Symlink,
43}
44
45/// Wire format of the captured tree (`fs.tree.v1` blob).
46#[derive(Clone, Debug, Serialize, Deserialize)]
47pub struct FsTree {
48    /// Schema discriminator. Always `"fs.tree.v1"`.
49    pub kind: String,
50    /// Entries sorted by `path` for deterministic digests.
51    pub entries: Vec<FsTreeEntry>,
52}
53
54/// Captures a directory tree into a [`BlobStore`] and emits a single
55/// `fs.tree.v1` blob describing the structure.
56///
57/// Concurrency: file content-addressing runs on a rayon thread pool. Walk
58/// itself is single-threaded (`walkdir`) — we sort all entries first so the
59/// emitted manifest is byte-identical across runs over the same tree.
60pub struct WalkFsCapture {
61    root: PathBuf,
62    use_apfs_clone: bool,
63    follow_symlinks: bool,
64    ignore: Vec<String>,
65    /// v1.0.13 audit fix: glob-style ignore patterns alongside the
66    /// segment-match ones. Built lazily from `ignore` entries that
67    /// contain glob meta-characters (`*`, `?`, `[`).
68    ignore_globs: Vec<globset::GlobMatcher>,
69}
70
/// Extra default ignores introduced in v1.0.13.
///
/// Before v1.0.13 the defaults only covered build directories (`target`,
/// `node_modules`, `.git/objects`) and the `.pfcid` sentinel. The v1.0.12
/// retest reproduced **false merge conflicts** when `__pycache__/` and
/// `.pytest_cache/` from a `pytest` run landed in the captured tree on
/// otherwise disjoint branches. The names below are caches and similar
/// by-products that should never be treated as source of truth, plus the
/// common Python bytecode globs. The list is deliberately conservative:
/// nothing here is something a user would plausibly want captured.
const DEFAULT_EXTRA_IGNORES: &[&str] = &[
    // Tool caches and environments.
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".tox",
    ".coverage",
    ".venv",
    ".DS_Store",
    // Python bytecode (glob patterns).
    "*.pyc",
    "*.pyo",
];
92
93impl WalkFsCapture {
94    /// Capture the directory rooted at `root`.
95    pub fn new(root: impl AsRef<Path>) -> Self {
96        let mut ignore: Vec<String> = vec![
97            ".git/objects".into(),
98            "target".into(),
99            "node_modules".into(),
100            // `.pfcid` is the sentinel `pf checkout` writes so a
101            // subsequent `pf snapshot` knows its parent CID. We
102            // skip it here so it never lands in the captured tree.
103            ".pfcid".into(),
104        ];
105        for extra in DEFAULT_EXTRA_IGNORES {
106            ignore.push((*extra).to_owned());
107        }
108        let ignore_globs = compile_globs(&ignore);
109        Self {
110            root: root.as_ref().to_path_buf(),
111            use_apfs_clone: false,
112            follow_symlinks: false,
113            ignore,
114            ignore_globs,
115        }
116    }
117
118    /// Build a capturer that does NOT carry the v1.0.13 default-extra
119    /// ignore set (`__pycache__`, `.pytest_cache`, `*.pyc`, …).
120    /// Operators who want byte-for-byte capture of every file in
121    /// the source tree (rare; CI auditing the set itself; building
122    /// a registry mirror) call this. Default callers should use
123    /// [`WalkFsCapture::new`] which has the safe set.
124    pub fn new_without_default_ignores(root: impl AsRef<Path>) -> Self {
125        let ignore: Vec<String> = vec![
126            ".git/objects".into(),
127            "target".into(),
128            "node_modules".into(),
129            ".pfcid".into(),
130        ];
131        let ignore_globs = compile_globs(&ignore);
132        Self {
133            root: root.as_ref().to_path_buf(),
134            use_apfs_clone: false,
135            follow_symlinks: false,
136            ignore,
137            ignore_globs,
138        }
139    }
140
141    /// Toggle the macOS APFS clone fast-path. When enabled and the source is
142    /// on APFS, we `clonefile(2)`-clone the directory into a temp dir first
143    /// (O(1) per the APFS docs) and walk the clone — giving a stable view
144    /// without pausing the agent. Falls back to a direct walk on other
145    /// filesystems / OSes. Off by default in v1; opt in for production.
146    #[must_use]
147    pub fn use_apfs_clone(mut self, enable: bool) -> Self {
148        self.use_apfs_clone = enable;
149        self
150    }
151
152    /// Follow symlinks during walk. Off by default — we capture symlinks as
153    /// symlinks, not as the targets they happen to point at.
154    #[must_use]
155    pub fn follow_symlinks(mut self, enable: bool) -> Self {
156        self.follow_symlinks = enable;
157        self
158    }
159
160    /// Add a path-fragment OR glob pattern to the ignore list.
161    ///
162    /// - Plain entries (`target`, `__pycache__`, `.git/objects`) are
163    ///   matched as path-component sequences, exactly as before.
164    /// - Glob entries (anything containing `*`, `?`, `[`) are matched
165    ///   against the relative path via [`globset::Glob`]. Common
166    ///   patterns: `*.pyc`, `*.log`, `**/build/**`.
167    ///
168    /// v1.0.13 added glob support; segment-match semantics for plain
169    /// entries are unchanged.
170    #[must_use]
171    pub fn ignore(mut self, fragment: impl Into<String>) -> Self {
172        let entry: String = fragment.into();
173        if has_glob_chars(&entry)
174            && let Ok(g) = globset::Glob::new(&entry)
175        {
176            self.ignore_globs.push(g.compile_matcher());
177        }
178        self.ignore.push(entry);
179        self
180    }
181
182    /// Read a `.gitignore`/`.pfignore`-style file and apply each
183    /// non-comment, non-empty line as an ignore entry. Lines ending
184    /// with `/` have the slash stripped (gitignore directory marker).
185    /// Lines starting with `!` (gitignore negation) are skipped with
186    /// a `tracing::warn!` — this is a v1.0.13 limitation; full
187    /// gitignore semantics with negation arrive when an operator
188    /// hits the use case.
189    ///
190    /// Returns Ok(self) even if the file doesn't exist (so
191    /// `.ignore_from(".pfignore")` is safe to chain unconditionally).
192    /// Returns the underlying io::Error only if the file exists but
193    /// can't be read (permissions, etc.).
194    pub fn ignore_from(mut self, path: impl AsRef<Path>) -> std::io::Result<Self> {
195        let path = path.as_ref();
196        if !path.exists() {
197            return Ok(self);
198        }
199        let content = std::fs::read_to_string(path)?;
200        for raw in content.lines() {
201            let line = raw.trim();
202            if line.is_empty() || line.starts_with('#') {
203                continue;
204            }
205            if line.starts_with('!') {
206                tracing::warn!(
207                    "ignoring gitignore negation in {}: {} (negation not yet supported in v1.0.13)",
208                    path.display(),
209                    line
210                );
211                continue;
212            }
213            let trimmed = line.trim_start_matches('/').trim_end_matches('/');
214            if trimmed.is_empty() {
215                continue;
216            }
217            self = self.ignore(trimmed);
218        }
219        Ok(self)
220    }
221
222    /// Run the capture. Returns the digest of the `fs.tree.v1` blob.
223    pub fn capture(&self, blobs: &Arc<dyn BlobStore>) -> pf_core::Result<Digest256> {
224        // APFS clone fast-path is best-effort; if it fails we fall back to
225        // walking the live tree.
226        let walk_root: PathBuf = if self.use_apfs_clone && cfg!(target_os = "macos") {
227            apfs_clone(&self.root).unwrap_or_else(|_| self.root.clone())
228        } else {
229            self.root.clone()
230        };
231
232        // Collect entries first so we can sort and parallelize hashing.
233        let mut raw: Vec<walkdir::DirEntry> = walkdir::WalkDir::new(&walk_root)
234            .follow_links(self.follow_symlinks)
235            .into_iter()
236            .filter_entry(|e| {
237                // Component-segment match (NOT substring). The v1.0.2
238                // audit found that the previous `p.contains(frag)` test
239                // dropped legitimate paths whose name happened to share
240                // a substring with an ignore entry, e.g.
241                // `src/targeted/keep.txt` was filtered because "target"
242                // appeared as a substring. We now compare each
243                // path-component to each ignore entry exactly. Multi-
244                // segment ignores like ".git/objects" still work via
245                // path-prefix containment of the joined segments.
246                //
247                // v1.0.13: glob entries (containing `*`/`?`/`[`) are
248                // also matched against the path RELATIVE to walk_root,
249                // so `*.pyc` correctly skips `src/foo.pyc`.
250                let rel = e.path().strip_prefix(&walk_root).unwrap_or(e.path());
251                !path_matches_any_ignore(e.path(), &self.ignore)
252                    && !path_matches_any_glob(rel, &self.ignore_globs)
253            })
254            .filter_map(std::result::Result::ok)
255            .collect();
256
257        // Skip the root itself (we capture its contents, not its name).
258        raw.retain(|e| e.path() != walk_root.as_path());
259
260        // Sort by path for deterministic manifests.
261        raw.sort_by(|a, b| a.path().cmp(b.path()));
262
263        // Parallel-hash regular files; symlinks/dirs are O(1).
264        let entries: Vec<FsTreeEntry> = raw
265            .par_iter()
266            .map(|de| -> pf_core::Result<FsTreeEntry> {
267                let abs = de.path();
268                let rel = abs.strip_prefix(&walk_root).unwrap_or(abs);
269                let rel_str = rel.to_string_lossy().replace('\\', "/");
270                let meta = de
271                    .metadata()
272                    .map_err(|e| std::io::Error::other(e.to_string()))?;
273                let mode = unix_mode_string(&meta);
274
275                if meta.file_type().is_dir() {
276                    return Ok(FsTreeEntry {
277                        path: rel_str,
278                        mode,
279                        size: 0,
280                        kind: FsEntryKind::Dir,
281                        blob: None,
282                        link_target: None,
283                    });
284                }
285                if meta.file_type().is_symlink() {
286                    let target = std::fs::read_link(abs)?;
287                    let target_str = target.to_string_lossy().to_string();
288                    let blob = blobs.put(target_str.as_bytes())?;
289                    return Ok(FsTreeEntry {
290                        path: rel_str,
291                        mode,
292                        size: target_str.len() as u64,
293                        kind: FsEntryKind::Symlink,
294                        blob: Some(blob),
295                        link_target: Some(target_str),
296                    });
297                }
298                // Regular file.
299                let bytes = std::fs::read(abs)?;
300                let size = bytes.len() as u64;
301                let digest = blobs.put(&bytes)?;
302                Ok(FsTreeEntry {
303                    path: rel_str,
304                    mode,
305                    size,
306                    kind: FsEntryKind::File,
307                    blob: Some(digest),
308                    link_target: None,
309                })
310            })
311            .collect::<pf_core::Result<Vec<_>>>()?;
312
313        let tree = FsTree {
314            kind: "fs.tree.v1".into(),
315            entries,
316        };
317        let json = serde_json::to_vec(&tree)?;
318        blobs.put(&json)
319    }
320}
321
322/// Restore a previously-captured tree blob into a fresh directory `dst`.
323///
324/// The restore is **atomic**: we rebuild into `dst.with_extension("pftmp")`,
325/// `fsync` the parent, then `rename(2)` over `dst`. If `dst` already exists
326/// the call errors — callers can pass a tempdir or pre-clean.
327pub fn restore_tree(
328    blobs: &Arc<dyn BlobStore>,
329    tree_digest: &Digest256,
330    dst: impl AsRef<Path>,
331) -> pf_core::Result<()> {
332    let dst = dst.as_ref();
333    if dst.exists() {
334        return Err(pf_core::Error::Io(std::io::Error::new(
335            std::io::ErrorKind::AlreadyExists,
336            format!(
337                "restore_tree refuses to overwrite existing path {}",
338                dst.display()
339            ),
340        )));
341    }
342    let tree_bytes = blobs.get(tree_digest)?;
343    let tree: FsTree = serde_json::from_slice(&tree_bytes)?;
344    if tree.kind != "fs.tree.v1" {
345        return Err(pf_core::Error::Integrity(format!(
346            "expected fs.tree.v1, got {}",
347            tree.kind
348        )));
349    }
350
351    // Stage to a sibling temp directory.
352    let parent = dst.parent().unwrap_or_else(|| Path::new("."));
353    std::fs::create_dir_all(parent)?;
354    let staging = parent.join(format!(
355        ".pf-restore.{}.{}",
356        std::process::id(),
357        chrono::Utc::now().timestamp_nanos_opt().unwrap_or_default(),
358    ));
359    std::fs::create_dir(&staging)?;
360
361    // Pass 1: directories (sorted, so parents land before children).
362    for e in tree
363        .entries
364        .iter()
365        .filter(|e| matches!(e.kind, FsEntryKind::Dir))
366    {
367        let safe = safe_join(&staging, &e.path)?;
368        std::fs::create_dir_all(&safe)?;
369        apply_mode(&safe, &e.mode)?;
370    }
371    // Pass 2: files + symlinks.
372    for e in &tree.entries {
373        let p = safe_join(&staging, &e.path)?;
374        match e.kind {
375            FsEntryKind::Dir => {}
376            FsEntryKind::File => {
377                let blob = e.blob.as_ref().ok_or_else(|| {
378                    pf_core::Error::Integrity(format!("file entry {} missing blob", e.path))
379                })?;
380                let bytes = blobs.get(blob)?;
381                if let Some(parent) = p.parent() {
382                    std::fs::create_dir_all(parent)?;
383                }
384                std::fs::write(&p, bytes)?;
385                apply_mode(&p, &e.mode)?;
386            }
387            FsEntryKind::Symlink => {
388                let raw_target = e.link_target.as_ref().ok_or_else(|| {
389                    pf_core::Error::Integrity(format!(
390                        "symlink entry {} missing link_target",
391                        e.path
392                    ))
393                })?;
394                // Symlink target hardening: refuse absolute targets and
395                // refuse relative targets that would escape the staging
396                // root. Together with the safe_join above this means a
397                // malicious .pfimg can never write or link outside the
398                // restore directory.
399                check_symlink_target(&staging, &p, raw_target)?;
400                if let Some(parent) = p.parent() {
401                    std::fs::create_dir_all(parent)?;
402                }
403                #[cfg(unix)]
404                std::os::unix::fs::symlink(raw_target, &p)?;
405                #[cfg(not(unix))]
406                std::fs::write(&p, raw_target.as_bytes())?;
407            }
408        }
409    }
410
411    // Atomic flip.
412    std::fs::rename(&staging, dst)?;
413    Ok(())
414}
415
416// `safe_join` (defined further down) is the v1.0.3 fix for the
417// "Zip Slip"–style CVE found in the v1.0.2 audit: a malicious .pfimg
418// with `path: "../../etc/passwd"` could write outside the target dir.
419
// NOTE: this paragraph documents `path_matches_any_ignore`, which is
// defined further down in this file (after the glob helpers).
//
// Component-segment ignore matcher. The v1.0.2 audit found that
// substring-matching dropped legitimate paths like
// `src/targeted/keep.txt` (because "target" appeared as a substring).
//
// Each ignore entry is now matched as a *path-component slash-sequence*:
// an ignore of "target" matches a path that has any component equal
// to "target", but does NOT match "targeted" or "untargeted".
// Multi-segment ignores like ".git/objects" match consecutive
// component runs.

/// Reports whether `entry` should be treated as a glob pattern, i.e. whether
/// it contains any of the meta-characters `*`, `?` or `[`. Since v1.0.13
/// this decides between compiling a glob matcher and plain segment matching.
fn has_glob_chars(entry: &str) -> bool {
    entry.chars().any(|ch| matches!(ch, '*' | '?' | '['))
}
435
436/// Compile every glob-style entry in `ignores` into a `GlobMatcher`.
437/// Plain segment entries are skipped (handled by
438/// `path_matches_any_ignore`). Invalid globs are silently dropped —
439/// the operator's `--ignore` arg already passed clap parsing, so a
440/// malformed glob is a user error worth surfacing via tracing but
441/// not worth aborting capture for.
442fn compile_globs(ignores: &[String]) -> Vec<globset::GlobMatcher> {
443    let mut out = Vec::new();
444    for ign in ignores {
445        if !has_glob_chars(ign) {
446            continue;
447        }
448        match globset::Glob::new(ign) {
449            Ok(g) => out.push(g.compile_matcher()),
450            Err(e) => tracing::warn!("ignore: invalid glob {ign:?}: {e}"),
451        }
452    }
453    out
454}
455
456/// True if any compiled glob matches `relative_path` (the path
457/// stripped of `walk_root`). Globs match the path with forward
458/// slashes — globset normalises this internally.
459fn path_matches_any_glob(relative_path: &Path, globs: &[globset::GlobMatcher]) -> bool {
460    if globs.is_empty() {
461        return false;
462    }
463    for g in globs {
464        if g.is_match(relative_path) {
465            return true;
466        }
467        // Also match against just the file name so `*.pyc` matches
468        // `src/foo.pyc` even on platforms where globset doesn't
469        // automatically descend on a non-`**` glob.
470        if let Some(name) = relative_path.file_name()
471            && g.is_match(Path::new(name))
472        {
473            return true;
474        }
475    }
476    false
477}
478
/// Returns `true` if `path` contains any entry of `ignores` as a run of
/// consecutive path components.
///
/// Each ignore entry is split on `/`: `target` matches a component that is
/// exactly `target` (not `targeted`), while `.git/objects` needs those two
/// components next to each other. Only normal, UTF-8 components of `path`
/// take part; entries that are empty after splitting never match.
fn path_matches_any_ignore(path: &Path, ignores: &[String]) -> bool {
    let mut segments: Vec<&str> = Vec::new();
    for component in path.components() {
        if let std::path::Component::Normal(os) = component {
            if let Some(text) = os.to_str() {
                segments.push(text);
            }
        }
    }

    ignores.iter().any(|ign| {
        let wanted: Vec<&str> = ign.split('/').filter(|part| !part.is_empty()).collect();
        // The emptiness guard must run first: `windows(0)` would panic.
        !wanted.is_empty()
            && segments
                .windows(wanted.len())
                .any(|run| run == wanted.as_slice())
    })
}
503
504/// Join `relative` onto `root`, but reject anything that would escape
505/// `root`. Catches `..` segments, absolute paths, and Windows drive
506/// letters. Returns `pf_core::Error::Integrity` on any escape attempt.
507///
508/// v1.0.3 fix for the "Zip Slip"–style CVE found in the v1.0.2 audit.
509fn safe_join(root: &Path, relative: &str) -> pf_core::Result<PathBuf> {
510    let candidate = Path::new(relative);
511    if candidate.is_absolute() {
512        return Err(pf_core::Error::Integrity(format!(
513            "fs.tree entry has absolute path {relative:?} — refusing"
514        )));
515    }
516    // Component-by-component check rather than `..`-substring (substring
517    // would false-positive on legitimate names like "..foo").
518    for comp in candidate.components() {
519        match comp {
520            std::path::Component::ParentDir => {
521                return Err(pf_core::Error::Integrity(format!(
522                    "fs.tree entry path {relative:?} contains `..` — refusing"
523                )));
524            }
525            std::path::Component::RootDir | std::path::Component::Prefix(_) => {
526                return Err(pf_core::Error::Integrity(format!(
527                    "fs.tree entry path {relative:?} has root/prefix — refusing"
528                )));
529            }
530            std::path::Component::CurDir | std::path::Component::Normal(_) => {}
531        }
532    }
533    Ok(root.join(candidate))
534}
535
536/// Reject symlink targets that would resolve outside the restore root.
537/// Absolute targets are always rejected (they obviously escape). For
538/// relative targets we walk the components from the symlink's parent
539/// dir and reject if the cumulative depth ever goes negative relative
540/// to the root.
541fn check_symlink_target(root: &Path, link_path: &Path, target: &str) -> pf_core::Result<()> {
542    let target_path = Path::new(target);
543    if target_path.is_absolute() {
544        return Err(pf_core::Error::Integrity(format!(
545            "symlink target {target:?} is absolute — refusing"
546        )));
547    }
548    // Compute the symlink's depth below root, then walk the target's
549    // components keeping a running depth counter. If it ever goes
550    // below 0 the symlink would escape.
551    let link_depth = link_path
552        .strip_prefix(root)
553        .ok()
554        .map_or(0, |p| p.components().count().saturating_sub(1));
555    let mut depth = isize::try_from(link_depth).unwrap_or(isize::MAX);
556    for comp in target_path.components() {
557        match comp {
558            std::path::Component::ParentDir => depth -= 1,
559            std::path::Component::Normal(_) => depth += 1,
560            std::path::Component::CurDir => {}
561            std::path::Component::RootDir | std::path::Component::Prefix(_) => {
562                return Err(pf_core::Error::Integrity(format!(
563                    "symlink target {target:?} has root/prefix — refusing"
564                )));
565            }
566        }
567        if depth < 0 {
568            return Err(pf_core::Error::Integrity(format!(
569                "symlink target {target:?} escapes restore root — refusing"
570            )));
571        }
572    }
573    Ok(())
574}
575
576/// Apply the captured unix mode (e.g. "100755") to `path`. No-op on
577/// Windows. The mode string is taken from `unix_mode_string()` at
578/// capture time — the high bits are the file type and we mask them
579/// out before chmod (only the permission bits matter for restore).
580#[cfg(unix)]
581fn apply_mode(path: &Path, mode: &str) -> pf_core::Result<()> {
582    use std::os::unix::fs::PermissionsExt as _;
583    let raw = u32::from_str_radix(mode, 8).unwrap_or(0o644);
584    let perm = std::fs::Permissions::from_mode(raw & 0o7777);
585    // Don't chmod symlinks (lchmod isn't portable); the symlink's
586    // own mode is irrelevant on every linux/macos host.
587    let meta = std::fs::symlink_metadata(path)?;
588    if meta.file_type().is_symlink() {
589        return Ok(());
590    }
591    std::fs::set_permissions(path, perm)?;
592    Ok(())
593}
594
595#[cfg(not(unix))]
596fn apply_mode(_path: &Path, _mode: &str) -> pf_core::Result<()> {
597    Ok(())
598}
599
600// ----- macOS APFS clone helper -----
601
/// Clone `src` into a fresh, uniquely named directory under the system temp
/// dir using `cp -c -R`, and return the clone's path.
///
/// The caller owns the returned directory and is responsible for removing
/// it.
///
/// # Errors
///
/// Fails if `cp` cannot be spawned or exits unsuccessfully. In the latter
/// case any partially written clone is removed (best effort) before the
/// error is returned.
#[cfg(target_os = "macos")]
fn apfs_clone(src: &Path) -> std::io::Result<PathBuf> {
    use std::process::Command;
    // pid + nanosecond timestamp keeps concurrent captures apart.
    let dst = std::env::temp_dir().join(format!(
        "pf-apfs-clone.{}.{}",
        std::process::id(),
        chrono::Utc::now().timestamp_nanos_opt().unwrap_or_default(),
    ));
    let status = Command::new("cp")
        .args(["-c", "-R"])
        .arg(src)
        .arg(&dst)
        .status()?;
    if !status.success() {
        // `cp` may have copied part of the tree before failing.
        let _ = std::fs::remove_dir_all(&dst);
        return Err(std::io::Error::other(format!(
            "cp -c -R exit status: {status:?}"
        )));
    }
    Ok(dst)
}

/// Stand-in for platforms other than macOS: always returns an error, which
/// callers treat as "walk the live tree instead".
#[cfg(not(target_os = "macos"))]
fn apfs_clone(_src: &Path) -> std::io::Result<PathBuf> {
    Err(std::io::Error::other("APFS clone only available on macOS"))
}
627
628// ----- mode helper -----
629
/// Render the permission bits of `meta` (masked with `0o7777`) as a
/// zero-padded octal string of at least four digits, e.g. `"0644"`.
#[cfg(unix)]
fn unix_mode_string(meta: &std::fs::Metadata) -> String {
    use std::os::unix::fs::PermissionsExt;
    let bits = meta.permissions().mode() & 0o7777;
    format!("{bits:04o}")
}

/// Non-unix approximation: `"0444"` for read-only entries, `"0644"`
/// otherwise.
#[cfg(not(unix))]
fn unix_mode_string(meta: &std::fs::Metadata) -> String {
    let text = if meta.permissions().readonly() {
        "0444"
    } else {
        "0644"
    };
    String::from(text)
}
643
644#[cfg(test)]
645mod tests {
646    use super::*;
647    use pf_core::cas::MemBlobStore;
648    use std::sync::Arc;
649    use tempfile::TempDir;
650
651    fn write(dir: &Path, rel: &str, contents: &[u8]) {
652        let p = dir.join(rel);
653        if let Some(parent) = p.parent() {
654            std::fs::create_dir_all(parent).unwrap();
655        }
656        std::fs::write(&p, contents).unwrap();
657    }
658
659    #[test]
660    fn round_trip_small_tree() {
661        let src = TempDir::new().unwrap();
662        write(src.path(), "a.txt", b"hello");
663        write(src.path(), "sub/b.txt", b"world");
664        write(src.path(), "sub/c.bin", &vec![0xABu8; 8 * 1024]);
665
666        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
667        let tree_cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
668
669        let restore_root = TempDir::new().unwrap();
670        let dst = restore_root.path().join("restored");
671        restore_tree(&blobs, &tree_cid, &dst).unwrap();
672
673        assert_eq!(std::fs::read(dst.join("a.txt")).unwrap(), b"hello");
674        assert_eq!(std::fs::read(dst.join("sub/b.txt")).unwrap(), b"world");
675        assert_eq!(
676            std::fs::read(dst.join("sub/c.bin")).unwrap().len(),
677            8 * 1024
678        );
679    }
680
681    #[test]
682    fn capture_is_deterministic() {
683        let src = TempDir::new().unwrap();
684        write(src.path(), "a.txt", b"hello");
685        write(src.path(), "b.txt", b"world");
686        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
687        let cid1 = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
688        let cid2 = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
689        assert_eq!(
690            cid1, cid2,
691            "capture of identical tree must be byte-identical"
692        );
693    }
694
695    #[test]
696    fn ignored_paths_are_skipped() {
697        let src = TempDir::new().unwrap();
698        write(src.path(), "kept.txt", b"keep");
699        write(src.path(), "node_modules/dep/index.js", b"skip");
700        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
701        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
702        let bytes = blobs.get(&cid).unwrap();
703        let tree: FsTree = serde_json::from_slice(&bytes).unwrap();
704        assert!(tree.entries.iter().any(|e| e.path == "kept.txt"));
705        assert!(
706            !tree
707                .entries
708                .iter()
709                .any(|e| e.path.starts_with("node_modules"))
710        );
711    }
712
713    #[cfg(unix)]
714    #[test]
715    fn symlinks_are_captured_as_symlinks() {
716        let src = TempDir::new().unwrap();
717        write(src.path(), "real.txt", b"data");
718        std::os::unix::fs::symlink("real.txt", src.path().join("link.txt")).unwrap();
719        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
720        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
721
722        let restore_root = TempDir::new().unwrap();
723        let dst = restore_root.path().join("r");
724        restore_tree(&blobs, &cid, &dst).unwrap();
725        let meta = std::fs::symlink_metadata(dst.join("link.txt")).unwrap();
726        assert!(meta.file_type().is_symlink());
727        assert_eq!(
728            std::fs::read_link(dst.join("link.txt"))
729                .unwrap()
730                .to_str()
731                .unwrap(),
732            "real.txt"
733        );
734    }
735
736    // ---- v1.0.3 audit-fix regression tests ----
737
738    /// CVE: malicious .pfimg with `..` in a path must be refused.
739    /// v1.0.2 audit reproduced writing outside the target dir twice.
740    #[test]
741    fn malicious_relative_path_traversal_is_refused() {
742        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
743        let payload = b"PWNED";
744        let blob = blobs.put(payload).unwrap();
745        let tree = FsTree {
746            kind: "fs.tree.v1".into(),
747            entries: vec![FsTreeEntry {
748                path: "../../escape.txt".into(),
749                mode: "100644".into(),
750                size: payload.len() as u64,
751                kind: FsEntryKind::File,
752                blob: Some(blob),
753                link_target: None,
754            }],
755        };
756        let tree_bytes = serde_json::to_vec(&tree).unwrap();
757        let tree_cid = blobs.put(&tree_bytes).unwrap();
758
759        let restore_root = TempDir::new().unwrap();
760        let dst = restore_root.path().join("dst");
761        let err = restore_tree(&blobs, &tree_cid, &dst).unwrap_err();
762        assert!(
763            format!("{err}").contains("`..`") || format!("{err}").contains("refusing"),
764            "expected path-traversal refusal, got {err}"
765        );
766        // And the would-be escaped path doesn't exist.
767        assert!(!restore_root.path().join("escape.txt").exists());
768    }
769
/// CVE: a malicious `.pfimg` carrying an absolute entry path must be refused.
#[test]
fn malicious_absolute_path_is_refused() {
    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let content = store.put(b"x").unwrap();

    // Single hostile entry pointing at an absolute location.
    let hostile_entry = FsTreeEntry {
        path: "/tmp/should-not-write".into(),
        mode: "100644".into(),
        size: 1,
        kind: FsEntryKind::File,
        blob: Some(content),
        link_target: None,
    };
    let manifest = FsTree {
        kind: "fs.tree.v1".into(),
        entries: vec![hostile_entry],
    };
    let manifest_json = serde_json::to_vec(&manifest).unwrap();
    let manifest_cid = store.put(&manifest_json).unwrap();

    let sandbox = TempDir::new().unwrap();
    let target_dir = sandbox.path().join("dst");
    let err = restore_tree(&store, &manifest_cid, &target_dir).unwrap_err();

    // Either wording counts as a refusal.
    let rendered = err.to_string();
    let refused = ["absolute", "refusing"]
        .iter()
        .any(|needle| rendered.contains(*needle));
    assert!(refused, "expected absolute-path refusal, got {err}");
}
795
/// CVE: a symlink entry whose target climbs out of the restore root must be
/// refused; otherwise a follow-up file write through the link would land
/// outside the sandbox.
#[cfg(unix)]
#[test]
fn malicious_symlink_escape_is_refused() {
    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let link_dest = "../../escape";
    let link_blob = store.put(link_dest.as_bytes()).unwrap();

    let hostile_link = FsTreeEntry {
        path: "evil.lnk".into(),
        mode: "120777".into(),
        size: link_dest.len() as u64,
        kind: FsEntryKind::Symlink,
        blob: Some(link_blob),
        link_target: Some(String::from(link_dest)),
    };
    let manifest = FsTree {
        kind: "fs.tree.v1".into(),
        entries: vec![hostile_link],
    };
    let manifest_json = serde_json::to_vec(&manifest).unwrap();
    let manifest_cid = store.put(&manifest_json).unwrap();

    let sandbox = TempDir::new().unwrap();
    let target_dir = sandbox.path().join("dst");
    let err = restore_tree(&store, &manifest_cid, &target_dir).unwrap_err();

    let rendered = err.to_string();
    let refused = ["escape", "refusing"]
        .iter()
        .any(|needle| rendered.contains(*needle));
    assert!(refused, "expected symlink-escape refusal, got {err}");
}
825
/// v1.0.2 audit: a source file with mode 0755 came back as 0644 after
/// restore. The permission bits must round-trip through capture + restore.
#[cfg(unix)]
#[test]
fn executable_mode_is_restored() {
    use std::os::unix::fs::PermissionsExt as _;

    // Source tree: one shell script, explicitly marked executable.
    let origin = TempDir::new().unwrap();
    write(origin.path(), "script.sh", b"#!/bin/sh\necho hi\n");
    let script = origin.path().join("script.sh");
    let exec_perms = std::fs::Permissions::from_mode(0o755);
    std::fs::set_permissions(&script, exec_perms).unwrap();

    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let snapshot = WalkFsCapture::new(origin.path()).capture(&store).unwrap();

    let scratch = TempDir::new().unwrap();
    let restored_dir = scratch.path().join("r");
    restore_tree(&store, &snapshot, &restored_dir).unwrap();

    // Compare only the permission bits, not the file-type bits.
    let restored_script = restored_dir.join("script.sh");
    let mode_bits = std::fs::metadata(restored_script)
        .unwrap()
        .permissions()
        .mode()
        & 0o7777;
    assert_eq!(
        mode_bits, 0o755,
        "executable bit must survive snapshot+restore"
    );
}
848
/// v1.0.2 audit: substring matching dropped legitimate paths such as
/// `src/targeted/keep.txt`, because `target` is also a default ignore.
/// Since v1.0.3 the match is made per path segment.
#[test]
fn ignore_matches_segments_not_substrings() {
    let origin = TempDir::new().unwrap();
    write(origin.path(), "src/targeted/keep.txt", b"keep");
    write(origin.path(), "target/should-skip.txt", b"skip");

    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let snapshot = WalkFsCapture::new(origin.path()).capture(&store).unwrap();
    let raw_manifest = store.get(&snapshot).unwrap();
    let tree: FsTree = serde_json::from_slice(&raw_manifest).unwrap();
    let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();

    // A segment that merely contains "target" must be kept ...
    let kept = paths.iter().any(|p| *p == "src/targeted/keep.txt");
    assert!(
        kept,
        "src/targeted/keep.txt must NOT be filtered (was: {paths:?})"
    );
    // ... while the real `target/` directory is dropped.
    let target_gone = paths.iter().all(|p| !p.starts_with("target/"));
    assert!(
        target_gone,
        "target/ subtree must be filtered (was: {paths:?})"
    );
}
870
/// v1.0.13: the v1.0.12 retest reproduced false merge conflicts when
/// `__pycache__/` and `.pytest_cache/` ended up in the captured tree after a
/// `pytest` run on otherwise-disjoint branches. The default-extra ignores
/// now skip them.
#[test]
fn default_ignores_skip_python_cache_dirs() {
    let origin = TempDir::new().unwrap();

    // One real source file plus a representative file from each cache dir.
    let fixtures: [(&str, &[u8]); 6] = [
        ("src/main.py", b"print('hi')\n"),
        (
            "src/__pycache__/main.cpython-313.pyc",
            b"\x03\xf3\r\n", // pyc magic-ish
        ),
        (".pytest_cache/CACHEDIR.TAG", b"Signature: ..."),
        (".mypy_cache/3.13/CACHEDIR.TAG", b"..."),
        (".ruff_cache/0.6.0/foo", b"x"),
        (".venv/bin/python", b"#!/...\n"),
    ];
    for (rel, body) in fixtures {
        write(origin.path(), rel, body);
    }

    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let snapshot = WalkFsCapture::new(origin.path()).capture(&store).unwrap();
    let raw_manifest = store.get(&snapshot).unwrap();
    let tree: FsTree = serde_json::from_slice(&raw_manifest).unwrap();
    let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();

    let source_kept = paths.iter().any(|p| *p == "src/main.py");
    assert!(source_kept, "real source file must survive: {paths:?}");

    for cache_pat in [
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        ".venv",
    ] {
        let leaked = paths.iter().any(|p| p.contains(cache_pat));
        assert!(
            !leaked,
            "{cache_pat} must be filtered by default; got: {paths:?}"
        );
    }
}
910
/// v1.0.13: glob-pattern entries on the ignore list are matched against the
/// path. The default set includes `*.pyc`, so a stray `.pyc` outside
/// `__pycache__/` (e.g. shipped artefacts) is skipped as well.
#[test]
// The assertion below deliberately looks for the exact lowercase `.pyc` suffix.
#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn glob_patterns_match_files_anywhere_in_tree() {
    let origin = TempDir::new().unwrap();
    write(origin.path(), "src/main.py", b"keep");
    write(origin.path(), "src/legacy.pyc", b"skip-by-glob");
    write(origin.path(), "build/output.pyc", b"skip-by-glob");

    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let snapshot = WalkFsCapture::new(origin.path()).capture(&store).unwrap();
    let raw_manifest = store.get(&snapshot).unwrap();
    let tree: FsTree = serde_json::from_slice(&raw_manifest).unwrap();
    let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();

    let source_kept = paths.iter().any(|p| *p == "src/main.py");
    assert!(source_kept, "non-glob source must survive: {paths:?}");

    let no_pyc_left = paths.iter().all(|p| !p.ends_with(".pyc"));
    assert!(
        no_pyc_left,
        "*.pyc glob must filter every .pyc anywhere: {paths:?}"
    );
}
935
/// v1.0.13: callers can opt out of the default-extra ignore set when cache
/// files genuinely need to be captured (CI auditing the cache shape, a
/// registry mirror, etc.).
#[test]
fn opt_out_of_default_ignores_captures_caches() {
    let origin = TempDir::new().unwrap();
    write(origin.path(), "__pycache__/foo.pyc", b"x");
    write(origin.path(), "src/main.py", b"hi");

    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let snapshot = WalkFsCapture::new_without_default_ignores(origin.path())
        .capture(&store)
        .unwrap();
    let raw_manifest = store.get(&snapshot).unwrap();
    let tree: FsTree = serde_json::from_slice(&raw_manifest).unwrap();
    let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();

    let cache_captured = paths.iter().any(|p| p.contains("__pycache__"));
    assert!(
        cache_captured,
        "without default ignores, __pycache__ must round-trip: {paths:?}"
    );
}
955
/// v1.0.13: `.ignore_from(".pfignore")` reads gitignore-style rules from a
/// file. Common case: the operator drops project-specific patterns into a
/// `.pfignore` file alongside the source.
#[test]
// The assertion below deliberately looks for the exact lowercase `.log` suffix.
#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn ignore_from_file_applies_each_line() {
    let origin = TempDir::new().unwrap();
    write(origin.path(), "src/main.py", b"keep");
    write(origin.path(), "secrets/api.key", b"private");
    write(origin.path(), "logs/today.log", b"verbose");
    // Ignore file: a comment line, a plain name and a glob.
    write(
        origin.path(),
        ".pfignore",
        b"# project ignores\nsecrets\n*.log\n",
    );

    let store: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
    let ignore_file = origin.path().join(".pfignore");
    let snapshot = WalkFsCapture::new(origin.path())
        .ignore_from(ignore_file)
        .unwrap()
        .capture(&store)
        .unwrap();
    let raw_manifest = store.get(&snapshot).unwrap();
    let tree: FsTree = serde_json::from_slice(&raw_manifest).unwrap();
    let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();

    assert!(paths.iter().any(|p| *p == "src/main.py"));

    let secrets_gone = paths.iter().all(|p| !p.starts_with("secrets/"));
    assert!(
        secrets_gone,
        "secrets/ should be filtered by .pfignore: {paths:?}"
    );

    let logs_gone = paths.iter().all(|p| !p.ends_with(".log"));
    assert!(
        logs_gone,
        "*.log glob from .pfignore should filter logs: {paths:?}"
    );
}
990}