Skip to main content

pf_world/
fs.rs

1// SPDX-License-Identifier: MIT
2//! Filesystem layer: walk + content-address + restore.
3
4use pf_core::cas::BlobStore;
5use pf_core::digest::Digest256;
6
7use rayon::prelude::*;
8use serde::{Deserialize, Serialize};
9use std::path::{Path, PathBuf};
10use std::sync::Arc;
11
/// One entry in the captured FS tree manifest.
///
/// Entries are produced by [`WalkFsCapture::capture`] and consumed by
/// [`restore_tree_with_options`]. The two optional fields are omitted from
/// the serialized form when `None` and default to `None` when absent.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct FsTreeEntry {
    /// Path **relative to the captured root** (forward-slash separated).
    pub path: String,
    /// `mode` stored as 4 octal digits (e.g. `"0644"`); we keep it as a
    /// string to preserve the leading zero through JSON.
    pub mode: String,
    /// File size in bytes (post-decompression). Symlinks: target byte length.
    /// Directories are recorded with size `0`.
    pub size: u64,
    /// File kind.
    pub kind: FsEntryKind,
    /// Content digest. For symlinks, the digest of the target string. For
    /// directories, [`None`] (directories carry no content of their own).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub blob: Option<Digest256>,
    /// Symlink target (only for symlinks).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub link_target: Option<String>,
}
32
/// File kind for [`FsTreeEntry`].
///
/// Serialized in `snake_case` (see the `rename_all` attribute), i.e. as
/// `"file"`, `"dir"` and `"symlink"`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FsEntryKind {
    /// Regular file.
    File,
    /// Directory (no content; presence implies the dir).
    Dir,
    /// Symbolic link.
    Symlink,
}
44
/// Wire format of the captured tree (`fs.tree.v1` blob).
///
/// The capture path serializes this struct to JSON and stores the bytes in
/// the blob store; the restore path rejects any value whose `kind` is not
/// `"fs.tree.v1"`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FsTree {
    /// Schema discriminator. Always `"fs.tree.v1"`.
    pub kind: String,
    /// Entries sorted by `path` for deterministic digests.
    pub entries: Vec<FsTreeEntry>,
}
53
/// Captures a directory tree into a [`BlobStore`] and emits a single
/// `fs.tree.v1` blob describing the structure.
///
/// Concurrency: file content-addressing runs on a rayon thread pool. Walk
/// itself is single-threaded (`walkdir`) — we sort all entries first so the
/// emitted manifest is byte-identical across runs over the same tree.
pub struct WalkFsCapture {
    /// Directory whose contents are captured.
    root: PathBuf,
    /// Whether to try the macOS APFS clone fast-path before walking.
    use_apfs_clone: bool,
    /// Whether the walker follows symlinks instead of recording them.
    follow_symlinks: bool,
    /// Every ignore entry (plain segment entries and glob patterns alike),
    /// in the order they were added.
    ignore: Vec<String>,
    /// v1.0.13 audit fix: glob-style ignore patterns alongside the
    /// segment-match ones. Compiled from the `ignore` entries that
    /// contain glob meta-characters (`*`, `?`, `[`) — once in the
    /// constructors and again for each entry added through `ignore`.
    ignore_globs: Vec<globset::GlobMatcher>,
}
70
/// v1.0.13 default-ignore set extension. The previous default set
/// covered build directories (`target`, `node_modules`, `.git/objects`)
/// and the `.pfcid` sentinel. The v1.0.12 retest reproduced **false
/// merge conflicts** when `__pycache__/` and `.pytest_cache/`
/// landed in the captured tree from a `pytest` run on otherwise
/// disjoint branches. This set adds the names of well-known cache
/// directories and files that should never be source-of-truth, plus
/// common Python bytecode patterns. Conservative: nothing here is ever
/// a "maybe I want this" — they are all caches by definition.
///
/// Entries without glob meta-characters are matched as path segments;
/// the `*.…` entries are compiled as globs.
const DEFAULT_EXTRA_IGNORES: &[&str] = &[
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".tox",
    ".coverage",
    ".venv",
    ".DS_Store",
    "*.pyc",
    "*.pyo",
];
92
93impl WalkFsCapture {
94    /// Capture the directory rooted at `root`.
95    pub fn new(root: impl AsRef<Path>) -> Self {
96        let mut ignore: Vec<String> = vec![
97            ".git/objects".into(),
98            "target".into(),
99            "node_modules".into(),
100            // `.pfcid` is the sentinel `pf checkout` writes so a
101            // subsequent `pf snapshot` knows its parent CID. We
102            // skip it here so it never lands in the captured tree.
103            ".pfcid".into(),
104        ];
105        for extra in DEFAULT_EXTRA_IGNORES {
106            ignore.push((*extra).to_owned());
107        }
108        let ignore_globs = compile_globs(&ignore);
109        Self {
110            root: root.as_ref().to_path_buf(),
111            use_apfs_clone: false,
112            follow_symlinks: false,
113            ignore,
114            ignore_globs,
115        }
116    }
117
118    /// Build a capturer that does NOT carry the v1.0.13 default-extra
119    /// ignore set (`__pycache__`, `.pytest_cache`, `*.pyc`, …).
120    /// Operators who want byte-for-byte capture of every file in
121    /// the source tree (rare; CI auditing the set itself; building
122    /// a registry mirror) call this. Default callers should use
123    /// [`WalkFsCapture::new`] which has the safe set.
124    pub fn new_without_default_ignores(root: impl AsRef<Path>) -> Self {
125        let ignore: Vec<String> = vec![
126            ".git/objects".into(),
127            "target".into(),
128            "node_modules".into(),
129            ".pfcid".into(),
130        ];
131        let ignore_globs = compile_globs(&ignore);
132        Self {
133            root: root.as_ref().to_path_buf(),
134            use_apfs_clone: false,
135            follow_symlinks: false,
136            ignore,
137            ignore_globs,
138        }
139    }
140
141    /// Toggle the macOS APFS clone fast-path. When enabled and the source is
142    /// on APFS, we `clonefile(2)`-clone the directory into a temp dir first
143    /// (O(1) per the APFS docs) and walk the clone — giving a stable view
144    /// without pausing the agent. Falls back to a direct walk on other
145    /// filesystems / OSes. Off by default in v1; opt in for production.
146    #[must_use]
147    pub fn use_apfs_clone(mut self, enable: bool) -> Self {
148        self.use_apfs_clone = enable;
149        self
150    }
151
152    /// Follow symlinks during walk. Off by default — we capture symlinks as
153    /// symlinks, not as the targets they happen to point at.
154    #[must_use]
155    pub fn follow_symlinks(mut self, enable: bool) -> Self {
156        self.follow_symlinks = enable;
157        self
158    }
159
160    /// Add a path-fragment OR glob pattern to the ignore list.
161    ///
162    /// - Plain entries (`target`, `__pycache__`, `.git/objects`) are
163    ///   matched as path-component sequences, exactly as before.
164    /// - Glob entries (anything containing `*`, `?`, `[`) are matched
165    ///   against the relative path via [`globset::Glob`]. Common
166    ///   patterns: `*.pyc`, `*.log`, `**/build/**`.
167    ///
168    /// v1.0.13 added glob support; segment-match semantics for plain
169    /// entries are unchanged.
170    #[must_use]
171    pub fn ignore(mut self, fragment: impl Into<String>) -> Self {
172        let entry: String = fragment.into();
173        if has_glob_chars(&entry)
174            && let Ok(g) = globset::Glob::new(&entry)
175        {
176            self.ignore_globs.push(g.compile_matcher());
177        }
178        self.ignore.push(entry);
179        self
180    }
181
182    /// Read a `.gitignore`/`.pfignore`-style file and apply each
183    /// non-comment, non-empty line as an ignore entry. Lines ending
184    /// with `/` have the slash stripped (gitignore directory marker).
185    /// Lines starting with `!` (gitignore negation) are skipped with
186    /// a `tracing::warn!` — this is a v1.0.13 limitation; full
187    /// gitignore semantics with negation arrive when an operator
188    /// hits the use case.
189    ///
190    /// Returns Ok(self) even if the file doesn't exist (so
191    /// `.ignore_from(".pfignore")` is safe to chain unconditionally).
192    /// Returns the underlying io::Error only if the file exists but
193    /// can't be read (permissions, etc.).
194    pub fn ignore_from(mut self, path: impl AsRef<Path>) -> std::io::Result<Self> {
195        let path = path.as_ref();
196        if !path.exists() {
197            return Ok(self);
198        }
199        let content = std::fs::read_to_string(path)?;
200        for raw in content.lines() {
201            let line = raw.trim();
202            if line.is_empty() || line.starts_with('#') {
203                continue;
204            }
205            if line.starts_with('!') {
206                tracing::warn!(
207                    "ignoring gitignore negation in {}: {} (negation not yet supported in v1.0.13)",
208                    path.display(),
209                    line
210                );
211                continue;
212            }
213            let trimmed = line.trim_start_matches('/').trim_end_matches('/');
214            if trimmed.is_empty() {
215                continue;
216            }
217            self = self.ignore(trimmed);
218        }
219        Ok(self)
220    }
221
222    /// Run the capture. Returns the digest of the `fs.tree.v1` blob.
223    pub fn capture(&self, blobs: &Arc<dyn BlobStore>) -> pf_core::Result<Digest256> {
224        // APFS clone fast-path is best-effort; if it fails we fall back to
225        // walking the live tree.
226        let walk_root: PathBuf = if self.use_apfs_clone && cfg!(target_os = "macos") {
227            apfs_clone(&self.root).unwrap_or_else(|_| self.root.clone())
228        } else {
229            self.root.clone()
230        };
231
232        // Collect entries first so we can sort and parallelize hashing.
233        let mut raw: Vec<walkdir::DirEntry> = walkdir::WalkDir::new(&walk_root)
234            .follow_links(self.follow_symlinks)
235            .into_iter()
236            .filter_entry(|e| {
237                // Component-segment match (NOT substring). The v1.0.2
238                // audit found that the previous `p.contains(frag)` test
239                // dropped legitimate paths whose name happened to share
240                // a substring with an ignore entry, e.g.
241                // `src/targeted/keep.txt` was filtered because "target"
242                // appeared as a substring. We now compare each
243                // path-component to each ignore entry exactly. Multi-
244                // segment ignores like ".git/objects" still work via
245                // path-prefix containment of the joined segments.
246                //
247                // v1.0.13: glob entries (containing `*`/`?`/`[`) are
248                // also matched against the path RELATIVE to walk_root,
249                // so `*.pyc` correctly skips `src/foo.pyc`.
250                let rel = e.path().strip_prefix(&walk_root).unwrap_or(e.path());
251                !path_matches_any_ignore(e.path(), &self.ignore)
252                    && !path_matches_any_glob(rel, &self.ignore_globs)
253            })
254            .filter_map(std::result::Result::ok)
255            .collect();
256
257        // Skip the root itself (we capture its contents, not its name).
258        raw.retain(|e| e.path() != walk_root.as_path());
259
260        // Sort by path for deterministic manifests.
261        raw.sort_by(|a, b| a.path().cmp(b.path()));
262
263        // Parallel-hash regular files; symlinks/dirs are O(1).
264        let entries: Vec<FsTreeEntry> = raw
265            .par_iter()
266            .map(|de| -> pf_core::Result<FsTreeEntry> {
267                let abs = de.path();
268                let rel = abs.strip_prefix(&walk_root).unwrap_or(abs);
269                let rel_str = rel.to_string_lossy().replace('\\', "/");
270                let meta = de
271                    .metadata()
272                    .map_err(|e| std::io::Error::other(e.to_string()))?;
273                let mode = unix_mode_string(&meta);
274
275                if meta.file_type().is_dir() {
276                    return Ok(FsTreeEntry {
277                        path: rel_str,
278                        mode,
279                        size: 0,
280                        kind: FsEntryKind::Dir,
281                        blob: None,
282                        link_target: None,
283                    });
284                }
285                if meta.file_type().is_symlink() {
286                    let target = std::fs::read_link(abs)?;
287                    let target_str = target.to_string_lossy().to_string();
288                    let blob = blobs.put(target_str.as_bytes())?;
289                    return Ok(FsTreeEntry {
290                        path: rel_str,
291                        mode,
292                        size: target_str.len() as u64,
293                        kind: FsEntryKind::Symlink,
294                        blob: Some(blob),
295                        link_target: Some(target_str),
296                    });
297                }
298                // Regular file.
299                let bytes = std::fs::read(abs)?;
300                let size = bytes.len() as u64;
301                let digest = blobs.put(&bytes)?;
302                Ok(FsTreeEntry {
303                    path: rel_str,
304                    mode,
305                    size,
306                    kind: FsEntryKind::File,
307                    blob: Some(digest),
308                    link_target: None,
309                })
310            })
311            .collect::<pf_core::Result<Vec<_>>>()?;
312
313        let tree = FsTree {
314            kind: "fs.tree.v1".into(),
315            entries,
316        };
317        let json = serde_json::to_vec(&tree)?;
318        blobs.put(&json)
319    }
320}
321
/// Knobs for [`restore_tree_with_options`].
///
/// v1.0.14 audit fix: prior versions of `restore_tree` treated any
/// absolute-target symlink as a hard error that aborted the whole
/// restore. The auditor flagged this as awkward — captured trees
/// often contain legitimate absolute symlinks (e.g. `/var/log/agent`)
/// that the operator wants to keep. New behavior:
///
/// - `allow_absolute_symlinks = false` (default): absolute symlinks
///   are **skipped with an `eprintln!` warning** instead of erroring.
///   The rest of the tree still restores and the operator sees what
///   was skipped. This flag only decides whether the link itself is
///   created; the relative-target escape check (PF-SA-2026-001
///   "Zip Slip") applies regardless of its value.
/// - `allow_absolute_symlinks = true`: opt-in restore of absolute
///   symlinks verbatim. The operator explicitly acknowledges that
///   anything later reading through the symlink may escape the
///   sandbox.
#[derive(Debug, Clone, Copy, Default)]
pub struct RestoreOptions {
    /// See struct docs. Default: `false` (skip-with-warn).
    pub allow_absolute_symlinks: bool,
}
346
347/// Restore a previously-captured tree blob into a fresh directory `dst`.
348///
349/// The restore is **atomic**: we rebuild into `dst.with_extension("pftmp")`,
350/// `fsync` the parent, then `rename(2)` over `dst`. If `dst` already exists
351/// the call errors — callers can pass a tempdir or pre-clean.
352///
353/// Equivalent to [`restore_tree_with_options`] with [`RestoreOptions::default`].
354pub fn restore_tree(
355    blobs: &Arc<dyn BlobStore>,
356    tree_digest: &Digest256,
357    dst: impl AsRef<Path>,
358) -> pf_core::Result<()> {
359    restore_tree_with_options(blobs, tree_digest, dst, RestoreOptions::default())
360}
361
362/// Restore a tree with operator-supplied options. v1.0.14 — see
363/// [`RestoreOptions`].
364pub fn restore_tree_with_options(
365    blobs: &Arc<dyn BlobStore>,
366    tree_digest: &Digest256,
367    dst: impl AsRef<Path>,
368    opts: RestoreOptions,
369) -> pf_core::Result<()> {
370    let dst = dst.as_ref();
371    if dst.exists() {
372        return Err(pf_core::Error::Io(std::io::Error::new(
373            std::io::ErrorKind::AlreadyExists,
374            format!(
375                "restore_tree refuses to overwrite existing path {}",
376                dst.display()
377            ),
378        )));
379    }
380    let tree_bytes = blobs.get(tree_digest)?;
381    let tree: FsTree = serde_json::from_slice(&tree_bytes)?;
382    if tree.kind != "fs.tree.v1" {
383        return Err(pf_core::Error::Integrity(format!(
384            "expected fs.tree.v1, got {}",
385            tree.kind
386        )));
387    }
388
389    // Stage to a sibling temp directory.
390    let parent = dst.parent().unwrap_or_else(|| Path::new("."));
391    std::fs::create_dir_all(parent)?;
392    let staging = parent.join(format!(
393        ".pf-restore.{}.{}",
394        std::process::id(),
395        chrono::Utc::now().timestamp_nanos_opt().unwrap_or_default(),
396    ));
397    std::fs::create_dir(&staging)?;
398
399    // Pass 1: directories (sorted, so parents land before children).
400    for e in tree
401        .entries
402        .iter()
403        .filter(|e| matches!(e.kind, FsEntryKind::Dir))
404    {
405        let safe = safe_join(&staging, &e.path)?;
406        std::fs::create_dir_all(&safe)?;
407        apply_mode(&safe, &e.mode)?;
408    }
409    // Pass 2: files + symlinks.
410    for e in &tree.entries {
411        let p = safe_join(&staging, &e.path)?;
412        match e.kind {
413            FsEntryKind::Dir => {}
414            FsEntryKind::File => {
415                let blob = e.blob.as_ref().ok_or_else(|| {
416                    pf_core::Error::Integrity(format!("file entry {} missing blob", e.path))
417                })?;
418                let bytes = blobs.get(blob)?;
419                if let Some(parent) = p.parent() {
420                    std::fs::create_dir_all(parent)?;
421                }
422                std::fs::write(&p, bytes)?;
423                apply_mode(&p, &e.mode)?;
424            }
425            FsEntryKind::Symlink => {
426                let raw_target = e.link_target.as_ref().ok_or_else(|| {
427                    pf_core::Error::Integrity(format!(
428                        "symlink entry {} missing link_target",
429                        e.path
430                    ))
431                })?;
432                // Symlink target hardening:
433                //   - Relative targets that escape the staging root
434                //     are ALWAYS refused (the depth-counter check).
435                //     This is the v1.0.3 PF-SA-2026-001 "Zip Slip"
436                //     fix and is non-negotiable.
437                //   - Absolute targets are gated on
438                //     `opts.allow_absolute_symlinks`:
439                //       false (default): skip-with-warn so the rest
440                //                        of the tree still restores.
441                //       true: restore verbatim; operator opts in.
442                if Path::new(raw_target).is_absolute() {
443                    if opts.allow_absolute_symlinks {
444                        if let Some(parent) = p.parent() {
445                            std::fs::create_dir_all(parent)?;
446                        }
447                        #[cfg(unix)]
448                        std::os::unix::fs::symlink(raw_target, &p)?;
449                        #[cfg(not(unix))]
450                        std::fs::write(&p, raw_target.as_bytes())?;
451                    } else {
452                        eprintln!(
453                            "warning: skipped absolute symlink {} -> {} \
454                             (pass --allow-absolute-symlinks to restore)",
455                            e.path, raw_target
456                        );
457                    }
458                    continue;
459                }
460                check_symlink_target(&staging, &p, raw_target)?;
461                if let Some(parent) = p.parent() {
462                    std::fs::create_dir_all(parent)?;
463                }
464                #[cfg(unix)]
465                std::os::unix::fs::symlink(raw_target, &p)?;
466                #[cfg(not(unix))]
467                std::fs::write(&p, raw_target.as_bytes())?;
468            }
469        }
470    }
471
472    // Atomic flip.
473    std::fs::rename(&staging, dst)?;
474    Ok(())
475}
476
477// `safe_join` (defined further down) is the v1.0.3 fix for the
478// "Zip Slip"–style CVE found in the v1.0.2 audit: a malicious .pfimg
479// with `path: "../../etc/passwd"` could write outside the target dir.
480
// NOTE: the paragraph below describes `path_matches_any_ignore` (defined
// further down), not `has_glob_chars`. It is kept as a plain comment so it
// does not end up in this function's rustdoc.
//
// Component-segment ignore matcher. v1.0.2 audit found that
// substring-matching dropped legitimate paths like
// `src/targeted/keep.txt` (because "target" appeared as a substring).
//
// We now match each ignore entry as a *path-component slash-sequence*:
// an ignore of "target" matches a path that has any component equal
// to "target", but does NOT match "targeted" or "untargeted".
// Multi-segment ignores like ".git/objects" match consecutive
// component runs.

/// Report whether `entry` should be treated as a glob pattern, i.e.
/// whether it contains at least one of `*`, `?` or `[`. v1.0.13: decides
/// between compiling a glob matcher and plain segment matching.
fn has_glob_chars(entry: &str) -> bool {
    entry.chars().any(|c| matches!(c, '*' | '?' | '['))
}
496
497/// Compile every glob-style entry in `ignores` into a `GlobMatcher`.
498/// Plain segment entries are skipped (handled by
499/// `path_matches_any_ignore`). Invalid globs are silently dropped —
500/// the operator's `--ignore` arg already passed clap parsing, so a
501/// malformed glob is a user error worth surfacing via tracing but
502/// not worth aborting capture for.
503fn compile_globs(ignores: &[String]) -> Vec<globset::GlobMatcher> {
504    let mut out = Vec::new();
505    for ign in ignores {
506        if !has_glob_chars(ign) {
507            continue;
508        }
509        match globset::Glob::new(ign) {
510            Ok(g) => out.push(g.compile_matcher()),
511            Err(e) => tracing::warn!("ignore: invalid glob {ign:?}: {e}"),
512        }
513    }
514    out
515}
516
517/// True if any compiled glob matches `relative_path` (the path
518/// stripped of `walk_root`). Globs match the path with forward
519/// slashes — globset normalises this internally.
520fn path_matches_any_glob(relative_path: &Path, globs: &[globset::GlobMatcher]) -> bool {
521    if globs.is_empty() {
522        return false;
523    }
524    for g in globs {
525        if g.is_match(relative_path) {
526            return true;
527        }
528        // Also match against just the file name so `*.pyc` matches
529        // `src/foo.pyc` even on platforms where globset doesn't
530        // automatically descend on a non-`**` glob.
531        if let Some(name) = relative_path.file_name()
532            && g.is_match(Path::new(name))
533        {
534            return true;
535        }
536    }
537    false
538}
539
/// Component-segment ignore matcher: true if any entry of `ignores`
/// occurs in `path` as a run of consecutive, exactly-equal components.
///
/// Each ignore entry is split on `/`, so `target` matches any path with a
/// component equal to `target` (but not `targeted`), and `.git/objects`
/// matches only where `.git` is immediately followed by `objects`.
/// Entries that are empty after splitting never match.
///
/// Components that are not plain names (`..`, a root, a prefix) or that
/// are not valid UTF-8 can never equal an ignore segment. They keep their
/// position in the sequence, so they also break a run: `.git/../objects`
/// does not match `.git/objects`.
fn path_matches_any_ignore(path: &Path, ignores: &[String]) -> bool {
    use std::ffi::OsStr;
    use std::path::Component;

    // `None` marks a component that can never match but still occupies a
    // slot, so neighbours on either side are not treated as adjacent.
    let comps: Vec<Option<&OsStr>> = path
        .components()
        .map(|c| match c {
            Component::Normal(name) => Some(name),
            _ => None,
        })
        .collect();

    ignores.iter().any(|ign| {
        // Split each ignore on `/` so `.git/objects` checks for the
        // consecutive pair, while bare `target` checks for the single
        // segment.
        let needles: Vec<&OsStr> = ign
            .split('/')
            .filter(|s| !s.is_empty())
            .map(OsStr::new)
            .collect();
        if needles.is_empty() {
            // `windows(0)` would panic; an empty entry ignores nothing.
            return false;
        }
        comps
            .windows(needles.len())
            .any(|run| run.iter().zip(&needles).all(|(comp, needle)| *comp == Some(*needle)))
    })
}
564
565/// Join `relative` onto `root`, but reject anything that would escape
566/// `root`. Catches `..` segments, absolute paths, and Windows drive
567/// letters. Returns `pf_core::Error::Integrity` on any escape attempt.
568///
569/// v1.0.3 fix for the "Zip Slip"–style CVE found in the v1.0.2 audit.
570fn safe_join(root: &Path, relative: &str) -> pf_core::Result<PathBuf> {
571    let candidate = Path::new(relative);
572    if candidate.is_absolute() {
573        return Err(pf_core::Error::Integrity(format!(
574            "fs.tree entry has absolute path {relative:?} — refusing"
575        )));
576    }
577    // Component-by-component check rather than `..`-substring (substring
578    // would false-positive on legitimate names like "..foo").
579    for comp in candidate.components() {
580        match comp {
581            std::path::Component::ParentDir => {
582                return Err(pf_core::Error::Integrity(format!(
583                    "fs.tree entry path {relative:?} contains `..` — refusing"
584                )));
585            }
586            std::path::Component::RootDir | std::path::Component::Prefix(_) => {
587                return Err(pf_core::Error::Integrity(format!(
588                    "fs.tree entry path {relative:?} has root/prefix — refusing"
589                )));
590            }
591            std::path::Component::CurDir | std::path::Component::Normal(_) => {}
592        }
593    }
594    Ok(root.join(candidate))
595}
596
597/// Reject symlink targets that would resolve outside the restore root.
598/// Absolute targets are always rejected (they obviously escape). For
599/// relative targets we walk the components from the symlink's parent
600/// dir and reject if the cumulative depth ever goes negative relative
601/// to the root.
602fn check_symlink_target(root: &Path, link_path: &Path, target: &str) -> pf_core::Result<()> {
603    let target_path = Path::new(target);
604    if target_path.is_absolute() {
605        return Err(pf_core::Error::Integrity(format!(
606            "symlink target {target:?} is absolute — refusing"
607        )));
608    }
609    // Compute the symlink's depth below root, then walk the target's
610    // components keeping a running depth counter. If it ever goes
611    // below 0 the symlink would escape.
612    let link_depth = link_path
613        .strip_prefix(root)
614        .ok()
615        .map_or(0, |p| p.components().count().saturating_sub(1));
616    let mut depth = isize::try_from(link_depth).unwrap_or(isize::MAX);
617    for comp in target_path.components() {
618        match comp {
619            std::path::Component::ParentDir => depth -= 1,
620            std::path::Component::Normal(_) => depth += 1,
621            std::path::Component::CurDir => {}
622            std::path::Component::RootDir | std::path::Component::Prefix(_) => {
623                return Err(pf_core::Error::Integrity(format!(
624                    "symlink target {target:?} has root/prefix — refusing"
625                )));
626            }
627        }
628        if depth < 0 {
629            return Err(pf_core::Error::Integrity(format!(
630                "symlink target {target:?} escapes restore root — refusing"
631            )));
632        }
633    }
634    Ok(())
635}
636
637/// Apply the captured unix mode (e.g. "100755") to `path`. No-op on
638/// Windows. The mode string is taken from `unix_mode_string()` at
639/// capture time — the high bits are the file type and we mask them
640/// out before chmod (only the permission bits matter for restore).
641#[cfg(unix)]
642fn apply_mode(path: &Path, mode: &str) -> pf_core::Result<()> {
643    use std::os::unix::fs::PermissionsExt as _;
644    let raw = u32::from_str_radix(mode, 8).unwrap_or(0o644);
645    let perm = std::fs::Permissions::from_mode(raw & 0o7777);
646    // Don't chmod symlinks (lchmod isn't portable); the symlink's
647    // own mode is irrelevant on every linux/macos host.
648    let meta = std::fs::symlink_metadata(path)?;
649    if meta.file_type().is_symlink() {
650        return Ok(());
651    }
652    std::fs::set_permissions(path, perm)?;
653    Ok(())
654}
655
656#[cfg(not(unix))]
657fn apply_mode(_path: &Path, _mode: &str) -> pf_core::Result<()> {
658    Ok(())
659}
660
661// ----- macOS APFS clone helper -----
662
/// Copy `src` into a fresh, uniquely named directory under the system
/// temp dir by running `cp -c -R`, and return that directory's path.
///
/// The caller owns the returned directory and is responsible for
/// deleting it. On failure, anything `cp` managed to write is removed
/// (best-effort) before the error is returned, so a failed clone does
/// not leave a partial copy behind.
///
/// # Errors
///
/// Returns an error if `cp` cannot be spawned or exits unsuccessfully.
#[cfg(target_os = "macos")]
fn apfs_clone(src: &Path) -> std::io::Result<PathBuf> {
    use std::process::Command;
    // pid + nanosecond timestamp keeps concurrent captures apart.
    let dst = std::env::temp_dir().join(format!(
        "pf-apfs-clone.{}.{}",
        std::process::id(),
        chrono::Utc::now().timestamp_nanos_opt().unwrap_or_default(),
    ));
    let status = Command::new("cp")
        .args(["-c", "-R"])
        .arg(src)
        .arg(&dst)
        .status()?;
    if !status.success() {
        // `cp` may have copied part of the tree before failing.
        let _ = std::fs::remove_dir_all(&dst);
        return Err(std::io::Error::other(format!(
            "cp -c -R exit status: {status:?}"
        )));
    }
    Ok(dst)
}

/// Stand-in for targets other than macOS: always fails, so callers fall
/// back to walking the live tree.
#[cfg(not(target_os = "macos"))]
fn apfs_clone(_src: &Path) -> std::io::Result<PathBuf> {
    Err(std::io::Error::other("APFS clone only available on macOS"))
}
688
689// ----- mode helper -----
690
/// Render the permission bits of `meta` (`mode & 0o7777`) as octal,
/// zero-padded to at least four digits (e.g. `"0644"`).
#[cfg(unix)]
fn unix_mode_string(meta: &std::fs::Metadata) -> String {
    use std::os::unix::fs::MetadataExt as _;
    let permission_bits = meta.mode() & 0o7777;
    format!("{permission_bits:04o}")
}
/// Non-unix approximation: `"0444"` for read-only entries, `"0644"`
/// otherwise.
#[cfg(not(unix))]
fn unix_mode_string(meta: &std::fs::Metadata) -> String {
    let approx = if meta.permissions().readonly() {
        "0444"
    } else {
        "0644"
    };
    approx.to_owned()
}
704
705#[cfg(test)]
706mod tests {
707    use super::*;
708    use pf_core::cas::MemBlobStore;
709    use std::sync::Arc;
710    use tempfile::TempDir;
711
712    fn write(dir: &Path, rel: &str, contents: &[u8]) {
713        let p = dir.join(rel);
714        if let Some(parent) = p.parent() {
715            std::fs::create_dir_all(parent).unwrap();
716        }
717        std::fs::write(&p, contents).unwrap();
718    }
719
720    #[test]
721    fn round_trip_small_tree() {
722        let src = TempDir::new().unwrap();
723        write(src.path(), "a.txt", b"hello");
724        write(src.path(), "sub/b.txt", b"world");
725        write(src.path(), "sub/c.bin", &vec![0xABu8; 8 * 1024]);
726
727        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
728        let tree_cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
729
730        let restore_root = TempDir::new().unwrap();
731        let dst = restore_root.path().join("restored");
732        restore_tree(&blobs, &tree_cid, &dst).unwrap();
733
734        assert_eq!(std::fs::read(dst.join("a.txt")).unwrap(), b"hello");
735        assert_eq!(std::fs::read(dst.join("sub/b.txt")).unwrap(), b"world");
736        assert_eq!(
737            std::fs::read(dst.join("sub/c.bin")).unwrap().len(),
738            8 * 1024
739        );
740    }
741
742    #[test]
743    fn capture_is_deterministic() {
744        let src = TempDir::new().unwrap();
745        write(src.path(), "a.txt", b"hello");
746        write(src.path(), "b.txt", b"world");
747        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
748        let cid1 = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
749        let cid2 = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
750        assert_eq!(
751            cid1, cid2,
752            "capture of identical tree must be byte-identical"
753        );
754    }
755
756    #[test]
757    fn ignored_paths_are_skipped() {
758        let src = TempDir::new().unwrap();
759        write(src.path(), "kept.txt", b"keep");
760        write(src.path(), "node_modules/dep/index.js", b"skip");
761        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
762        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
763        let bytes = blobs.get(&cid).unwrap();
764        let tree: FsTree = serde_json::from_slice(&bytes).unwrap();
765        assert!(tree.entries.iter().any(|e| e.path == "kept.txt"));
766        assert!(
767            !tree
768                .entries
769                .iter()
770                .any(|e| e.path.starts_with("node_modules"))
771        );
772    }
773
774    #[cfg(unix)]
775    #[test]
776    fn symlinks_are_captured_as_symlinks() {
777        let src = TempDir::new().unwrap();
778        write(src.path(), "real.txt", b"data");
779        std::os::unix::fs::symlink("real.txt", src.path().join("link.txt")).unwrap();
780        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
781        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
782
783        let restore_root = TempDir::new().unwrap();
784        let dst = restore_root.path().join("r");
785        restore_tree(&blobs, &cid, &dst).unwrap();
786        let meta = std::fs::symlink_metadata(dst.join("link.txt")).unwrap();
787        assert!(meta.file_type().is_symlink());
788        assert_eq!(
789            std::fs::read_link(dst.join("link.txt"))
790                .unwrap()
791                .to_str()
792                .unwrap(),
793            "real.txt"
794        );
795    }
796
797    // ---- v1.0.3 audit-fix regression tests ----
798
799    /// CVE: malicious .pfimg with `..` in a path must be refused.
800    /// v1.0.2 audit reproduced writing outside the target dir twice.
801    #[test]
802    fn malicious_relative_path_traversal_is_refused() {
803        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
804        let payload = b"PWNED";
805        let blob = blobs.put(payload).unwrap();
806        let tree = FsTree {
807            kind: "fs.tree.v1".into(),
808            entries: vec![FsTreeEntry {
809                path: "../../escape.txt".into(),
810                mode: "100644".into(),
811                size: payload.len() as u64,
812                kind: FsEntryKind::File,
813                blob: Some(blob),
814                link_target: None,
815            }],
816        };
817        let tree_bytes = serde_json::to_vec(&tree).unwrap();
818        let tree_cid = blobs.put(&tree_bytes).unwrap();
819
820        let restore_root = TempDir::new().unwrap();
821        let dst = restore_root.path().join("dst");
822        let err = restore_tree(&blobs, &tree_cid, &dst).unwrap_err();
823        assert!(
824            format!("{err}").contains("`..`") || format!("{err}").contains("refusing"),
825            "expected path-traversal refusal, got {err}"
826        );
827        // And the would-be escaped path doesn't exist.
828        assert!(!restore_root.path().join("escape.txt").exists());
829    }
830
831    /// CVE: malicious .pfimg with an absolute path must be refused.
832    #[test]
833    fn malicious_absolute_path_is_refused() {
834        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
835        let blob = blobs.put(b"x").unwrap();
836        let tree = FsTree {
837            kind: "fs.tree.v1".into(),
838            entries: vec![FsTreeEntry {
839                path: "/tmp/should-not-write".into(),
840                mode: "100644".into(),
841                size: 1,
842                kind: FsEntryKind::File,
843                blob: Some(blob),
844                link_target: None,
845            }],
846        };
847        let tree_cid = blobs.put(&serde_json::to_vec(&tree).unwrap()).unwrap();
848        let restore_root = TempDir::new().unwrap();
849        let dst = restore_root.path().join("dst");
850        let err = restore_tree(&blobs, &tree_cid, &dst).unwrap_err();
851        assert!(
852            format!("{err}").contains("absolute") || format!("{err}").contains("refusing"),
853            "expected absolute-path refusal, got {err}"
854        );
855    }
856
857    /// CVE: malicious symlink whose target escapes the restore root
858    /// must be refused (otherwise a follow-up file-write through the
859    /// link writes outside the sandbox).
860    #[cfg(unix)]
861    #[test]
862    fn malicious_symlink_escape_is_refused() {
863        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
864        let target_str = "../../escape";
865        let blob = blobs.put(target_str.as_bytes()).unwrap();
866        let tree = FsTree {
867            kind: "fs.tree.v1".into(),
868            entries: vec![FsTreeEntry {
869                path: "evil.lnk".into(),
870                mode: "120777".into(),
871                size: target_str.len() as u64,
872                kind: FsEntryKind::Symlink,
873                blob: Some(blob),
874                link_target: Some(target_str.to_owned()),
875            }],
876        };
877        let tree_cid = blobs.put(&serde_json::to_vec(&tree).unwrap()).unwrap();
878        let restore_root = TempDir::new().unwrap();
879        let dst = restore_root.path().join("dst");
880        let err = restore_tree(&blobs, &tree_cid, &dst).unwrap_err();
881        assert!(
882            format!("{err}").contains("escape") || format!("{err}").contains("refusing"),
883            "expected symlink-escape refusal, got {err}"
884        );
885    }
886
887    /// v1.0.14: absolute-target symlinks are SKIPPED with a stderr
888    /// warning by default (rather than aborting the whole restore).
889    /// The rest of the tree restores normally, and the operator can
890    /// see what was skipped. This is what tar/rsync do; the v1.0.3
891    /// "Zip Slip" CVE protection is unaffected because we never WRITE
892    /// through the symlink, only choose whether to create it.
893    #[cfg(unix)]
894    #[test]
895    fn absolute_symlink_skipped_by_default_with_rest_restored() {
896        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
897        let file_blob = blobs.put(b"hello\n").unwrap();
898        let tree = FsTree {
899            kind: "fs.tree.v1".into(),
900            entries: vec![
901                FsTreeEntry {
902                    path: "abs.lnk".into(),
903                    mode: "120777".into(),
904                    size: 9,
905                    kind: FsEntryKind::Symlink,
906                    blob: None,
907                    link_target: Some("/var/log/agent".into()),
908                },
909                FsTreeEntry {
910                    path: "src/main.py".into(),
911                    mode: "100644".into(),
912                    size: 6,
913                    kind: FsEntryKind::File,
914                    blob: Some(file_blob),
915                    link_target: None,
916                },
917            ],
918        };
919        let tree_cid = blobs.put(&serde_json::to_vec(&tree).unwrap()).unwrap();
920        let restore_root = TempDir::new().unwrap();
921        let dst = restore_root.path().join("out");
922        // Default options → restore must succeed; absolute symlink
923        // is skipped; the regular file lands.
924        restore_tree(&blobs, &tree_cid, &dst).unwrap();
925        assert!(
926            !dst.join("abs.lnk").exists(),
927            "absolute symlink must be skipped by default"
928        );
929        assert_eq!(
930            std::fs::read_to_string(dst.join("src/main.py")).unwrap(),
931            "hello\n",
932            "rest of the tree must restore normally"
933        );
934    }
935
936    /// v1.0.14: --allow-absolute-symlinks (RestoreOptions
937    /// equivalent) opts in to restoring absolute targets verbatim.
938    /// Operator explicitly acknowledges the sandbox-escape risk.
939    #[cfg(unix)]
940    #[test]
941    fn allow_absolute_symlinks_restores_them_verbatim() {
942        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
943        let tree = FsTree {
944            kind: "fs.tree.v1".into(),
945            entries: vec![FsTreeEntry {
946                path: "abs.lnk".into(),
947                mode: "120777".into(),
948                size: 9,
949                kind: FsEntryKind::Symlink,
950                blob: None,
951                link_target: Some("/var/log/agent".into()),
952            }],
953        };
954        let tree_cid = blobs.put(&serde_json::to_vec(&tree).unwrap()).unwrap();
955        let restore_root = TempDir::new().unwrap();
956        let dst = restore_root.path().join("out");
957        restore_tree_with_options(
958            &blobs,
959            &tree_cid,
960            &dst,
961            RestoreOptions {
962                allow_absolute_symlinks: true,
963            },
964        )
965        .unwrap();
966        let link_meta = std::fs::symlink_metadata(dst.join("abs.lnk")).unwrap();
967        assert!(link_meta.file_type().is_symlink());
968        let target = std::fs::read_link(dst.join("abs.lnk")).unwrap();
969        assert_eq!(target.to_str().unwrap(), "/var/log/agent");
970    }
971
972    /// v1.0.2 audit: 0755 source file restored as 0644.
973    #[cfg(unix)]
974    #[test]
975    fn executable_mode_is_restored() {
976        use std::os::unix::fs::PermissionsExt as _;
977        let src = TempDir::new().unwrap();
978        write(src.path(), "script.sh", b"#!/bin/sh\necho hi\n");
979        let scr = src.path().join("script.sh");
980        std::fs::set_permissions(&scr, std::fs::Permissions::from_mode(0o755)).unwrap();
981        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
982        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
983
984        let restore_root = TempDir::new().unwrap();
985        let dst = restore_root.path().join("r");
986        restore_tree(&blobs, &cid, &dst).unwrap();
987        let meta = std::fs::metadata(dst.join("script.sh")).unwrap();
988        assert_eq!(
989            meta.permissions().mode() & 0o7777,
990            0o755,
991            "executable bit must survive snapshot+restore"
992        );
993    }
994
995    /// v1.0.2 audit: substring matching dropped legitimate paths
996    /// like `src/targeted/keep.txt` (the "target" segment is also a
997    /// default ignore). After v1.0.3 the match is component-segment.
998    #[test]
999    fn ignore_matches_segments_not_substrings() {
1000        let src = TempDir::new().unwrap();
1001        write(src.path(), "src/targeted/keep.txt", b"keep");
1002        write(src.path(), "target/should-skip.txt", b"skip");
1003        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
1004        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
1005        let tree: FsTree = serde_json::from_slice(&blobs.get(&cid).unwrap()).unwrap();
1006        let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();
1007        assert!(
1008            paths.contains(&"src/targeted/keep.txt"),
1009            "src/targeted/keep.txt must NOT be filtered (was: {paths:?})"
1010        );
1011        assert!(
1012            !paths.iter().any(|p| p.starts_with("target/")),
1013            "target/ subtree must be filtered (was: {paths:?})"
1014        );
1015    }
1016
1017    /// v1.0.13: the v1.0.12 retest reproduced false merge conflicts
1018    /// when `__pycache__/` and `.pytest_cache/` landed in the
1019    /// captured tree from a `pytest` run on otherwise-disjoint
1020    /// branches. Default-extra ignores now skip them.
1021    #[test]
1022    fn default_ignores_skip_python_cache_dirs() {
1023        let src = TempDir::new().unwrap();
1024        write(src.path(), "src/main.py", b"print('hi')\n");
1025        write(
1026            src.path(),
1027            "src/__pycache__/main.cpython-313.pyc",
1028            b"\x03\xf3\r\n", // pyc magic-ish
1029        );
1030        write(src.path(), ".pytest_cache/CACHEDIR.TAG", b"Signature: ...");
1031        write(src.path(), ".mypy_cache/3.13/CACHEDIR.TAG", b"...");
1032        write(src.path(), ".ruff_cache/0.6.0/foo", b"x");
1033        write(src.path(), ".venv/bin/python", b"#!/...\n");
1034        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
1035        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
1036        let tree: FsTree = serde_json::from_slice(&blobs.get(&cid).unwrap()).unwrap();
1037        let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();
1038
1039        assert!(
1040            paths.contains(&"src/main.py"),
1041            "real source file must survive: {paths:?}"
1042        );
1043        for cache_pat in [
1044            "__pycache__",
1045            ".pytest_cache",
1046            ".mypy_cache",
1047            ".ruff_cache",
1048            ".venv",
1049        ] {
1050            assert!(
1051                !paths.iter().any(|p| p.contains(cache_pat)),
1052                "{cache_pat} must be filtered by default; got: {paths:?}"
1053            );
1054        }
1055    }
1056
1057    /// v1.0.13: glob-pattern entries on the ignore list match
1058    /// against the path. The default set includes `*.pyc` so a
1059    /// stray `.pyc` outside `__pycache__/` (e.g. shipped artefacts)
1060    /// is also skipped.
1061    #[test]
1062    #[allow(clippy::case_sensitive_file_extension_comparisons)]
1063    fn glob_patterns_match_files_anywhere_in_tree() {
1064        let src = TempDir::new().unwrap();
1065        write(src.path(), "src/main.py", b"keep");
1066        write(src.path(), "src/legacy.pyc", b"skip-by-glob");
1067        write(src.path(), "build/output.pyc", b"skip-by-glob");
1068        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
1069        let cid = WalkFsCapture::new(src.path()).capture(&blobs).unwrap();
1070        let tree: FsTree = serde_json::from_slice(&blobs.get(&cid).unwrap()).unwrap();
1071        let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();
1072        assert!(
1073            paths.contains(&"src/main.py"),
1074            "non-glob source must survive: {paths:?}"
1075        );
1076        assert!(
1077            !paths.iter().any(|p| p.ends_with(".pyc")),
1078            "*.pyc glob must filter every .pyc anywhere: {paths:?}"
1079        );
1080    }
1081
1082    /// v1.0.13: opt out of the default-extra set when you genuinely
1083    /// need to capture cache files (CI auditing the cache shape, a
1084    /// registry mirror, etc.).
1085    #[test]
1086    fn opt_out_of_default_ignores_captures_caches() {
1087        let src = TempDir::new().unwrap();
1088        write(src.path(), "__pycache__/foo.pyc", b"x");
1089        write(src.path(), "src/main.py", b"hi");
1090        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
1091        let cid = WalkFsCapture::new_without_default_ignores(src.path())
1092            .capture(&blobs)
1093            .unwrap();
1094        let tree: FsTree = serde_json::from_slice(&blobs.get(&cid).unwrap()).unwrap();
1095        let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();
1096        assert!(
1097            paths.iter().any(|p| p.contains("__pycache__")),
1098            "without default ignores, __pycache__ must round-trip: {paths:?}"
1099        );
1100    }
1101
1102    /// v1.0.13: `.ignore_from(".pfignore")` reads gitignore-style
1103    /// rules from the captured tree. Common case: operator drops
1104    /// project-specific patterns into a `.pfignore` file alongside
1105    /// the source.
1106    #[test]
1107    #[allow(clippy::case_sensitive_file_extension_comparisons)]
1108    fn ignore_from_file_applies_each_line() {
1109        let src = TempDir::new().unwrap();
1110        write(src.path(), "src/main.py", b"keep");
1111        write(src.path(), "secrets/api.key", b"private");
1112        write(src.path(), "logs/today.log", b"verbose");
1113        write(
1114            src.path(),
1115            ".pfignore",
1116            b"# project ignores\nsecrets\n*.log\n",
1117        );
1118        let blobs: Arc<dyn BlobStore> = Arc::new(MemBlobStore::new());
1119        let cid = WalkFsCapture::new(src.path())
1120            .ignore_from(src.path().join(".pfignore"))
1121            .unwrap()
1122            .capture(&blobs)
1123            .unwrap();
1124        let tree: FsTree = serde_json::from_slice(&blobs.get(&cid).unwrap()).unwrap();
1125        let paths: Vec<&str> = tree.entries.iter().map(|e| e.path.as_str()).collect();
1126        assert!(paths.contains(&"src/main.py"));
1127        assert!(
1128            !paths.iter().any(|p| p.starts_with("secrets/")),
1129            "secrets/ should be filtered by .pfignore: {paths:?}"
1130        );
1131        assert!(
1132            !paths.iter().any(|p| p.ends_with(".log")),
1133            "*.log glob from .pfignore should filter logs: {paths:?}"
1134        );
1135    }
1136}