Skip to main content

objects/
fs_clone.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Filesystem-level copy-on-write helpers.
3//!
4//! Heddle's worktree materializer needs the storage win of pointing
5//! N worktrees at the same blob bytes (so checking out the same state
6//! to many sibling worktrees costs ~1× disk, not N×) **without** the
7//! mutation hazard that hardlinks bring. With hardlinks, an in-place
8//! write — `chmod +w file && echo new > file`, `O_TRUNC`, etc. —
9//! mutates the shared inode, corrupting every other worktree that
10//! points at the same blob.
11//!
12//! Filesystem reflinks (a.k.a. CoW clones) solve this: the destination
13//! starts out sharing physical blocks with the source, but the first
14//! write to either side automatically forks the underlying allocation.
15//! The OS guarantees isolation even if an agent strips the read-only
16//! bit and overwrites the file in place.
17//!
18//! Platform support:
19//! - **macOS / APFS:** `clonefile(2)` from `<sys/clonefile.h>`. True CoW.
20//! - **Linux / btrfs / XFS-with-reflinks / ZFS:** `ioctl(dest_fd, FICLONE, src_fd)`.
21//! - **Anywhere else** (or when reflink isn't supported by the
22//!   underlying filesystem): caller falls back to a real copy.
23//!
24//! The core [`try_reflink`] returns a [`ReflinkOutcome`] so the caller
25//! can tell three genuinely-different situations apart: a successful
26//! clone, a "this filesystem can't reflink" verdict (batch-wide signal
27//! to stop trying), and a "the source vanished from under us" race
28//! (a per-blob fallback that must NOT poison the batch). Overloading the
29//! last two — as a bare `Ok(false)` did — makes one concurrently-pruned
30//! loose mirror needlessly disable reflinks for every remaining blob.
31
32use std::{fs, io, path::Path};
33
/// Result of a single reflink attempt.
///
/// The variants are kept separate so a caller can tell a filesystem-wide
/// "cannot reflink" verdict apart from a race that only affects the one
/// blob whose source disappeared.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ReflinkOutcome {
    /// The copy-on-write clone was made: `dest` now exists and shares
    /// physical blocks with `source` until one of the two is written to.
    Cloned,
    /// Reflinks are not available for this filesystem or this
    /// source/destination pair — the kernel answered with one of `EXDEV`,
    /// `EOPNOTSUPP`, `ENOTSUP`, `ENOSYS` or `EINVAL`. Because this describes
    /// the destination filesystem rather than one file, a caller working
    /// through a batch MAY stop attempting reflinks for the remainder and go
    /// straight to copy/write.
    Unsupported,
    /// `source` was no longer there when we looked (concurrent prune / torn
    /// NoSync promote). A reflink is only an optimization, so the caller
    /// should fall back to a real copy / bytes-write for THIS blob only and
    /// leave reflinks enabled for the rest of the batch: the filesystem
    /// itself is still perfectly capable. A blob that is truly absent (as
    /// opposed to merely unreflinkable) still surfaces downstream, when the
    /// copy/write fallback cannot find its bytes either.
    SourceVanished,
}
57
58/// Try a filesystem-level reflink (copy-on-write clone) from `source`
59/// to `dest`. On success the destination has its own inode and shares
60/// physical blocks with the source until either side is modified.
61///
62/// On a successful reflink: returns `Ok(ReflinkOutcome::Cloned)`. The
63/// destination file has been created with the kernel's choice of
64/// permissions (typically the source's). Callers should
65/// `set_permissions` afterwards if they need a specific mode.
66///
67/// On a "filesystem doesn't support reflinks" verdict (`EXDEV`,
68/// `EOPNOTSUPP`, `ENOTSUP`, `ENOSYS`, `EINVAL` from the ioctl form):
69/// returns `Ok(ReflinkOutcome::Unsupported)`. The caller should fall
70/// back to `fs::copy` and may skip future reflink attempts on this
71/// filesystem.
72///
73/// When the `source` is gone (missing at the pre-check, or `ENOENT`
74/// from the syscall in the TOCTOU window after it): returns
75/// `Ok(ReflinkOutcome::SourceVanished)`. The caller should fall back
76/// to a copy/bytes-write for this blob only and keep reflinks enabled
77/// for the rest of the batch — a vanished mirror says nothing about
78/// the filesystem's reflink capability.
79///
80/// On any other I/O error: returns `Err`.
81///
82/// `dest` must not already exist on macOS (`clonefile` requires a
83/// nonexistent destination). On Linux `FICLONE` requires the dest fd
84/// be opened for writing on a regular file, which we create with
85/// `O_CREAT | O_WRONLY | O_TRUNC`.
86pub fn try_reflink(source: &Path, dest: &Path) -> io::Result<ReflinkOutcome> {
87    // Never hand `clonefile`/`FICLONE` a source that isn't there: a missing
88    // source is reported as ENOENT, which `reflink_unsupported` deliberately
89    // does NOT swallow (ENOENT is a genuinely-missing file, not "reflink
90    // unsupported"), so it would hard-error. Reflink is only an optimization —
91    // a vanished loose mirror (concurrent prune / torn promote) must degrade to
92    // the caller's copy/bytes-write fallback, not crash. This is reported as
93    // `SourceVanished` (NOT `Unsupported`) so a single pruned blob doesn't
94    // disable reflinks for the whole batch. This guard is what stopped `heddle
95    // start` from failing on macOS/APFS with `conflict: No such file or
96    // directory` (heddle#571). A genuinely-missing blob still errors loudly
97    // downstream — `get_blob` returns `NotFound` with the hash when the copy
98    // fallback also can't find the bytes.
99    if !source.exists() {
100        return Ok(ReflinkOutcome::SourceVanished);
101    }
102    #[cfg(target_os = "macos")]
103    {
104        try_clonefile_macos(source, dest)
105    }
106    #[cfg(target_os = "linux")]
107    {
108        try_ficlone_linux(source, dest)
109    }
110    #[cfg(not(any(target_os = "macos", target_os = "linux")))]
111    {
112        let _ = (source, dest);
113        Ok(ReflinkOutcome::Unsupported)
114    }
115}
116
117/// Reflink if possible, otherwise fall back to a real copy. Returns
118/// the same `Ok(true)/Ok(false)` discriminator as [`try_reflink`] —
119/// `true` when the OS gave us a CoW clone, `false` when we paid the
120/// full copy cost. Either way, on `Ok` the destination exists and has
121/// the source's bytes.
122///
123/// The destination's permission bits are not normalized here. Callers
124/// that need a specific mode (`0o644`, `0o755`) should call
125/// `fs::set_permissions` after a successful return.
126pub fn clonefile_or_copy(source: &Path, dest: &Path) -> io::Result<bool> {
127    // `clonefile`/FICLONE require dest not to exist; remove any stale
128    // entry first. Ignored if dest doesn't exist.
129    let _ = fs::remove_file(dest);
130    if matches!(try_reflink(source, dest)?, ReflinkOutcome::Cloned) {
131        return Ok(true);
132    }
133    fs::copy(source, dest)?;
134    Ok(false)
135}
136
/// macOS backend for [`try_reflink`]: clone `source` to `dest` with
/// `clonefile(2)`.
///
/// Returns `Ok(ReflinkOutcome::Cloned)` when the syscall succeeds. A failure
/// is routed through `classify_clone_err`, which maps it to `Unsupported`,
/// `SourceVanished`, or a genuine `Err`. A path containing an interior NUL
/// byte cannot be passed to C and is rejected with
/// `io::ErrorKind::InvalidInput` before any syscall is made.
#[cfg(target_os = "macos")]
fn try_clonefile_macos(source: &Path, dest: &Path) -> io::Result<ReflinkOutcome> {
    use std::{ffi::CString, os::unix::ffi::OsStrExt};

    // SAFETY: linking the system `clonefile(2)` symbol. Signature
    // matches `<sys/clonefile.h>`:
    //   int clonefile(const char *src, const char *dst, uint32_t flags);
    unsafe extern "C" {
        fn clonefile(src: *const libc::c_char, dst: *const libc::c_char, flags: u32)
        -> libc::c_int;
    }

    // Convert a path to a NUL-terminated C string, reporting an interior NUL
    // with the caller-supplied message.
    let to_c_path = |path: &Path, nul_msg: &'static str| {
        CString::new(path.as_os_str().as_bytes())
            .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, nul_msg))
    };
    let src_c = to_c_path(source, "source path contains interior NUL")?;
    let dst_c = to_c_path(dest, "destination path contains interior NUL")?;

    // SAFETY: both pointers are NUL-terminated C strings owned by the local
    // `CString`s, which outlive the call. `flags = 0` requests the default
    // behavior (clone data + metadata). Per clonefile(2), NOT following a
    // symlink source requires `CLONE_NOFOLLOW`, so with 0 a symlink `src` is
    // followed and its target is what gets cloned.
    let rc = unsafe { clonefile(src_c.as_ptr(), dst_c.as_ptr(), 0) };
    if rc == 0 {
        return Ok(ReflinkOutcome::Cloned);
    }

    // Read errno straight after the failing call, before anything else can
    // overwrite it.
    classify_clone_err(source, io::Error::last_os_error())
}
173
174#[cfg(target_os = "linux")]
175fn try_ficlone_linux(source: &Path, dest: &Path) -> io::Result<ReflinkOutcome> {
176    use std::{fs::OpenOptions, os::unix::io::AsRawFd};
177
178    // FICLONE = _IOW(0x94, 9, int) on Linux. The kernel header
179    // `<linux/fs.h>` (and `<linux/fs.h>` UAPI) define this as
180    // 0x40049409 = (1 << 30) | (4 << 16) | (0x94 << 8) | 9
181    // i.e. _IOC_WRITE | sizeof(int) | type=0x94 | nr=9.
182    const FICLONE: libc::c_ulong = 0x4004_9409;
183
184    // Opening the source can race a concurrent prune: the pre-check in
185    // `try_reflink` saw it, but it can vanish before this open. Map that
186    // to `SourceVanished` so the caller degrades per-blob rather than
187    // disabling reflinks for the batch (or hard-erroring).
188    let src = match OpenOptions::new().read(true).open(source) {
189        Ok(f) => f,
190        Err(err) => return classify_clone_err(source, err),
191    };
192    let dst = OpenOptions::new()
193        .write(true)
194        .create(true)
195        .truncate(true)
196        .open(dest)?;
197
198    // SAFETY: ioctl with two valid fds; FICLONE expects an `int` fd
199    // as the third arg.
200    let rc = unsafe { libc::ioctl(dst.as_raw_fd(), FICLONE, src.as_raw_fd()) };
201    if rc == 0 {
202        return Ok(ReflinkOutcome::Cloned);
203    }
204
205    let err = io::Error::last_os_error();
206    // Clean up the empty dest we just created so the caller's
207    // `fs::copy` fallback starts from a known state.
208    drop(dst);
209    let _ = fs::remove_file(dest);
210    classify_clone_err(source, err)
211}
212
213/// Classify a clonefile/FICLONE (or source-open) failure into the
214/// caller-meaningful [`ReflinkOutcome`] or a genuine error.
215///
216/// * `Unsupported` — the filesystem (or src/dst pair) can't reflink
217///   (`reflink_unsupported`). A batch-wide property.
218/// * `SourceVanished` — the failure is `ENOENT` and the source is in
219///   fact gone now (concurrent prune / torn promote in the TOCTOU
220///   window after the pre-check). A per-blob race; reflinks stay viable
221///   for the rest of the batch. An `ENOENT` whose source still exists
222///   (e.g. a missing dest parent) is NOT swallowed here — it surfaces
223///   as an `Err` for the caller to attribute correctly.
224/// * `Err` — anything else; the caller should surface it.
225#[cfg(any(target_os = "macos", target_os = "linux"))]
226fn classify_clone_err(source: &Path, err: io::Error) -> io::Result<ReflinkOutcome> {
227    if reflink_unsupported(&err) {
228        Ok(ReflinkOutcome::Unsupported)
229    } else if err.kind() == io::ErrorKind::NotFound && !source.exists() {
230        Ok(ReflinkOutcome::SourceVanished)
231    } else {
232        Err(err)
233    }
234}
235
236/// Decide whether a clonefile/FICLONE error means "this filesystem
237/// (or this src/dst pair) won't ever reflink" vs a transient or
238/// caller-bug failure that we should surface.
239#[cfg(any(target_os = "macos", target_os = "linux"))]
240fn reflink_unsupported(err: &io::Error) -> bool {
241    let Some(code) = err.raw_os_error() else {
242        return false;
243    };
244    // EXDEV: cross-device — the two paths live on different filesystems.
245    // EOPNOTSUPP / ENOTSUP: filesystem doesn't implement reflinks
246    //    (e.g. ext4 on Linux, HFS+ on macOS). On Linux these two are
247    //    aliases (both = 95) so listing both makes one branch
248    //    unreachable; on macOS they're distinct (102 vs 45), so we need
249    //    both to be matched. `#[allow(unreachable_patterns)]` keeps the
250    //    portable spelling without a `cfg`-split.
251    // ENOSYS: kernel too old to know the syscall.
252    // EINVAL: FICLONE returns this when the src/dst aren't on the same
253    //    filesystem on some kernels, or when the filesystem is mounted
254    //    without reflink support.
255    #[allow(unreachable_patterns)]
256    let is_unsupported = matches!(
257        code,
258        libc::EXDEV | libc::EOPNOTSUPP | libc::ENOTSUP | libc::ENOSYS | libc::EINVAL
259    );
260    is_unsupported
261}
262
263/// Test whether the filesystem at `parent_dir` supports reflinks by
264/// trying one against a temp source/dest pair. Returns `true` on
265/// success. Useful for tests that want to soft-skip on filesystems
266/// without CoW support, and for any caller that wants a runtime
267/// capability check before asserting on reflink-specific properties.
268pub fn filesystem_supports_reflink(parent_dir: &Path) -> bool {
269    use std::io::Write;
270
271    let src = parent_dir.join(".heddle-reflink-probe-src");
272    let dst = parent_dir.join(".heddle-reflink-probe-dst");
273    let _ = fs::remove_file(&src);
274    let _ = fs::remove_file(&dst);
275
276    let mut f = match fs::File::create(&src) {
277        Ok(f) => f,
278        Err(_) => return false,
279    };
280    if f.write_all(b"reflink-probe").is_err() {
281        let _ = fs::remove_file(&src);
282        return false;
283    }
284    drop(f);
285
286    let supported = matches!(try_reflink(&src, &dst), Ok(ReflinkOutcome::Cloned));
287    let _ = fs::remove_file(&src);
288    let _ = fs::remove_file(&dst);
289    supported
290}
291
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;

    /// Create a fresh temp dir and return it with a source path named
    /// `src_name` and a `dst.txt` path inside it. Neither file is created.
    /// The `TempDir` is returned so the caller keeps the directory alive for
    /// the duration of the test.
    fn scratch(src_name: &str) -> (TempDir, std::path::PathBuf, std::path::PathBuf) {
        let temp = TempDir::new().unwrap();
        let src = temp.path().join(src_name);
        let dst = temp.path().join("dst.txt");
        (temp, src, dst)
    }

    /// heddle#571 (Bug 2): a reflink must only be attempted when the source
    /// exists. A vanished loose mirror (concurrent prune / torn promote) has
    /// to degrade to the caller's copy/bytes-write fallback rather than
    /// hard-error with the ENOENT that `clonefile` raises on macOS (and that
    /// `reflink_unsupported` correctly refuses to swallow). The outcome must
    /// be `SourceVanished` — distinct from `Unsupported` — so that a single
    /// pruned blob doesn't disable reflinks for the whole batch
    /// (heddle#571 r3). Verifiable on Linux: no syscall is issued.
    #[test]
    fn try_reflink_missing_source_reports_vanished_not_unsupported() {
        let (_temp, src, dst) = scratch("does-not-exist.txt");
        assert!(!src.exists());

        let result = try_reflink(&src, &dst);
        assert!(
            matches!(result, Ok(ReflinkOutcome::SourceVanished)),
            "a missing reflink source must report SourceVanished (per-blob fallback, \
             NOT the batch-wide Unsupported), got {result:?}"
        );
        assert!(
            !dst.exists(),
            "no destination should be created when the source is missing"
        );
    }

    /// Whether the clone or the copy path is taken, the destination ends up
    /// with the source's bytes.
    #[test]
    fn clonefile_or_copy_creates_destination_with_source_bytes() {
        let (_temp, src, dst) = scratch("src.txt");
        fs::write(&src, b"hello reflink").unwrap();

        let _ = clonefile_or_copy(&src, &dst).unwrap();

        assert_eq!(fs::read(&dst).unwrap(), b"hello reflink");
    }

    /// A destination that already exists is replaced by the source's bytes.
    #[test]
    fn clonefile_or_copy_overwrites_existing_destination() {
        let (_temp, src, dst) = scratch("src.txt");
        fs::write(&src, b"new content").unwrap();
        fs::write(&dst, b"old content").unwrap();

        let _ = clonefile_or_copy(&src, &dst).unwrap();

        assert_eq!(fs::read(&dst).unwrap(), b"new content");
    }

    /// Core isolation property: writing to the materialized destination
    /// must leave the source's bytes alone. With a real CoW clone the kernel
    /// forks blocks on first write; with the `fs::copy` fallback the
    /// destination was a separate file from the start. In both cases the
    /// source has to stay untouched.
    #[test]
    fn writing_to_destination_does_not_mutate_source() {
        let (_temp, src, dst) = scratch("src.txt");
        fs::write(&src, b"original source").unwrap();

        let _ = clonefile_or_copy(&src, &dst).unwrap();
        fs::write(&dst, b"mutated dest").unwrap();

        assert_eq!(fs::read(&src).unwrap(), b"original source");
        assert_eq!(fs::read(&dst).unwrap(), b"mutated dest");
    }

    /// Unlike a hardlink, a reflink gives the destination its own inode.
    /// On a CoW filesystem that is the key correctness distinction: agents
    /// can chmod or write in place without reaching across worktrees.
    /// Soft-skips when the temp dir's filesystem cannot reflink.
    #[cfg(unix)]
    #[test]
    fn successful_reflink_yields_distinct_inode() {
        use std::os::unix::fs::MetadataExt;

        let (temp, src, dst) = scratch("src.txt");
        if !filesystem_supports_reflink(temp.path()) {
            eprintln!(
                "[skip] filesystem at {:?} does not support reflinks; cannot assert inode property",
                temp.path()
            );
            return;
        }

        fs::write(&src, b"reflink inode test").unwrap();

        let outcome = try_reflink(&src, &dst).unwrap();
        assert_eq!(
            outcome,
            ReflinkOutcome::Cloned,
            "filesystem advertised reflink support"
        );

        let src_inode = fs::metadata(&src).unwrap().ino();
        let dst_inode = fs::metadata(&dst).unwrap().ino();
        assert_ne!(
            src_inode, dst_inode,
            "reflinked files must have distinct inodes (got {} for both)",
            src_inode
        );
    }
}