Skip to main content

objects/
fs_clone.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Filesystem-level copy-on-write helpers.
3//!
4//! Heddle's worktree materializer needs the storage win of pointing
5//! N worktrees at the same blob bytes (so checking out the same state
6//! to many sibling worktrees costs ~1× disk, not N×) **without** the
7//! mutation hazard that hardlinks bring. With hardlinks, an in-place
8//! write — `chmod +w file && echo new > file`, `O_TRUNC`, etc. —
9//! mutates the shared inode, corrupting every other worktree that
10//! points at the same blob.
11//!
12//! Filesystem reflinks (a.k.a. CoW clones) solve this: the destination
13//! starts out sharing physical blocks with the source, but the first
14//! write to either side automatically forks the underlying allocation.
15//! The OS guarantees isolation even if an agent strips the read-only
16//! bit and overwrites the file in place.
17//!
18//! Platform support:
19//! - **macOS / APFS:** `clonefile(2)` from `<sys/clonefile.h>`. True CoW.
20//! - **Linux / btrfs / XFS-with-reflinks / ZFS:** `ioctl(dest_fd, FICLONE, src_fd)`.
21//! - **Anywhere else** (or when reflink isn't supported by the
22//!   underlying filesystem): caller falls back to a real copy.
23//!
24//! The functions here return `Ok(true)` on a successful clone,
25//! `Ok(false)` when the kernel reported the operation isn't supported
26//! on this filesystem (so the caller should fall back to a real copy
27//! and remember to skip future reflink attempts in this batch), and an
28//! `Err` for genuine I/O errors that the caller should surface.
29
30use std::{fs, io, path::Path};
31
32/// Try a filesystem-level reflink (copy-on-write clone) from `source`
33/// to `dest`. On success the destination has its own inode and shares
34/// physical blocks with the source until either side is modified.
35///
36/// On a successful reflink: returns `Ok(true)`. The destination file
37/// has been created with the kernel's choice of permissions (typically
38/// the source's). Callers should `set_permissions` afterwards if they
39/// need a specific mode.
40///
41/// On a "filesystem doesn't support reflinks" verdict (`EXDEV`,
42/// `EOPNOTSUPP`, `ENOTSUP`, `ENOSYS`, `EINVAL` from the ioctl form):
43/// returns `Ok(false)`. The caller should fall back to `fs::copy` and
44/// remember to skip future reflink attempts on this filesystem.
45///
46/// On any other I/O error: returns `Err`.
47///
48/// `dest` must not already exist on macOS (`clonefile` requires a
49/// nonexistent destination). On Linux `FICLONE` requires the dest fd
50/// be opened for writing on a regular file, which we create with
51/// `O_CREAT | O_WRONLY | O_TRUNC`.
52pub fn try_reflink(source: &Path, dest: &Path) -> io::Result<bool> {
53    #[cfg(target_os = "macos")]
54    {
55        try_clonefile_macos(source, dest)
56    }
57    #[cfg(target_os = "linux")]
58    {
59        try_ficlone_linux(source, dest)
60    }
61    #[cfg(not(any(target_os = "macos", target_os = "linux")))]
62    {
63        let _ = (source, dest);
64        Ok(false)
65    }
66}
67
68/// Reflink if possible, otherwise fall back to a real copy. Returns
69/// the same `Ok(true)/Ok(false)` discriminator as [`try_reflink`] —
70/// `true` when the OS gave us a CoW clone, `false` when we paid the
71/// full copy cost. Either way, on `Ok` the destination exists and has
72/// the source's bytes.
73///
74/// The destination's permission bits are not normalized here. Callers
75/// that need a specific mode (`0o644`, `0o755`) should call
76/// `fs::set_permissions` after a successful return.
77pub fn clonefile_or_copy(source: &Path, dest: &Path) -> io::Result<bool> {
78    // `clonefile`/FICLONE require dest not to exist; remove any stale
79    // entry first. Ignored if dest doesn't exist.
80    let _ = fs::remove_file(dest);
81    if try_reflink(source, dest)? {
82        return Ok(true);
83    }
84    fs::copy(source, dest)?;
85    Ok(false)
86}
87
/// macOS backend for `try_reflink`: clone `source` to `dest` with
/// `clonefile(2)`.
///
/// Returns `Ok(true)` when the call succeeds, `Ok(false)` when the
/// resulting errno is classified as "reflink unsupported" by
/// `reflink_unsupported`, and `Err` for every other failure. A path
/// containing an interior NUL byte is rejected up front with
/// `InvalidInput`.
#[cfg(target_os = "macos")]
fn try_clonefile_macos(source: &Path, dest: &Path) -> io::Result<bool> {
    use std::{ffi::CString, os::unix::ffi::OsStrExt};

    // SAFETY: declares the system `clonefile(2)` symbol. The prototype
    // mirrors `<sys/clonefile.h>`:
    //   int clonefile(const char *src, const char *dst, uint32_t flags);
    unsafe extern "C" {
        fn clonefile(src: *const libc::c_char, dst: *const libc::c_char, flags: u32)
        -> libc::c_int;
    }

    // Paths cross the FFI boundary as NUL-terminated C strings, so an
    // interior NUL cannot be represented and is reported to the caller.
    let to_c_string = |path: &Path, nul_message: &'static str| {
        CString::new(path.as_os_str().as_bytes())
            .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, nul_message))
    };
    let source_c = to_c_string(source, "source path contains interior NUL")?;
    let dest_c = to_c_string(dest, "destination path contains interior NUL")?;

    // SAFETY: both pointers come from live, NUL-terminated `CString`s
    // owned by this frame; flags = 0 requests clonefile's default
    // behaviour.
    let status = unsafe { clonefile(source_c.as_ptr(), dest_c.as_ptr(), 0) };
    if status != 0 {
        // Read errno right away, before any other call can clobber it.
        let failure = io::Error::last_os_error();
        return if reflink_unsupported(&failure) {
            Ok(false)
        } else {
            Err(failure)
        };
    }
    Ok(true)
}
128
129#[cfg(target_os = "linux")]
130fn try_ficlone_linux(source: &Path, dest: &Path) -> io::Result<bool> {
131    use std::{fs::OpenOptions, os::unix::io::AsRawFd};
132
133    // FICLONE = _IOW(0x94, 9, int) on Linux. The kernel header
134    // `<linux/fs.h>` (and `<linux/fs.h>` UAPI) define this as
135    // 0x40049409 = (1 << 30) | (4 << 16) | (0x94 << 8) | 9
136    // i.e. _IOC_WRITE | sizeof(int) | type=0x94 | nr=9.
137    const FICLONE: libc::c_ulong = 0x4004_9409;
138
139    let src = OpenOptions::new().read(true).open(source)?;
140    let dst = OpenOptions::new()
141        .write(true)
142        .create(true)
143        .truncate(true)
144        .open(dest)?;
145
146    // SAFETY: ioctl with two valid fds; FICLONE expects an `int` fd
147    // as the third arg.
148    let rc = unsafe { libc::ioctl(dst.as_raw_fd(), FICLONE, src.as_raw_fd()) };
149    if rc == 0 {
150        return Ok(true);
151    }
152
153    let err = io::Error::last_os_error();
154    // Clean up the empty dest we just created so the caller's
155    // `fs::copy` fallback starts from a known state.
156    drop(dst);
157    let _ = fs::remove_file(dest);
158    if reflink_unsupported(&err) {
159        Ok(false)
160    } else {
161        Err(err)
162    }
163}
164
165/// Decide whether a clonefile/FICLONE error means "this filesystem
166/// (or this src/dst pair) won't ever reflink" vs a transient or
167/// caller-bug failure that we should surface.
168#[cfg(any(target_os = "macos", target_os = "linux"))]
169fn reflink_unsupported(err: &io::Error) -> bool {
170    let Some(code) = err.raw_os_error() else {
171        return false;
172    };
173    // EXDEV: cross-device — the two paths live on different filesystems.
174    // EOPNOTSUPP / ENOTSUP: filesystem doesn't implement reflinks
175    //    (e.g. ext4 on Linux, HFS+ on macOS). On Linux these two are
176    //    aliases (both = 95) so listing both makes one branch
177    //    unreachable; on macOS they're distinct (102 vs 45), so we need
178    //    both to be matched. `#[allow(unreachable_patterns)]` keeps the
179    //    portable spelling without a `cfg`-split.
180    // ENOSYS: kernel too old to know the syscall.
181    // EINVAL: FICLONE returns this when the src/dst aren't on the same
182    //    filesystem on some kernels, or when the filesystem is mounted
183    //    without reflink support.
184    #[allow(unreachable_patterns)]
185    let is_unsupported = matches!(
186        code,
187        libc::EXDEV | libc::EOPNOTSUPP | libc::ENOTSUP | libc::ENOSYS | libc::EINVAL
188    );
189    is_unsupported
190}
191
192/// Test whether the filesystem at `parent_dir` supports reflinks by
193/// trying one against a temp source/dest pair. Returns `true` on
194/// success. Useful for tests that want to soft-skip on filesystems
195/// without CoW support, and for any caller that wants a runtime
196/// capability check before asserting on reflink-specific properties.
197pub fn filesystem_supports_reflink(parent_dir: &Path) -> bool {
198    use std::io::Write;
199
200    let src = parent_dir.join(".heddle-reflink-probe-src");
201    let dst = parent_dir.join(".heddle-reflink-probe-dst");
202    let _ = fs::remove_file(&src);
203    let _ = fs::remove_file(&dst);
204
205    let mut f = match fs::File::create(&src) {
206        Ok(f) => f,
207        Err(_) => return false,
208    };
209    if f.write_all(b"reflink-probe").is_err() {
210        let _ = fs::remove_file(&src);
211        return false;
212    }
213    drop(f);
214
215    let supported = matches!(try_reflink(&src, &dst), Ok(true));
216    let _ = fs::remove_file(&src);
217    let _ = fs::remove_file(&dst);
218    supported
219}
220
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use tempfile::TempDir;

    use super::*;

    /// Build a scratch directory together with `src.txt` / `dst.txt`
    /// paths inside it. Neither file is created. The `TempDir` handle
    /// is returned so the caller keeps it alive for the whole test.
    fn scratch() -> (TempDir, PathBuf, PathBuf) {
        let dir = TempDir::new().unwrap();
        let src = dir.path().join("src.txt");
        let dst = dir.path().join("dst.txt");
        (dir, src, dst)
    }

    /// A fresh destination ends up holding the source's bytes,
    /// whichever strategy (reflink or copy) was used.
    #[test]
    fn clonefile_or_copy_creates_destination_with_source_bytes() {
        let (_dir, src, dst) = scratch();
        fs::write(&src, b"hello reflink").unwrap();

        clonefile_or_copy(&src, &dst).unwrap();

        assert_eq!(fs::read(&dst).unwrap(), b"hello reflink");
    }

    /// A destination that already exists is replaced by the source's
    /// content.
    #[test]
    fn clonefile_or_copy_overwrites_existing_destination() {
        let (_dir, src, dst) = scratch();
        fs::write(&src, b"new content").unwrap();
        fs::write(&dst, b"old content").unwrap();

        clonefile_or_copy(&src, &dst).unwrap();

        assert_eq!(fs::read(&dst).unwrap(), b"new content");
    }

    /// Core isolation property: a write to the destination must leave
    /// the source's bytes alone. A real CoW clone forks blocks on the
    /// first write, and the `fs::copy` fallback produces an
    /// independent file from the start, so the source must be
    /// untouched in both cases.
    #[test]
    fn writing_to_destination_does_not_mutate_source() {
        let (_dir, src, dst) = scratch();
        fs::write(&src, b"original source").unwrap();

        clonefile_or_copy(&src, &dst).unwrap();
        fs::write(&dst, b"mutated dest").unwrap();

        assert_eq!(fs::read(&src).unwrap(), b"original source");
        assert_eq!(fs::read(&dst).unwrap(), b"mutated dest");
    }

    /// Unlike a hardlink, a reflinked destination gets its own inode.
    /// On a CoW filesystem that is the key correctness distinction:
    /// agents can chmod or write in place without reaching across
    /// worktrees. Soft-skips when the temp filesystem cannot reflink.
    #[cfg(unix)]
    #[test]
    fn successful_reflink_yields_distinct_inode() {
        use std::os::unix::fs::MetadataExt;

        let (dir, src, dst) = scratch();
        if !filesystem_supports_reflink(dir.path()) {
            eprintln!(
                "[skip] filesystem at {:?} does not support reflinks; cannot assert inode property",
                dir.path()
            );
            return;
        }

        fs::write(&src, b"reflink inode test").unwrap();

        let did_reflink = try_reflink(&src, &dst).unwrap();
        assert!(did_reflink, "filesystem advertised reflink support");

        let inode_of = |path: &Path| fs::metadata(path).unwrap().ino();
        let src_inode = inode_of(&src);
        let dst_inode = inode_of(&dst);
        assert_ne!(
            src_inode, dst_inode,
            "reflinked files must have distinct inodes (got {} for both)",
            src_inode
        );
    }
}