objects/fs_clone.rs
1// SPDX-License-Identifier: Apache-2.0
2//! Filesystem-level copy-on-write helpers.
3//!
4//! Heddle's worktree materializer needs the storage win of pointing
5//! N worktrees at the same blob bytes (so checking out the same state
6//! to many sibling worktrees costs ~1× disk, not N×) **without** the
7//! mutation hazard that hardlinks bring. With hardlinks, an in-place
8//! write — `chmod +w file && echo new > file`, `O_TRUNC`, etc. —
9//! mutates the shared inode, corrupting every other worktree that
10//! points at the same blob.
11//!
12//! Filesystem reflinks (a.k.a. CoW clones) solve this: the destination
13//! starts out sharing physical blocks with the source, but the first
14//! write to either side automatically forks the underlying allocation.
15//! The OS guarantees isolation even if an agent strips the read-only
16//! bit and overwrites the file in place.
17//!
18//! Platform support:
19//! - **macOS / APFS:** `clonefile(2)` from `<sys/clonefile.h>`. True CoW.
20//! - **Linux / btrfs / XFS-with-reflinks / ZFS:** `ioctl(dest_fd, FICLONE, src_fd)`.
21//! - **Anywhere else** (or when reflink isn't supported by the
22//! underlying filesystem): caller falls back to a real copy.
23//!
24//! The core [`try_reflink`] returns a [`ReflinkOutcome`] so the caller
25//! can tell three genuinely-different situations apart: a successful
26//! clone, a "this filesystem can't reflink" verdict (batch-wide signal
27//! to stop trying), and a "the source vanished from under us" race
28//! (a per-blob fallback that must NOT poison the batch). Overloading the
29//! last two — as a bare `Ok(false)` did — makes one concurrently-pruned
30//! loose mirror needlessly disable reflinks for every remaining blob.
31
32use std::{fs, io, path::Path};
33
/// Result of a single reflink attempt that did not hit a hard I/O error.
///
/// The variants are deliberately kept apart so a caller materializing many
/// blobs can tell a capability verdict ("reflinks don't work here", worth
/// remembering for the rest of the batch) from a per-blob race ("this one
/// source disappeared", which says nothing about the filesystem).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReflinkOutcome {
    /// The copy-on-write clone succeeded: `dest` now exists and shares
    /// physical blocks with `source` until either side is written.
    Cloned,
    /// The kernel reported that a reflink is not possible for this
    /// filesystem or this source/destination pairing (`EXDEV`,
    /// `EOPNOTSUPP`, `ENOTSUP`, `ENOSYS`, `EINVAL`). Because this reflects
    /// the filesystem (or, for `EXDEV`, where source and destination live
    /// relative to each other) rather than the individual blob, a caller
    /// working through a batch MAY disable reflinks for the remainder and
    /// go straight to copy/write.
    Unsupported,
    /// The `source` was not there when we looked, or disappeared between
    /// the pre-check and the clone call (concurrent prune / torn NoSync
    /// promote). Reflink is only an optimization, so the caller should
    /// degrade to a real copy / bytes-write for THIS blob only and keep
    /// reflinks enabled for the rest of the batch — the filesystem itself
    /// may be perfectly capable. A blob that is genuinely absent (not just
    /// unreflinkable) still surfaces downstream when the copy/write
    /// fallback can't find its bytes.
    SourceVanished,
}
57
58/// Try a filesystem-level reflink (copy-on-write clone) from `source`
59/// to `dest`. On success the destination has its own inode and shares
60/// physical blocks with the source until either side is modified.
61///
62/// On a successful reflink: returns `Ok(ReflinkOutcome::Cloned)`. The
63/// destination file has been created with the kernel's choice of
64/// permissions (typically the source's). Callers should
65/// `set_permissions` afterwards if they need a specific mode.
66///
67/// On a "filesystem doesn't support reflinks" verdict (`EXDEV`,
68/// `EOPNOTSUPP`, `ENOTSUP`, `ENOSYS`, `EINVAL` from the ioctl form):
69/// returns `Ok(ReflinkOutcome::Unsupported)`. The caller should fall
70/// back to `fs::copy` and may skip future reflink attempts on this
71/// filesystem.
72///
73/// When the `source` is gone (missing at the pre-check, or `ENOENT`
74/// from the syscall in the TOCTOU window after it): returns
75/// `Ok(ReflinkOutcome::SourceVanished)`. The caller should fall back
76/// to a copy/bytes-write for this blob only and keep reflinks enabled
77/// for the rest of the batch — a vanished mirror says nothing about
78/// the filesystem's reflink capability.
79///
80/// On any other I/O error: returns `Err`.
81///
82/// `dest` must not already exist on macOS (`clonefile` requires a
83/// nonexistent destination). On Linux `FICLONE` requires the dest fd
84/// be opened for writing on a regular file, which we create with
85/// `O_CREAT | O_WRONLY | O_TRUNC`.
86pub fn try_reflink(source: &Path, dest: &Path) -> io::Result<ReflinkOutcome> {
87 // Never hand `clonefile`/`FICLONE` a source that isn't there: a missing
88 // source is reported as ENOENT, which `reflink_unsupported` deliberately
89 // does NOT swallow (ENOENT is a genuinely-missing file, not "reflink
90 // unsupported"), so it would hard-error. Reflink is only an optimization —
91 // a vanished loose mirror (concurrent prune / torn promote) must degrade to
92 // the caller's copy/bytes-write fallback, not crash. This is reported as
93 // `SourceVanished` (NOT `Unsupported`) so a single pruned blob doesn't
94 // disable reflinks for the whole batch. This guard is what stopped `heddle
95 // start` from failing on macOS/APFS with `conflict: No such file or
96 // directory` (heddle#571). A genuinely-missing blob still errors loudly
97 // downstream — `get_blob` returns `NotFound` with the hash when the copy
98 // fallback also can't find the bytes.
99 if !source.exists() {
100 return Ok(ReflinkOutcome::SourceVanished);
101 }
102 #[cfg(target_os = "macos")]
103 {
104 try_clonefile_macos(source, dest)
105 }
106 #[cfg(target_os = "linux")]
107 {
108 try_ficlone_linux(source, dest)
109 }
110 #[cfg(not(any(target_os = "macos", target_os = "linux")))]
111 {
112 let _ = (source, dest);
113 Ok(ReflinkOutcome::Unsupported)
114 }
115}
116
117/// Reflink if possible, otherwise fall back to a real copy. Returns
118/// the same `Ok(true)/Ok(false)` discriminator as [`try_reflink`] —
119/// `true` when the OS gave us a CoW clone, `false` when we paid the
120/// full copy cost. Either way, on `Ok` the destination exists and has
121/// the source's bytes.
122///
123/// The destination's permission bits are not normalized here. Callers
124/// that need a specific mode (`0o644`, `0o755`) should call
125/// `fs::set_permissions` after a successful return.
126pub fn clonefile_or_copy(source: &Path, dest: &Path) -> io::Result<bool> {
127 // `clonefile`/FICLONE require dest not to exist; remove any stale
128 // entry first. Ignored if dest doesn't exist.
129 let _ = fs::remove_file(dest);
130 if matches!(try_reflink(source, dest)?, ReflinkOutcome::Cloned) {
131 return Ok(true);
132 }
133 fs::copy(source, dest)?;
134 Ok(false)
135}
136
/// macOS backend for [`try_reflink`]: clone `source` to `dest` with
/// `clonefile(2)`.
///
/// Returns `Ok(ReflinkOutcome::Cloned)` when the call returns 0. Any
/// failure is captured from `errno` and handed to `classify_clone_err`.
/// A path containing an interior NUL byte is rejected up front with
/// `io::ErrorKind::InvalidInput`.
#[cfg(target_os = "macos")]
fn try_clonefile_macos(source: &Path, dest: &Path) -> io::Result<ReflinkOutcome> {
    use std::{ffi::CString, os::unix::ffi::OsStrExt};

    // SAFETY: declares the system `clonefile(2)` symbol with the signature
    // from `<sys/clonefile.h>`:
    //   int clonefile(const char *src, const char *dst, uint32_t flags);
    unsafe extern "C" {
        fn clonefile(src: *const libc::c_char, dst: *const libc::c_char, flags: u32)
        -> libc::c_int;
    }

    // Both paths go through the same conversion; only the message differs.
    let to_c_path = |path: &Path, nul_msg: &'static str| {
        CString::new(path.as_os_str().as_bytes())
            .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, nul_msg))
    };
    let src_c = to_c_path(source, "source path contains interior NUL")?;
    let dst_c = to_c_path(dest, "destination path contains interior NUL")?;

    // SAFETY: both pointers come from local `CString`s that outlive the
    // call, so they are valid NUL-terminated C strings. `flags = 0` asks
    // for the default behavior; per clonefile(2), `CLONE_NOFOLLOW` is not
    // set, so a `source` that is a symlink is followed rather than cloned
    // as a link.
    match unsafe { clonefile(src_c.as_ptr(), dst_c.as_ptr(), 0) } {
        0 => Ok(ReflinkOutcome::Cloned),
        // errno is read immediately, before anything else can overwrite it.
        _ => classify_clone_err(source, io::Error::last_os_error()),
    }
}
173
174#[cfg(target_os = "linux")]
175fn try_ficlone_linux(source: &Path, dest: &Path) -> io::Result<ReflinkOutcome> {
176 use std::{fs::OpenOptions, os::unix::io::AsRawFd};
177
178 // FICLONE = _IOW(0x94, 9, int) on Linux. The kernel header
179 // `<linux/fs.h>` (and `<linux/fs.h>` UAPI) define this as
180 // 0x40049409 = (1 << 30) | (4 << 16) | (0x94 << 8) | 9
181 // i.e. _IOC_WRITE | sizeof(int) | type=0x94 | nr=9.
182 const FICLONE: libc::c_ulong = 0x4004_9409;
183
184 // Opening the source can race a concurrent prune: the pre-check in
185 // `try_reflink` saw it, but it can vanish before this open. Map that
186 // to `SourceVanished` so the caller degrades per-blob rather than
187 // disabling reflinks for the batch (or hard-erroring).
188 let src = match OpenOptions::new().read(true).open(source) {
189 Ok(f) => f,
190 Err(err) => return classify_clone_err(source, err),
191 };
192 let dst = OpenOptions::new()
193 .write(true)
194 .create(true)
195 .truncate(true)
196 .open(dest)?;
197
198 // SAFETY: ioctl with two valid fds; FICLONE expects an `int` fd
199 // as the third arg.
200 let rc = unsafe { libc::ioctl(dst.as_raw_fd(), FICLONE, src.as_raw_fd()) };
201 if rc == 0 {
202 return Ok(ReflinkOutcome::Cloned);
203 }
204
205 let err = io::Error::last_os_error();
206 // Clean up the empty dest we just created so the caller's
207 // `fs::copy` fallback starts from a known state.
208 drop(dst);
209 let _ = fs::remove_file(dest);
210 classify_clone_err(source, err)
211}
212
213/// Classify a clonefile/FICLONE (or source-open) failure into the
214/// caller-meaningful [`ReflinkOutcome`] or a genuine error.
215///
216/// * `Unsupported` — the filesystem (or src/dst pair) can't reflink
217/// (`reflink_unsupported`). A batch-wide property.
218/// * `SourceVanished` — the failure is `ENOENT` and the source is in
219/// fact gone now (concurrent prune / torn promote in the TOCTOU
220/// window after the pre-check). A per-blob race; reflinks stay viable
221/// for the rest of the batch. An `ENOENT` whose source still exists
222/// (e.g. a missing dest parent) is NOT swallowed here — it surfaces
223/// as an `Err` for the caller to attribute correctly.
224/// * `Err` — anything else; the caller should surface it.
225#[cfg(any(target_os = "macos", target_os = "linux"))]
226fn classify_clone_err(source: &Path, err: io::Error) -> io::Result<ReflinkOutcome> {
227 if reflink_unsupported(&err) {
228 Ok(ReflinkOutcome::Unsupported)
229 } else if err.kind() == io::ErrorKind::NotFound && !source.exists() {
230 Ok(ReflinkOutcome::SourceVanished)
231 } else {
232 Err(err)
233 }
234}
235
236/// Decide whether a clonefile/FICLONE error means "this filesystem
237/// (or this src/dst pair) won't ever reflink" vs a transient or
238/// caller-bug failure that we should surface.
239#[cfg(any(target_os = "macos", target_os = "linux"))]
240fn reflink_unsupported(err: &io::Error) -> bool {
241 let Some(code) = err.raw_os_error() else {
242 return false;
243 };
244 // EXDEV: cross-device — the two paths live on different filesystems.
245 // EOPNOTSUPP / ENOTSUP: filesystem doesn't implement reflinks
246 // (e.g. ext4 on Linux, HFS+ on macOS). On Linux these two are
247 // aliases (both = 95) so listing both makes one branch
248 // unreachable; on macOS they're distinct (102 vs 45), so we need
249 // both to be matched. `#[allow(unreachable_patterns)]` keeps the
250 // portable spelling without a `cfg`-split.
251 // ENOSYS: kernel too old to know the syscall.
252 // EINVAL: FICLONE returns this when the src/dst aren't on the same
253 // filesystem on some kernels, or when the filesystem is mounted
254 // without reflink support.
255 #[allow(unreachable_patterns)]
256 let is_unsupported = matches!(
257 code,
258 libc::EXDEV | libc::EOPNOTSUPP | libc::ENOTSUP | libc::ENOSYS | libc::EINVAL
259 );
260 is_unsupported
261}
262
263/// Test whether the filesystem at `parent_dir` supports reflinks by
264/// trying one against a temp source/dest pair. Returns `true` on
265/// success. Useful for tests that want to soft-skip on filesystems
266/// without CoW support, and for any caller that wants a runtime
267/// capability check before asserting on reflink-specific properties.
268pub fn filesystem_supports_reflink(parent_dir: &Path) -> bool {
269 use std::io::Write;
270
271 let src = parent_dir.join(".heddle-reflink-probe-src");
272 let dst = parent_dir.join(".heddle-reflink-probe-dst");
273 let _ = fs::remove_file(&src);
274 let _ = fs::remove_file(&dst);
275
276 let mut f = match fs::File::create(&src) {
277 Ok(f) => f,
278 Err(_) => return false,
279 };
280 if f.write_all(b"reflink-probe").is_err() {
281 let _ = fs::remove_file(&src);
282 return false;
283 }
284 drop(f);
285
286 let supported = matches!(try_reflink(&src, &dst), Ok(ReflinkOutcome::Cloned));
287 let _ = fs::remove_file(&src);
288 let _ = fs::remove_file(&dst);
289 supported
290}
291
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;

    /// Shared fixture: a fresh temp dir plus the `src.txt` / `dst.txt`
    /// paths inside it. Neither file is created. The `TempDir` is returned
    /// so the directory stays alive for the duration of the test.
    fn scratch_pair() -> (TempDir, std::path::PathBuf, std::path::PathBuf) {
        let dir = TempDir::new().unwrap();
        let src = dir.path().join("src.txt");
        let dst = dir.path().join("dst.txt");
        (dir, src, dst)
    }

    /// heddle#571 (Bug 2): a reflink is only attempted when the source
    /// exists. A vanished loose mirror (concurrent prune / torn promote)
    /// has to degrade to the caller's copy/bytes-write fallback rather than
    /// hard-error with the ENOENT that `clonefile` raises on macOS (and
    /// that `reflink_unsupported` rightly refuses to swallow). The outcome
    /// must be `SourceVanished` — distinct from `Unsupported` — so a single
    /// pruned blob doesn't disable reflinks for the whole batch
    /// (heddle#571 r3). Runs on Linux too: the clone call is never reached.
    #[test]
    fn try_reflink_missing_source_reports_vanished_not_unsupported() {
        let dir = TempDir::new().unwrap();
        let src = dir.path().join("does-not-exist.txt");
        let dst = dir.path().join("dst.txt");
        assert!(!src.exists());

        let result = try_reflink(&src, &dst);

        assert!(
            matches!(result, Ok(ReflinkOutcome::SourceVanished)),
            "a missing reflink source must report SourceVanished (per-blob fallback, \
             NOT the batch-wide Unsupported), got {result:?}"
        );
        assert!(
            !dst.exists(),
            "no destination should be created when the source is missing"
        );
    }

    /// Whichever path is taken (clone or copy), the destination ends up
    /// with the source's bytes.
    #[test]
    fn clonefile_or_copy_creates_destination_with_source_bytes() {
        let (_dir, src, dst) = scratch_pair();
        fs::write(&src, b"hello reflink").unwrap();

        clonefile_or_copy(&src, &dst).unwrap();

        assert_eq!(fs::read(&dst).unwrap(), b"hello reflink");
    }

    /// A pre-existing destination is replaced, not an error.
    #[test]
    fn clonefile_or_copy_overwrites_existing_destination() {
        let (_dir, src, dst) = scratch_pair();
        fs::write(&src, b"new content").unwrap();
        fs::write(&dst, b"old content").unwrap();

        clonefile_or_copy(&src, &dst).unwrap();

        assert_eq!(fs::read(&dst).unwrap(), b"new content");
    }

    /// Core isolation property: writing to the materialized destination
    /// must leave the source's bytes alone. With a real CoW clone the
    /// kernel forks blocks on first write; with the `fs::copy` fallback the
    /// destination was a separate file all along. The source must be
    /// untouched in both cases.
    #[test]
    fn writing_to_destination_does_not_mutate_source() {
        let (_dir, src, dst) = scratch_pair();
        fs::write(&src, b"original source").unwrap();

        clonefile_or_copy(&src, &dst).unwrap();
        fs::write(&dst, b"mutated dest").unwrap();

        assert_eq!(fs::read(&src).unwrap(), b"original source");
        assert_eq!(fs::read(&dst).unwrap(), b"mutated dest");
    }

    /// Unlike a hardlink, a reflinked destination has its own inode. On a
    /// CoW filesystem this is the key correctness distinction: agents can
    /// chmod or write in place without reaching across worktrees. Skipped
    /// (with a note on stderr) when the temp dir's filesystem cannot
    /// reflink.
    #[cfg(unix)]
    #[test]
    fn successful_reflink_yields_distinct_inode() {
        use std::os::unix::fs::MetadataExt;

        let (temp, src, dst) = scratch_pair();
        if !filesystem_supports_reflink(temp.path()) {
            eprintln!(
                "[skip] filesystem at {:?} does not support reflinks; cannot assert inode property",
                temp.path()
            );
            return;
        }

        fs::write(&src, b"reflink inode test").unwrap();

        assert_eq!(
            try_reflink(&src, &dst).unwrap(),
            ReflinkOutcome::Cloned,
            "filesystem advertised reflink support"
        );

        let src_inode = fs::metadata(&src).unwrap().ino();
        let dst_inode = fs::metadata(&dst).unwrap().ino();
        assert_ne!(
            src_inode, dst_inode,
            "reflinked files must have distinct inodes (got {} for both)",
            src_inode
        );
    }
}