Skip to main content

microsandbox_utils/
copy.rs

1//! Sparse-aware fast copy with reflink fallback.
2//!
3//! Two-tier strategy that preserves sparseness on every supported
4//! platform:
5//!
6//! 1. **Reflink** (zero-copy COW). Tries `clonefile(2)` on macOS and
7//!    `ioctl(FICLONE)` on Linux via `reflink-copy`. Succeeds instantly
8//!    on APFS, btrfs, XFS (with `reflink=1`), and bcachefs. Returns
9//!    `EOPNOTSUPP` (or similar) on ext4 and other non-COW filesystems.
10//!
11//! 2. **Sparse-aware copy**. POSIX `SEEK_DATA` / `SEEK_HOLE` walk of
12//!    the source's allocation map, with `copy_file_range(2)` on Linux
13//!    for in-kernel zero-copy of data extents. The destination is
14//!    `ftruncate`d to the source size up front so unallocated regions
15//!    stay holes.
16//!
17//! Never falls back to a naive byte-for-byte copy — that would
18//! densify a 4 GiB sparse file with a few MB of data into 4 GiB on
19//! disk, which is the exact failure mode this module exists to
20//! prevent.
21//!
22//! See `planning/microsandbox/implementation/snapshots.md` for the
23//! full design and tradeoffs.
24
25use std::fs::{File, OpenOptions};
26use std::io;
27#[cfg(windows)]
28use std::io::{Read, Seek, SeekFrom, Write};
29#[cfg(unix)]
30use std::os::unix::io::{AsRawFd, RawFd};
31#[cfg(windows)]
32use std::os::windows::io::AsRawHandle;
33use std::path::Path;
34#[cfg(windows)]
35use std::ptr;
36
37#[cfg(windows)]
38use windows_sys::Win32::Foundation::HANDLE;
39#[cfg(windows)]
40use windows_sys::Win32::System::IO::DeviceIoControl;
41#[cfg(windows)]
42use windows_sys::Win32::System::Ioctl::FSCTL_SET_SPARSE;
43
44//--------------------------------------------------------------------------------------------------
45// Functions
46//--------------------------------------------------------------------------------------------------
47
48/// Copy `src` to `dst`, preserving sparseness. Returns the apparent
49/// size of the destination in bytes.
50///
51/// Tries reflink first (zero-copy COW); on filesystems without reflink
52/// support, walks the source's allocation map and copies only its
53/// data extents into a `ftruncate`-established sparse destination.
54///
55/// **Blocking.** Callers in async contexts should wrap in
56/// `tokio::task::spawn_blocking`.
57pub fn fast_copy(src: &Path, dst: &Path) -> io::Result<u64> {
58    // Stat the source up front. This makes the missing-source error
59    // kind platform-consistent (`NotFound` everywhere); without it,
60    // reflink-copy on Linux surfaces `InvalidInput` with no errno
61    // for a non-existent path, which our `is_reflink_unsupported`
62    // check can't recognize as a fall-through.
63    let src_len = std::fs::metadata(src)?.len();
64
65    // Tier 1: reflink. Errors on unsupported FSes; we fall through to
66    // Tier 2. We do NOT use `reflink_or_copy`, which densifies on
67    // fallback via `std::fs::copy`.
68    match reflink_copy::reflink(src, dst) {
69        Ok(()) => return Ok(src_len),
70        Err(e) if is_reflink_unsupported(&e) => {
71            // fall through to sparse copy
72        }
73        Err(e) => return Err(e),
74    }
75
76    sparse_copy(src, dst)
77}
78
79/// Sparse-aware copy via `SEEK_DATA`/`SEEK_HOLE` and per-extent copy.
80///
81/// Public for callers that want to skip the reflink attempt — e.g.
82/// when they already know the destination filesystem doesn't support
83/// reflinks, or for tests that want to exercise the fallback path.
84pub fn sparse_copy(src: &Path, dst: &Path) -> io::Result<u64> {
85    sparse_copy_impl(src, dst)
86}
87
88#[cfg(unix)]
89fn sparse_copy_impl(src: &Path, dst: &Path) -> io::Result<u64> {
90    let src_file = File::open(src)?;
91    let len = src_file.metadata()?.len();
92
93    let dst_file = OpenOptions::new()
94        .read(true)
95        .write(true)
96        .create(true)
97        .truncate(true)
98        .open(dst)?;
99    // Establish destination as a fully-sparse hole of `len` bytes;
100    // only data extents will materialize into allocated blocks below.
101    dst_file.set_len(len)?;
102
103    let src_fd = src_file.as_raw_fd();
104    let dst_fd = dst_file.as_raw_fd();
105
106    let mut off: i64 = 0;
107    while (off as u64) < len {
108        // Find next data extent.
109        let data_start = unsafe { libc::lseek(src_fd, off, libc::SEEK_DATA) };
110        if data_start < 0 {
111            let err = io::Error::last_os_error();
112            // ENXIO: no more data past this offset → done.
113            if err.raw_os_error() == Some(libc::ENXIO) {
114                break;
115            }
116            return Err(err);
117        }
118        // Find the end of that extent (start of next hole, or EOF).
119        let data_end = unsafe { libc::lseek(src_fd, data_start, libc::SEEK_HOLE) };
120        if data_end < 0 {
121            return Err(io::Error::last_os_error());
122        }
123        let data_end = (data_end as u64).min(len);
124        let data_start = data_start as u64;
125        if data_end <= data_start {
126            break;
127        }
128
129        copy_extent(src_fd, dst_fd, data_start, data_end - data_start)?;
130        off = data_end as i64;
131    }
132
133    dst_file.sync_all()?;
134    Ok(len)
135}
136
/// Windows implementation of the sparse copy.
///
/// Opens `dst` (created or truncated), flags it as sparse, sizes it to
/// the source length, then streams the source in 1 MiB chunks and
/// writes only the non-zero runs of each chunk at their original
/// offsets. The destination is flushed to disk before returning the
/// source's apparent length.
///
/// # Errors
///
/// Propagates any I/O error from opening, sizing, reading or writing
/// the files, and the error from `mark_sparse` when the sparse flag
/// cannot be set on the destination.
#[cfg(windows)]
fn sparse_copy_impl(src: &Path, dst: &Path) -> io::Result<u64> {
    const BUF_SIZE: usize = 1024 * 1024;

    let mut src_file = File::open(src)?;
    let len = src_file.metadata()?.len();

    let mut dst_file = OpenOptions::new()
        .read(true)
        .write(true)
        .create(true)
        .truncate(true)
        .open(dst)?;
    // The sparse flag must be set while the file is still empty:
    // extending a non-sparse file allocates clusters for the whole new
    // length, and setting the flag afterwards does not release them,
    // which would leave the destination fully dense.
    mark_sparse(&dst_file)?;
    dst_file.set_len(len)?;

    let mut offset = 0u64;
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = src_file.read(&mut buf)?;
        if n == 0 {
            break;
        }

        // Zero regions are skipped so they stay unallocated in `dst`.
        write_nonzero_runs(&mut dst_file, offset, &buf[..n])?;
        offset += n as u64;
    }

    dst_file.sync_all()?;
    Ok(len)
}
168
169//--------------------------------------------------------------------------------------------------
170// Functions: Helpers
171//--------------------------------------------------------------------------------------------------
172
173/// Reflink can fail with several different errnos depending on the
174/// filesystem and platform. Treat them all as "fall through to Tier 2"
175/// rather than propagating to the caller.
176///
177/// On Linux `ENOTSUP == EOPNOTSUPP`, so a single arm covers both;
178/// macOS / BSDs assign them distinct values and need both arms.
179fn is_reflink_unsupported(e: &io::Error) -> bool {
180    if matches!(e.kind(), io::ErrorKind::Unsupported) {
181        return true;
182    }
183
184    let Some(code) = e.raw_os_error() else {
185        return false;
186    };
187
188    #[cfg(target_os = "linux")]
189    let aliases: &[i32] = &[libc::ENOTSUP, libc::EXDEV, libc::EINVAL];
190    #[cfg(all(unix, not(target_os = "linux")))]
191    let aliases: &[i32] = &[libc::ENOTSUP, libc::EOPNOTSUPP, libc::EXDEV, libc::EINVAL];
192    #[cfg(windows)]
193    let aliases: &[i32] = &[
194        1,  // ERROR_INVALID_FUNCTION
195        17, // ERROR_NOT_SAME_DEVICE
196        50, // ERROR_NOT_SUPPORTED
197        87, // ERROR_INVALID_PARAMETER
198    ];
199
200    #[cfg(windows)]
201    {
202        let win32_code = (code as u32 & 0xffff) as i32;
203        return aliases.contains(&code) || aliases.contains(&win32_code);
204    }
205
206    #[cfg(unix)]
207    aliases.contains(&code)
208}
209
210#[cfg(target_os = "linux")]
211fn copy_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
212    let mut src_off = off as i64;
213    let mut dst_off = off as i64;
214    let mut remaining = len;
215
216    while remaining > 0 {
217        let chunk = remaining.min(usize::MAX as u64 / 2) as usize;
218        let n =
219            unsafe { libc::copy_file_range(src_fd, &mut src_off, dst_fd, &mut dst_off, chunk, 0) };
220        if n < 0 {
221            let err = io::Error::last_os_error();
222            // copy_file_range may not be supported on every kernel/FS
223            // combination (notably across-FS prior to 5.3, or older
224            // kernels). Fall back to pread/pwrite for the remainder of
225            // this extent.
226            if matches!(
227                err.raw_os_error(),
228                Some(libc::ENOSYS)
229                    | Some(libc::EXDEV)
230                    | Some(libc::EINVAL)
231                    | Some(libc::EOPNOTSUPP)
232            ) {
233                let consumed = len - remaining;
234                return read_write_extent(src_fd, dst_fd, off + consumed, remaining);
235            }
236            return Err(err);
237        }
238        if n == 0 {
239            // EOF — should not happen for a valid extent, but guard.
240            break;
241        }
242        remaining -= n as u64;
243    }
244    Ok(())
245}
246
/// Copies `len` bytes starting at `off` from `src_fd` to the same
/// offset in `dst_fd`.
///
/// On Unix targets other than Linux this goes straight to the
/// `pread`/`pwrite` loop in `read_write_extent`.
#[cfg(all(unix, not(target_os = "linux")))]
fn copy_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
    // Same signature as the Linux variant, so the caller is unchanged.
    read_write_extent(src_fd, dst_fd, off, len)
}
251
252/// Copy `len` bytes from `src_fd` at `off` to `dst_fd` at `off` using
253/// `pread`/`pwrite`. Universal fallback for `copy_extent` on platforms
254/// or filesystems where `copy_file_range` doesn't apply.
255#[cfg(unix)]
256fn read_write_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
257    const BUF_SIZE: usize = 64 * 1024;
258    let mut buf = [0u8; BUF_SIZE];
259    let mut copied: u64 = 0;
260
261    while copied < len {
262        let to_read = (len - copied).min(BUF_SIZE as u64) as usize;
263        let read_off = (off + copied) as i64;
264        let n = unsafe {
265            libc::pread(
266                src_fd,
267                buf.as_mut_ptr() as *mut libc::c_void,
268                to_read,
269                read_off,
270            )
271        };
272        if n < 0 {
273            return Err(io::Error::last_os_error());
274        }
275        if n == 0 {
276            return Err(io::Error::new(
277                io::ErrorKind::UnexpectedEof,
278                "unexpected EOF mid-extent",
279            ));
280        }
281        let n = n as usize;
282
283        let mut written: usize = 0;
284        while written < n {
285            let w_off = (off + copied + written as u64) as i64;
286            let w = unsafe {
287                libc::pwrite(
288                    dst_fd,
289                    buf[written..n].as_ptr() as *const libc::c_void,
290                    n - written,
291                    w_off,
292                )
293            };
294            if w < 0 {
295                return Err(io::Error::last_os_error());
296            }
297            if w == 0 {
298                return Err(io::Error::new(
299                    io::ErrorKind::WriteZero,
300                    "pwrite returned 0",
301                ));
302            }
303            written += w as usize;
304        }
305        copied += n as u64;
306    }
307    Ok(())
308}
309
310#[cfg(windows)]
311fn mark_sparse(file: &File) -> io::Result<()> {
312    let mut bytes_returned = 0;
313    let ok = unsafe {
314        DeviceIoControl(
315            file.as_raw_handle() as HANDLE,
316            FSCTL_SET_SPARSE,
317            ptr::null(),
318            0,
319            ptr::null_mut(),
320            0,
321            &mut bytes_returned,
322            ptr::null_mut(),
323        )
324    };
325    if ok == 0 {
326        return Err(io::Error::last_os_error());
327    }
328
329    Ok(())
330}
331
/// Writes the data-bearing parts of `bytes` into `dst`, where
/// `bytes[0]` corresponds to file offset `base_offset`.
///
/// Leading and trailing zeros, and any interior run of at least
/// `MIN_HOLE_LEN` zero bytes, are skipped and never written. Shorter
/// interior zero runs are written together with the data around them.
/// Splitting on every zero byte would turn data with scattered zeros
/// into one seek + write pair per few bytes.
///
/// The bytes that are written are always the exact source bytes. The
/// skipped ranges are assumed to already read as zero in `dst` — the
/// caller in this file sizes a freshly truncated file before calling.
///
/// # Errors
///
/// Propagates any error from seeking or writing `dst`.
#[cfg(windows)]
fn write_nonzero_runs(dst: &mut File, base_offset: u64, bytes: &[u8]) -> io::Result<()> {
    // NOTE(review): 4 KiB is a guess at the smallest gap worth skipping
    // (a common cluster size) — tune against the target filesystem.
    const MIN_HOLE_LEN: usize = 4096;

    let mut cursor = 0;
    while cursor < bytes.len() {
        // Skip zeros in front of the next run; all-zero tail means done.
        let Some(skip) = bytes[cursor..].iter().position(|&b| b != 0) else {
            break;
        };
        let start = cursor + skip;

        // Grow the run: consume data bytes, then look at most
        // `MIN_HOLE_LEN` bytes ahead to see whether the zero gap is
        // short enough to absorb.
        let mut end = start;
        loop {
            end += bytes[end..]
                .iter()
                .position(|&b| b == 0)
                .unwrap_or(bytes.len() - end);

            let window_end = bytes.len().min(end + MIN_HOLE_LEN);
            match bytes[end..window_end].iter().position(|&b| b != 0) {
                // Short gap: jump to the next data byte and keep going.
                Some(gap) => end += gap,
                // Long gap or end of chunk: the run stops at `end`.
                None => break,
            }
        }

        dst.seek(SeekFrom::Start(base_offset + start as u64))?;
        dst.write_all(&bytes[start..end])?;
        cursor = end;
    }

    Ok(())
}
354
355//--------------------------------------------------------------------------------------------------
356// Tests
357//--------------------------------------------------------------------------------------------------
358
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Read, Seek, SeekFrom, Write};
    #[cfg(unix)]
    use std::os::unix::fs::MetadataExt;

    /// Creates (or truncates) `path` with apparent size `len` and writes
    /// a 64 KiB block of `0xAB` bytes at every offset in `data_offsets`.
    fn make_sparse(path: &Path, len: u64, data_offsets: &[u64]) -> io::Result<()> {
        let mut file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(path)?;
        file.set_len(len)?;

        // The payload is the same for every extent; build it once.
        let payload = vec![0xAB_u8; 64 * 1024];
        for &offset in data_offsets {
            file.seek(SeekFrom::Start(offset))?;
            file.write_all(&payload)?;
        }
        file.sync_all()
    }

    /// A tiny dense file survives `fast_copy` byte for byte.
    #[test]
    fn round_trip_small() {
        let scratch = tempfile::tempdir().unwrap();
        let source = scratch.path().join("src.bin");
        let target = scratch.path().join("dst.bin");

        std::fs::write(&source, b"hello world").unwrap();

        let reported = fast_copy(&source, &target).unwrap();
        assert_eq!(reported, 11);
        assert_eq!(std::fs::read(&target).unwrap(), b"hello world");
    }

    /// Tier 2 keeps both the data extents and (where the filesystem
    /// allows it) the holes of a 16 MiB file with four data extents.
    #[test]
    fn sparse_copy_preserves_holes_and_data() {
        // `sparse_copy` is called directly so the fallback path runs no
        // matter which filesystem hosts the temp directory.
        let scratch = tempfile::tempdir().unwrap();
        let src = scratch.path().join("src.bin");
        let dst = scratch.path().join("dst.bin");

        let len: u64 = 16 * 1024 * 1024;
        let offsets = [0u64, 4 * 1024 * 1024, 8 * 1024 * 1024, 12 * 1024 * 1024];
        make_sparse(&src, len, &offsets).unwrap();

        let reported = sparse_copy(&src, &dst).unwrap();
        assert_eq!(reported, len);

        // The destination has the same apparent size as the source.
        let dst_meta = std::fs::metadata(&dst).unwrap();
        assert_eq!(dst_meta.len(), len);

        // Every data extent reads back as the pattern that was written.
        let mut copied = File::open(&dst).unwrap();
        let mut chunk = [0u8; 64 * 1024];
        for &offset in &offsets {
            copied.seek(SeekFrom::Start(offset)).unwrap();
            copied.read_exact(&mut chunk).unwrap();
            assert!(chunk.iter().all(|byte| *byte == 0xAB));
        }

        // Sparseness can only be asserted when the source itself ended
        // up sparse. Some test hosts (FAT, certain APFS configurations
        // under tempfile mounts) materialize the whole file after
        // `ftruncate + pwrite`; there we only check that the destination
        // did not outgrow the source's footprint.
        #[cfg(unix)]
        {
            let src_bytes_on_disk = std::fs::metadata(&src).unwrap().blocks() * 512;
            let dst_bytes_on_disk = dst_meta.blocks() * 512;
            let source_is_sparse = src_bytes_on_disk < len / 2;

            if source_is_sparse {
                // This is the regression test the module depends on: a
                // sparse source must yield a sparse destination.
                assert!(
                    dst_bytes_on_disk < len / 2,
                    "source is sparse ({src_bytes_on_disk} bytes on disk) but destination densified to {dst_bytes_on_disk} bytes for an apparent size of {len}",
                );
                assert!(
                    dst_bytes_on_disk <= src_bytes_on_disk * 4 + 1024 * 1024,
                    "destination allocated significantly more than source: src={src_bytes_on_disk} dst={dst_bytes_on_disk}",
                );
            } else {
                eprintln!(
                    "filesystem did not sparsify the source (src_bytes_on_disk={src_bytes_on_disk}, apparent={len}); sparseness preservation not exercised in this run",
                );
                // With a dense source the copy should still stay close
                // to the source's footprint; catch gross regressions.
                assert!(
                    dst_bytes_on_disk <= src_bytes_on_disk + 1024 * 1024,
                    "destination grew beyond source footprint: src={src_bytes_on_disk} dst={dst_bytes_on_disk}",
                );
            }
        }
    }

    /// `fast_copy` reports, and produces, the source's apparent size.
    #[test]
    fn fast_copy_matches_source_size() {
        let scratch = tempfile::tempdir().unwrap();
        let source = scratch.path().join("src.bin");
        let target = scratch.path().join("dst.bin");

        let len: u64 = 4 * 1024 * 1024;
        make_sparse(&source, len, &[0, 2 * 1024 * 1024]).unwrap();

        let reported = fast_copy(&source, &target).unwrap();
        assert_eq!(reported, len);

        let copied_len = std::fs::metadata(&target).unwrap().len();
        assert_eq!(copied_len, len);
    }

    /// A non-existent source surfaces as `NotFound`.
    #[test]
    fn missing_source_errors() {
        let scratch = tempfile::tempdir().unwrap();
        let absent = scratch.path().join("nope.bin");
        let target = scratch.path().join("dst.bin");

        let failure = fast_copy(&absent, &target).unwrap_err();
        assert_eq!(failure.kind(), io::ErrorKind::NotFound);
    }
}