Skip to main content

microsandbox_utils/
copy.rs

1//! Sparse-aware fast copy with reflink fallback.
2//!
3//! Two-tier strategy that preserves sparseness on every supported
4//! platform:
5//!
6//! 1. **Reflink** (zero-copy COW). Tries `clonefile(2)` on macOS and
7//!    `ioctl(FICLONE)` on Linux via `reflink-copy`. Succeeds instantly
8//!    on APFS, btrfs, XFS (with `reflink=1`), and bcachefs. Returns
9//!    `EOPNOTSUPP` (or similar) on ext4 and other non-COW filesystems.
10//!
11//! 2. **Sparse-aware copy**. POSIX `SEEK_DATA` / `SEEK_HOLE` walk of
12//!    the source's allocation map, with `copy_file_range(2)` on Linux
13//!    for in-kernel zero-copy of data extents. The destination is
14//!    `ftruncate`d to the source size up front so unallocated regions
15//!    stay holes.
16//!
17//! Never falls back to a naive byte-for-byte copy — that would
18//! densify a 4 GiB sparse file with a few MB of data into 4 GiB on
19//! disk, which is the exact failure mode this module exists to
20//! prevent.
21//!
22//! See `planning/microsandbox/implementation/snapshots.md` for the
23//! full design and tradeoffs.
24
25use std::fs::{File, OpenOptions};
26use std::io;
27use std::os::unix::io::{AsRawFd, RawFd};
28use std::path::Path;
29
30//--------------------------------------------------------------------------------------------------
31// Functions
32//--------------------------------------------------------------------------------------------------
33
34/// Copy `src` to `dst`, preserving sparseness. Returns the apparent
35/// size of the destination in bytes.
36///
37/// Tries reflink first (zero-copy COW); on filesystems without reflink
38/// support, walks the source's allocation map and copies only its
39/// data extents into a `ftruncate`-established sparse destination.
40///
41/// **Blocking.** Callers in async contexts should wrap in
42/// `tokio::task::spawn_blocking`.
43pub fn fast_copy(src: &Path, dst: &Path) -> io::Result<u64> {
44    // Stat the source up front. This makes the missing-source error
45    // kind platform-consistent (`NotFound` everywhere); without it,
46    // reflink-copy on Linux surfaces `InvalidInput` with no errno
47    // for a non-existent path, which our `is_reflink_unsupported`
48    // check can't recognize as a fall-through.
49    let src_len = std::fs::metadata(src)?.len();
50
51    // Tier 1: reflink. Errors on unsupported FSes; we fall through to
52    // Tier 2. We do NOT use `reflink_or_copy`, which densifies on
53    // fallback via `std::fs::copy`.
54    match reflink_copy::reflink(src, dst) {
55        Ok(()) => return Ok(src_len),
56        Err(e) if is_reflink_unsupported(&e) => {
57            // fall through to sparse copy
58        }
59        Err(e) => return Err(e),
60    }
61
62    sparse_copy(src, dst)
63}
64
65/// Sparse-aware copy via `SEEK_DATA`/`SEEK_HOLE` and per-extent copy.
66///
67/// Public for callers that want to skip the reflink attempt — e.g.
68/// when they already know the destination filesystem doesn't support
69/// reflinks, or for tests that want to exercise the fallback path.
70pub fn sparse_copy(src: &Path, dst: &Path) -> io::Result<u64> {
71    let src_file = File::open(src)?;
72    let len = src_file.metadata()?.len();
73
74    let dst_file = OpenOptions::new()
75        .read(true)
76        .write(true)
77        .create(true)
78        .truncate(true)
79        .open(dst)?;
80    // Establish destination as a fully-sparse hole of `len` bytes;
81    // only data extents will materialize into allocated blocks below.
82    dst_file.set_len(len)?;
83
84    let src_fd = src_file.as_raw_fd();
85    let dst_fd = dst_file.as_raw_fd();
86
87    let mut off: i64 = 0;
88    while (off as u64) < len {
89        // Find next data extent.
90        let data_start = unsafe { libc::lseek(src_fd, off, libc::SEEK_DATA) };
91        if data_start < 0 {
92            let err = io::Error::last_os_error();
93            // ENXIO: no more data past this offset → done.
94            if err.raw_os_error() == Some(libc::ENXIO) {
95                break;
96            }
97            return Err(err);
98        }
99        // Find the end of that extent (start of next hole, or EOF).
100        let data_end = unsafe { libc::lseek(src_fd, data_start, libc::SEEK_HOLE) };
101        if data_end < 0 {
102            return Err(io::Error::last_os_error());
103        }
104        let data_end = (data_end as u64).min(len);
105        let data_start = data_start as u64;
106        if data_end <= data_start {
107            break;
108        }
109
110        copy_extent(src_fd, dst_fd, data_start, data_end - data_start)?;
111        off = data_end as i64;
112    }
113
114    dst_file.sync_all()?;
115    Ok(len)
116}
117
118//--------------------------------------------------------------------------------------------------
119// Functions: Helpers
120//--------------------------------------------------------------------------------------------------
121
122/// Reflink can fail with several different errnos depending on the
123/// filesystem and platform. Treat them all as "fall through to Tier 2"
124/// rather than propagating to the caller.
125///
126/// On Linux `ENOTSUP == EOPNOTSUPP`, so a single arm covers both;
127/// macOS / BSDs assign them distinct values and need both arms.
128fn is_reflink_unsupported(e: &io::Error) -> bool {
129    let Some(code) = e.raw_os_error() else {
130        return false;
131    };
132
133    #[cfg(target_os = "linux")]
134    let aliases: &[i32] = &[libc::ENOTSUP, libc::EXDEV, libc::EINVAL];
135    #[cfg(not(target_os = "linux"))]
136    let aliases: &[i32] = &[libc::ENOTSUP, libc::EOPNOTSUPP, libc::EXDEV, libc::EINVAL];
137
138    aliases.contains(&code)
139}
140
141#[cfg(target_os = "linux")]
142fn copy_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
143    let mut src_off = off as i64;
144    let mut dst_off = off as i64;
145    let mut remaining = len;
146
147    while remaining > 0 {
148        let chunk = remaining.min(usize::MAX as u64 / 2) as usize;
149        let n =
150            unsafe { libc::copy_file_range(src_fd, &mut src_off, dst_fd, &mut dst_off, chunk, 0) };
151        if n < 0 {
152            let err = io::Error::last_os_error();
153            // copy_file_range may not be supported on every kernel/FS
154            // combination (notably across-FS prior to 5.3, or older
155            // kernels). Fall back to pread/pwrite for the remainder of
156            // this extent.
157            if matches!(
158                err.raw_os_error(),
159                Some(libc::ENOSYS)
160                    | Some(libc::EXDEV)
161                    | Some(libc::EINVAL)
162                    | Some(libc::EOPNOTSUPP)
163            ) {
164                let consumed = len - remaining;
165                return read_write_extent(src_fd, dst_fd, off + consumed, remaining);
166            }
167            return Err(err);
168        }
169        if n == 0 {
170            // EOF — should not happen for a valid extent, but guard.
171            break;
172        }
173        remaining -= n as u64;
174    }
175    Ok(())
176}
177
/// Copy `len` bytes starting at `off` from `src_fd` to the same
/// offset in `dst_fd`.
///
/// Non-Linux build of `copy_extent`: delegates straight to
/// `read_write_extent` (`pread`/`pwrite`).
#[cfg(not(target_os = "linux"))]
fn copy_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
    read_write_extent(src_fd, dst_fd, off, len)
}
182
183/// Copy `len` bytes from `src_fd` at `off` to `dst_fd` at `off` using
184/// `pread`/`pwrite`. Universal fallback for `copy_extent` on platforms
185/// or filesystems where `copy_file_range` doesn't apply.
186fn read_write_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
187    const BUF_SIZE: usize = 64 * 1024;
188    let mut buf = [0u8; BUF_SIZE];
189    let mut copied: u64 = 0;
190
191    while copied < len {
192        let to_read = (len - copied).min(BUF_SIZE as u64) as usize;
193        let read_off = (off + copied) as i64;
194        let n = unsafe {
195            libc::pread(
196                src_fd,
197                buf.as_mut_ptr() as *mut libc::c_void,
198                to_read,
199                read_off,
200            )
201        };
202        if n < 0 {
203            return Err(io::Error::last_os_error());
204        }
205        if n == 0 {
206            return Err(io::Error::new(
207                io::ErrorKind::UnexpectedEof,
208                "unexpected EOF mid-extent",
209            ));
210        }
211        let n = n as usize;
212
213        let mut written: usize = 0;
214        while written < n {
215            let w_off = (off + copied + written as u64) as i64;
216            let w = unsafe {
217                libc::pwrite(
218                    dst_fd,
219                    buf[written..n].as_ptr() as *const libc::c_void,
220                    n - written,
221                    w_off,
222                )
223            };
224            if w < 0 {
225                return Err(io::Error::last_os_error());
226            }
227            if w == 0 {
228                return Err(io::Error::new(
229                    io::ErrorKind::WriteZero,
230                    "pwrite returned 0",
231                ));
232            }
233            written += w as usize;
234        }
235        copied += n as u64;
236    }
237    Ok(())
238}
239
240//--------------------------------------------------------------------------------------------------
241// Tests
242//--------------------------------------------------------------------------------------------------
243
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::MetadataExt;

    /// Size of each data extent written by `make_sparse`.
    const EXTENT_LEN: usize = 64 * 1024;

    /// Build a sparse source file: total apparent size `len`, with
    /// 64 KiB of `0xAB` bytes written at each of the given offsets.
    fn make_sparse(path: &Path, len: u64, data_offsets: &[u64]) -> io::Result<()> {
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(path)?;
        f.set_len(len)?;

        // One buffer and one fd lookup serve every extent.
        let buf = vec![0xAB_u8; EXTENT_LEN];
        let fd = f.as_raw_fd();
        for &off in data_offsets {
            // SAFETY: `buf` is a live allocation of `buf.len()` bytes
            // and `fd` is owned by `f`, which is still open.
            let n = unsafe { libc::pwrite(fd, buf.as_ptr() as *const _, buf.len(), off as i64) };
            assert!(n >= 0, "pwrite failed: {}", io::Error::last_os_error());
            // A short write would leave the fixture with less data
            // than the tests assume, so require the full extent.
            assert_eq!(n as usize, buf.len(), "short pwrite at offset {off}");
        }
        f.sync_all()?;
        Ok(())
    }

    /// Read exactly one `EXTENT_LEN`-sized chunk from `file` at `off`.
    fn read_extent_at(file: &File, off: u64) -> Vec<u8> {
        let mut buf = vec![0u8; EXTENT_LEN];
        // SAFETY: `buf` is a live allocation of `buf.len()` bytes and
        // the descriptor is owned by `file`, which the caller keeps
        // open.
        let n = unsafe {
            libc::pread(
                file.as_raw_fd(),
                buf.as_mut_ptr() as *mut _,
                buf.len(),
                off as i64,
            )
        };
        assert!(n >= 0, "pread failed: {}", io::Error::last_os_error());
        assert_eq!(n as usize, buf.len());
        buf
    }

    #[test]
    fn round_trip_small() {
        let dir = tempfile::tempdir().unwrap();
        let src = dir.path().join("src.bin");
        let dst = dir.path().join("dst.bin");

        std::fs::write(&src, b"hello world").unwrap();
        let n = fast_copy(&src, &dst).unwrap();
        assert_eq!(n, 11);
        assert_eq!(std::fs::read(&dst).unwrap(), b"hello world");
    }

    #[test]
    fn sparse_copy_preserves_holes_and_data() {
        // 16 MiB sparse file with 4 data extents at known offsets.
        // Use sparse_copy directly to exercise Tier 2 regardless of
        // the test-host filesystem.
        let dir = tempfile::tempdir().unwrap();
        let src = dir.path().join("src.bin");
        let dst = dir.path().join("dst.bin");

        let len: u64 = 16 * 1024 * 1024;
        let offsets = [0u64, 4 * 1024 * 1024, 8 * 1024 * 1024, 12 * 1024 * 1024];
        make_sparse(&src, len, &offsets).unwrap();

        let n = sparse_copy(&src, &dst).unwrap();
        assert_eq!(n, len);

        // Apparent size matches.
        let dst_meta = std::fs::metadata(&dst).unwrap();
        assert_eq!(dst_meta.len(), len);

        // Each data extent's bytes round-trip.
        let dst_file = File::open(&dst).unwrap();
        for &off in &offsets {
            let bytes = read_extent_at(&dst_file, off);
            assert!(bytes.iter().all(|&b| b == 0xAB));
        }

        // A region between two extents must read back as zeros,
        // whether or not the filesystem stores it as a hole.
        let gap = read_extent_at(&dst_file, 2 * 1024 * 1024);
        assert!(gap.iter().all(|&b| b == 0));

        // Sparseness preservation: only meaningful if the source
        // itself is sparse on this filesystem. Some test hosts (FAT,
        // certain APFS configurations under tempfile mounts) don't
        // produce a sparse source from `ftruncate + pwrite` — in that
        // case sparseness is unachievable and we just confirm the
        // destination didn't blow up beyond the source's footprint.
        let src_bytes_on_disk = std::fs::metadata(&src).unwrap().blocks() * 512;
        let dst_bytes_on_disk = dst_meta.blocks() * 512;
        if src_bytes_on_disk < len / 2 {
            // Source IS sparse. Destination must also be sparse —
            // this is the load-bearing regression test for the whole
            // module.
            assert!(
                dst_bytes_on_disk < len / 2,
                "source is sparse ({src_bytes_on_disk} bytes on disk) but destination densified to {dst_bytes_on_disk} bytes for an apparent size of {len}",
            );
            assert!(
                dst_bytes_on_disk <= src_bytes_on_disk * 4 + 1024 * 1024,
                "destination allocated significantly more than source: src={src_bytes_on_disk} dst={dst_bytes_on_disk}",
            );
        } else {
            eprintln!(
                "filesystem did not sparsify the source (src_bytes_on_disk={src_bytes_on_disk}, apparent={len}); sparseness preservation not exercised in this run",
            );
            // Without source sparseness we can't exceed source's
            // footprint by much — guard against gross regressions.
            assert!(
                dst_bytes_on_disk <= src_bytes_on_disk + 1024 * 1024,
                "destination grew beyond source footprint: src={src_bytes_on_disk} dst={dst_bytes_on_disk}",
            );
        }
    }

    #[test]
    fn fast_copy_matches_source_size() {
        let dir = tempfile::tempdir().unwrap();
        let src = dir.path().join("src.bin");
        let dst = dir.path().join("dst.bin");

        let len: u64 = 4 * 1024 * 1024;
        make_sparse(&src, len, &[0, 2 * 1024 * 1024]).unwrap();

        let n = fast_copy(&src, &dst).unwrap();
        assert_eq!(n, len);
        assert_eq!(std::fs::metadata(&dst).unwrap().len(), len);
    }

    #[test]
    fn missing_source_errors() {
        let dir = tempfile::tempdir().unwrap();
        let err = fast_copy(&dir.path().join("nope.bin"), &dir.path().join("dst.bin")).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
    }
}