microsandbox-utils 0.6.0

Shared constants and utilities for the microsandbox project.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
//! Sparse-aware fast copy with reflink fallback.
//!
//! Two-tier strategy that preserves sparseness on every supported
//! platform:
//!
//! 1. **Reflink** (zero-copy COW). Tries `clonefile(2)` on macOS and
//!    `ioctl(FICLONE)` on Linux via `reflink-copy`. Succeeds instantly
//!    on APFS, btrfs, XFS (with `reflink=1`), and bcachefs. Returns
//!    `EOPNOTSUPP` (or similar) on ext4 and other non-COW filesystems.
//!
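//! 2. **Sparse-aware copy**. On Unix, a `SEEK_DATA` / `SEEK_HOLE` walk
//!    of the source's allocation map, with `copy_file_range(2)` on
//!    Linux for in-kernel zero-copy of data extents. On Windows the
//!    source is read in chunks and zero-filled regions are skipped
//!    rather than written to a destination flagged as sparse. In both
//!    cases the destination is sized to the source length up front so
//!    unwritten regions stay holes.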
//!
//! Never falls back to a naive byte-for-byte copy — that would
//! densify a 4 GiB sparse file with a few MB of data into 4 GiB on
//! disk, which is the exact failure mode this module exists to
//! prevent.
//!
//! See `planning/microsandbox/implementation/snapshots.md` for the
//! full design and tradeoffs.

use std::fs::{File, OpenOptions};
use std::io;
#[cfg(windows)]
use std::io::{Read, Seek, SeekFrom, Write};
#[cfg(unix)]
use std::os::unix::io::{AsRawFd, RawFd};
#[cfg(windows)]
use std::os::windows::io::AsRawHandle;
use std::path::Path;
#[cfg(windows)]
use std::ptr;

#[cfg(windows)]
use windows_sys::Win32::Foundation::HANDLE;
#[cfg(windows)]
use windows_sys::Win32::System::IO::DeviceIoControl;
#[cfg(windows)]
use windows_sys::Win32::System::Ioctl::FSCTL_SET_SPARSE;

//--------------------------------------------------------------------------------------------------
// Functions
//--------------------------------------------------------------------------------------------------

/// Copy `src` to `dst` without densifying sparse regions, returning
/// the apparent (logical) size of the copy in bytes.
///
/// The source is stat'ed first. A reflink (zero-copy COW clone) is
/// then attempted; when it succeeds, the length observed by that
/// initial stat is returned. When it fails with an error that
/// `is_reflink_unsupported` classifies as "reflink not available
/// here", the work is handed to [`sparse_copy`], which copies only the
/// source's data extents into a pre-sized sparse destination.
///
/// # Errors
///
/// Returns an error if the source cannot be stat'ed, if the reflink
/// attempt fails for any reason other than lack of support (that error
/// is passed through unchanged), or if the sparse-copy fallback fails.
///
/// **Blocking.** Callers in async contexts should wrap in
/// `tokio::task::spawn_blocking`.
pub fn fast_copy(src: &Path, dst: &Path) -> io::Result<u64> {
    // Stat before touching the reflink backend so that a missing
    // source is reported by `metadata` (`NotFound` on every platform).
    // Otherwise reflink-copy on Linux reports `InvalidInput` without
    // an errno, which `is_reflink_unsupported` cannot classify.
    let apparent_size = std::fs::metadata(src)?.len();

    // Tier 1 is plain `reflink`, deliberately not `reflink_or_copy`:
    // the latter falls back to `std::fs::copy`, which densifies.
    // Tier 2 (`sparse_copy`) runs only for "unsupported" failures.
    match reflink_copy::reflink(src, dst) {
        Ok(()) => Ok(apparent_size),
        Err(err) if is_reflink_unsupported(&err) => sparse_copy(src, dst),
        Err(err) => Err(err),
    }
}

/// Copy `src` to `dst` while skipping the source's holes, without
/// attempting a reflink first. Returns the apparent size of the copy
/// in bytes.
///
/// Delegates to the platform-specific `sparse_copy_impl`. On Unix that
/// walks the source with `SEEK_DATA`/`SEEK_HOLE` and copies each data
/// extent; on Windows it flags the destination as sparse and skips
/// zero-filled regions of the source instead of writing them. Both
/// variants create `dst` if it is missing and truncate it if it
/// already exists.
///
/// Public for callers that want to skip the reflink attempt — e.g.
/// when they already know the destination filesystem doesn't support
/// reflinks, or for tests that want to exercise the fallback path.
pub fn sparse_copy(src: &Path, dst: &Path) -> io::Result<u64> {
    sparse_copy_impl(src, dst)
}

/// Unix backend for `sparse_copy`.
///
/// Opens `src`, creates/truncates `dst`, sizes `dst` to the source
/// length, then alternates `lseek(SEEK_DATA)` / `lseek(SEEK_HOLE)` on
/// the source to enumerate its data extents, handing each one to
/// `copy_extent`. The destination is `sync_all`ed before returning the
/// source length observed when the source was opened.
///
/// # Errors
///
/// Propagates failures from opening either file, from `lseek` (other
/// than `ENXIO` on `SEEK_DATA`, which means "no more data"), from
/// `copy_extent`, and from the final sync.
#[cfg(unix)]
fn sparse_copy_impl(src: &Path, dst: &Path) -> io::Result<u64> {
    let source = File::open(src)?;
    let total = source.metadata()?.len();

    let target = OpenOptions::new()
        .read(true)
        .write(true)
        .create(true)
        .truncate(true)
        .open(dst)?;
    // Size the destination first: it starts out as one `total`-byte
    // hole and only the extents copied below become allocated blocks.
    target.set_len(total)?;

    let (in_fd, out_fd) = (source.as_raw_fd(), target.as_raw_fd());

    let mut cursor: i64 = 0;
    while (cursor as u64) < total {
        // Locate the start of the next data extent at or after `cursor`.
        // SAFETY: `in_fd` belongs to `source`, which stays open for the
        // whole loop; `lseek` takes no pointers.
        let extent_start = unsafe { libc::lseek(in_fd, cursor, libc::SEEK_DATA) };
        if extent_start < 0 {
            let err = io::Error::last_os_error();
            match err.raw_os_error() {
                // ENXIO: nothing but hole from here to EOF, so we are done.
                Some(libc::ENXIO) => break,
                _ => return Err(err),
            }
        }

        // Locate where that extent stops (next hole, or EOF).
        // SAFETY: same descriptor as above, still open.
        let extent_end = unsafe { libc::lseek(in_fd, extent_start, libc::SEEK_HOLE) };
        if extent_end < 0 {
            return Err(io::Error::last_os_error());
        }

        let start = extent_start as u64;
        // Never copy past the length the destination was sized to.
        let end = (extent_end as u64).min(total);
        if end <= start {
            break;
        }

        copy_extent(in_fd, out_fd, start, end - start)?;
        cursor = end as i64;
    }

    target.sync_all()?;
    Ok(total)
}

/// Windows backend for `sparse_copy`.
///
/// Opens `src`, creates/truncates `dst`, flags `dst` as sparse, sizes
/// it to the source length, then streams the source through a 1 MiB
/// buffer and lets `write_nonzero_runs` write the data while skipping
/// zero-filled regions. The destination is `sync_all`ed before
/// returning the source length observed when the source was opened.
///
/// # Errors
///
/// Propagates failures from opening either file, from setting the
/// sparse flag, from resizing, reading or writing, and from the final
/// sync.
#[cfg(windows)]
fn sparse_copy_impl(src: &Path, dst: &Path) -> io::Result<u64> {
    const BUF_SIZE: usize = 1024 * 1024;

    let mut src_file = File::open(src)?;
    let len = src_file.metadata()?.len();

    let mut dst_file = OpenOptions::new()
        .read(true)
        .write(true)
        .create(true)
        .truncate(true)
        .open(dst)?;
    // Order matters: flag the (still empty) file as sparse BEFORE
    // extending it. Extending a file that is not yet sparse can
    // allocate real clusters for the whole range, and setting the
    // sparse flag afterwards does not release allocations that already
    // exist — the destination would end up dense.
    mark_sparse(&dst_file)?;
    dst_file.set_len(len)?;

    let mut offset = 0u64;
    let mut buf = vec![0u8; BUF_SIZE];
    loop {
        let n = src_file.read(&mut buf)?;
        if n == 0 {
            // End of source.
            break;
        }

        // `offset` is the position of `buf[0]` in both files.
        write_nonzero_runs(&mut dst_file, offset, &buf[..n])?;
        offset += n as u64;
    }

    dst_file.sync_all()?;
    Ok(len)
}

//--------------------------------------------------------------------------------------------------
// Functions: Helpers
//--------------------------------------------------------------------------------------------------

/// Decide whether a reflink failure means "reflink is not available
/// for this source/destination pair" — in which case the caller should
/// fall through to Tier 2 — rather than a genuine error.
///
/// Returns `true` when the error kind is `Unsupported`, or when the
/// raw OS error code is one of the per-platform codes listed below.
/// Errors carrying no OS code (and a kind other than `Unsupported`)
/// are never treated as "unsupported".
///
/// On Linux `ENOTSUP == EOPNOTSUPP`, so one list entry covers both;
/// macOS / BSDs assign them distinct values and list both.
fn is_reflink_unsupported(e: &io::Error) -> bool {
    if e.kind() == io::ErrorKind::Unsupported {
        return true;
    }

    let code = match e.raw_os_error() {
        Some(code) => code,
        None => return false,
    };

    #[cfg(target_os = "linux")]
    const FALLBACK_CODES: &[i32] = &[libc::ENOTSUP, libc::EXDEV, libc::EINVAL];
    #[cfg(all(unix, not(target_os = "linux")))]
    const FALLBACK_CODES: &[i32] =
        &[libc::ENOTSUP, libc::EOPNOTSUPP, libc::EXDEV, libc::EINVAL];
    #[cfg(windows)]
    const FALLBACK_CODES: &[i32] = &[
        1,  // ERROR_INVALID_FUNCTION
        17, // ERROR_NOT_SAME_DEVICE
        50, // ERROR_NOT_SUPPORTED
        87, // ERROR_INVALID_PARAMETER
    ];

    #[cfg(windows)]
    {
        // Check the code as reported and also its low 16 bits, so a
        // Win32 code embedded in a wider value is still recognized.
        let low_word = (code as u32 & 0xffff) as i32;
        return FALLBACK_CODES.contains(&code) || FALLBACK_CODES.contains(&low_word);
    }

    #[cfg(unix)]
    FALLBACK_CODES.contains(&code)
}

/// Copy the `len`-byte data extent starting at `off` from `src_fd` to
/// the same offset in `dst_fd` (Linux variant).
///
/// Uses `copy_file_range(2)` in a loop until the whole extent has been
/// transferred. Whatever is left of the extent is handed to
/// `read_write_extent` (`pread`/`pwrite`) when `copy_file_range`
/// either
///
/// * fails with `ENOSYS`, `EXDEV`, `EINVAL` or `EOPNOTSUPP`, or
/// * returns 0 before the extent is complete.
///
/// # Errors
///
/// Returns any other `copy_file_range` failure, and whatever
/// `read_write_extent` reports for the fallback — including
/// `UnexpectedEof` when the source really does end mid-extent, so a
/// short copy is never reported as success.
#[cfg(target_os = "linux")]
fn copy_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
    let mut src_off = off as i64;
    let mut dst_off = off as i64;
    let mut remaining = len;

    while remaining > 0 {
        let chunk = remaining.min(usize::MAX as u64 / 2) as usize;
        // SAFETY: both offset pointers refer to live locals of the
        // type the call expects; the descriptors are supplied by the
        // caller, which keeps the underlying files open.
        let n =
            unsafe { libc::copy_file_range(src_fd, &mut src_off, dst_fd, &mut dst_off, chunk, 0) };
        if n < 0 {
            let err = io::Error::last_os_error();
            // Interrupted before any byte moved: just try again.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            // copy_file_range may not be supported on every kernel/FS
            // combination (notably across-FS prior to 5.3, or older
            // kernels). Fall back to pread/pwrite for the remainder of
            // this extent.
            if matches!(
                err.raw_os_error(),
                Some(libc::ENOSYS)
                    | Some(libc::EXDEV)
                    | Some(libc::EINVAL)
                    | Some(libc::EOPNOTSUPP)
            ) {
                let consumed = len - remaining;
                return read_write_extent(src_fd, dst_fd, off + consumed, remaining);
            }
            return Err(err);
        }
        if n == 0 {
            // No progress although bytes are still owed. Returning
            // `Ok` here would silently drop the tail of the extent, so
            // let the pread/pwrite path finish the job: it either
            // copies the rest or reports `UnexpectedEof`.
            let consumed = len - remaining;
            return read_write_extent(src_fd, dst_fd, off + consumed, remaining);
        }
        remaining -= n as u64;
    }
    Ok(())
}

/// Copy the `len`-byte data extent starting at `off` from `src_fd` to
/// the same offset in `dst_fd`.
///
/// Variant for Unix targets other than Linux: forwards directly to
/// `read_write_extent`, i.e. a plain `pread`/`pwrite` loop.
#[cfg(all(unix, not(target_os = "linux")))]
fn copy_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
    read_write_extent(src_fd, dst_fd, off, len)
}

/// Copy `len` bytes from `src_fd` at `off` to `dst_fd` at `off` using
/// `pread`/`pwrite`. Universal fallback for `copy_extent` on platforms
/// or filesystems where `copy_file_range` doesn't apply.
///
/// Data moves through a 64 KiB stack buffer. Neither descriptor's file
/// position is used: every call passes an explicit offset. Short reads
/// and short writes are both handled by looping, and calls interrupted
/// by a signal (`EINTR`) are retried instead of failing the copy.
///
/// # Errors
///
/// * `UnexpectedEof` if the source ends before `len` bytes were read.
/// * `WriteZero` if `pwrite` reports that it wrote nothing.
/// * Any other OS error from `pread` / `pwrite`.
#[cfg(unix)]
fn read_write_extent(src_fd: RawFd, dst_fd: RawFd, off: u64, len: u64) -> io::Result<()> {
    const BUF_SIZE: usize = 64 * 1024;
    let mut buf = [0u8; BUF_SIZE];
    let mut copied: u64 = 0;

    while copied < len {
        let to_read = (len - copied).min(BUF_SIZE as u64) as usize;
        let read_off = (off + copied) as i64;
        // SAFETY: `buf` is a live, writable buffer of `BUF_SIZE` bytes
        // and `to_read <= BUF_SIZE`.
        let n = unsafe {
            libc::pread(
                src_fd,
                buf.as_mut_ptr() as *mut libc::c_void,
                to_read,
                read_off,
            )
        };
        if n < 0 {
            let err = io::Error::last_os_error();
            // Nothing was transferred; `copied` is unchanged, so the
            // next iteration repeats exactly the same read.
            if err.kind() == io::ErrorKind::Interrupted {
                continue;
            }
            return Err(err);
        }
        if n == 0 {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "unexpected EOF mid-extent",
            ));
        }
        let n = n as usize;

        // Flush everything just read, tolerating short writes.
        let mut written: usize = 0;
        while written < n {
            let w_off = (off + copied + written as u64) as i64;
            // SAFETY: `buf[written..n]` is an initialized sub-slice of
            // `buf` whose length is exactly `n - written`.
            let w = unsafe {
                libc::pwrite(
                    dst_fd,
                    buf[written..n].as_ptr() as *const libc::c_void,
                    n - written,
                    w_off,
                )
            };
            if w < 0 {
                let err = io::Error::last_os_error();
                // Same reasoning as for the read: retry unchanged.
                if err.kind() == io::ErrorKind::Interrupted {
                    continue;
                }
                return Err(err);
            }
            if w == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::WriteZero,
                    "pwrite returned 0",
                ));
            }
            written += w as usize;
        }
        copied += n as u64;
    }
    Ok(())
}

/// Flag `file` as sparse by issuing the `FSCTL_SET_SPARSE` control
/// code on its handle, with no input and no output buffer.
///
/// # Errors
///
/// Returns the last OS error when `DeviceIoControl` reports failure
/// (a zero return value).
#[cfg(windows)]
fn mark_sparse(file: &File) -> io::Result<()> {
    let handle = file.as_raw_handle() as HANDLE;
    // Receives the output byte count; the value is not used.
    let mut returned_len = 0;

    // SAFETY: `handle` comes from `file`, which outlives the call; the
    // null buffers are paired with zero lengths; `returned_len` is a
    // live local; no OVERLAPPED structure is supplied.
    let status = unsafe {
        DeviceIoControl(
            handle,
            FSCTL_SET_SPARSE,
            ptr::null(),
            0,
            ptr::null_mut(),
            0,
            &mut returned_len,
            ptr::null_mut(),
        )
    };

    if status != 0 {
        Ok(())
    } else {
        Err(io::Error::last_os_error())
    }
}

/// Write the data contained in `bytes` to `dst`, where `bytes[0]`
/// belongs at file offset `base_offset`, skipping zero-filled regions
/// so they can remain holes.
///
/// Zeros at the start and end of `bytes`, and interior zero runs of at
/// least `MIN_HOLE` bytes, are skipped. Shorter interior zero runs are
/// written together with the data around them: splitting at every
/// single zero byte would cost one seek plus one write per tiny run on
/// ordinary binary data.
///
/// The destination range is expected to read as zeros beforehand (the
/// caller creates it by truncating and then extending the file), so
/// the resulting file content is the same either way.
///
/// # Errors
///
/// Propagates any seek or write failure on `dst`.
#[cfg(windows)]
fn write_nonzero_runs(dst: &mut File, base_offset: u64, bytes: &[u8]) -> io::Result<()> {
    // Smallest interior zero run worth skipping. NOTE(review): 4 KiB
    // is chosen as the common default cluster size — a shorter run
    // cannot span a whole cluster of that size; confirm for volumes
    // formatted with smaller clusters.
    const MIN_HOLE: usize = 4 * 1024;

    let mut cursor = 0;
    while cursor < bytes.len() {
        // Skip the zeros in front of the next data run; if only zeros
        // remain there is nothing left to write.
        let Some(lead) = bytes[cursor..].iter().position(|&b| b != 0) else {
            break;
        };
        let start = cursor + lead;

        // Grow the run. It ends at the first zero gap that is either
        // long enough to be a hole or reaches the end of `bytes`.
        let mut end = start;
        while end < bytes.len() {
            if bytes[end] != 0 {
                end += 1;
                continue;
            }
            let gap = bytes[end..].iter().take_while(|&&b| b == 0).count();
            if gap >= MIN_HOLE || end + gap == bytes.len() {
                break;
            }
            // Short gap followed by more data: keep it inside the run.
            end += gap;
        }

        dst.seek(SeekFrom::Start(base_offset + start as u64))?;
        dst.write_all(&bytes[start..end])?;
        cursor = end;
    }

    Ok(())
}

//--------------------------------------------------------------------------------------------------
// Tests
//--------------------------------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Read, Seek, SeekFrom, Write};
    #[cfg(unix)]
    use std::os::unix::fs::MetadataExt;

    /// One mebibyte, for spelling out file sizes and offsets.
    const MIB: u64 = 1024 * 1024;
    /// Size of every data extent written by `make_sparse`.
    const EXTENT_LEN: usize = 64 * 1024;
    /// Byte value every data extent is filled with.
    const FILL: u8 = 0xAB;

    /// Create a temp directory and the `src.bin` / `dst.bin` paths
    /// inside it. The directory guard is returned so the caller keeps
    /// it alive for the duration of the test.
    fn scratch_paths() -> (tempfile::TempDir, std::path::PathBuf, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let src = dir.path().join("src.bin");
        let dst = dir.path().join("dst.bin");
        (dir, src, dst)
    }

    /// Create (or truncate) `path` with apparent size `len`, then write
    /// one `EXTENT_LEN`-byte block of `FILL` at each of `data_offsets`.
    /// Everything else is left unwritten.
    fn make_sparse(path: &Path, len: u64, data_offsets: &[u64]) -> io::Result<()> {
        let mut file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(path)?;
        file.set_len(len)?;

        let payload = vec![FILL; EXTENT_LEN];
        for offset in data_offsets.iter().copied() {
            file.seek(SeekFrom::Start(offset))?;
            file.write_all(&payload)?;
        }

        file.sync_all()
    }

    /// A tiny dense file survives `fast_copy` byte for byte.
    #[test]
    fn round_trip_small() {
        let (_dir, src, dst) = scratch_paths();

        std::fs::write(&src, b"hello world").unwrap();
        let copied = fast_copy(&src, &dst).unwrap();

        assert_eq!(copied, 11);
        assert_eq!(std::fs::read(&dst).unwrap(), b"hello world");
    }

    /// 16 MiB source with four data extents, copied through
    /// `sparse_copy` directly so Tier 2 runs whatever filesystem the
    /// test host uses.
    #[test]
    fn sparse_copy_preserves_holes_and_data() {
        let (_dir, src, dst) = scratch_paths();

        let len: u64 = 16 * MIB;
        let offsets = [0u64, 4 * MIB, 8 * MIB, 12 * MIB];
        make_sparse(&src, len, &offsets).unwrap();

        assert_eq!(sparse_copy(&src, &dst).unwrap(), len);

        // The destination reports the same apparent size.
        let dst_meta = std::fs::metadata(&dst).unwrap();
        assert_eq!(dst_meta.len(), len);

        // Every data extent reads back with the fill pattern.
        let mut dst_file = File::open(&dst).unwrap();
        let mut extent = [0u8; EXTENT_LEN];
        for off in offsets.iter().copied() {
            dst_file.seek(SeekFrom::Start(off)).unwrap();
            dst_file.read_exact(&mut extent).unwrap();
            assert!(extent.iter().all(|&b| b == FILL));
        }

        // On-disk footprint. Whether the source itself ended up sparse
        // depends on the host filesystem (some — FAT, certain APFS
        // setups under tempfile mounts — don't produce holes from
        // `ftruncate + pwrite`), so the check adapts to what the
        // source actually looks like.
        #[cfg(unix)]
        {
            let src_bytes_on_disk = std::fs::metadata(&src).unwrap().blocks() * 512;
            let dst_bytes_on_disk = dst_meta.blocks() * 512;

            if src_bytes_on_disk >= len / 2 {
                // The source is not sparse here, so hole preservation
                // can't be observed; only guard against the copy
                // growing well past the source's footprint.
                eprintln!(
                    "filesystem did not sparsify the source (src_bytes_on_disk={src_bytes_on_disk}, apparent={len}); sparseness preservation not exercised in this run",
                );
                assert!(
                    dst_bytes_on_disk <= src_bytes_on_disk + 1024 * 1024,
                    "destination grew beyond source footprint: src={src_bytes_on_disk} dst={dst_bytes_on_disk}",
                );
                return;
            }

            // The source IS sparse, so the destination must be too.
            // This is the load-bearing regression check for the module.
            assert!(
                dst_bytes_on_disk < len / 2,
                "source is sparse ({src_bytes_on_disk} bytes on disk) but destination densified to {dst_bytes_on_disk} bytes for an apparent size of {len}",
            );
            assert!(
                dst_bytes_on_disk <= src_bytes_on_disk * 4 + 1024 * 1024,
                "destination allocated significantly more than source: src={src_bytes_on_disk} dst={dst_bytes_on_disk}",
            );
        }
    }

    /// `fast_copy` reports, and produces, the source's apparent size.
    #[test]
    fn fast_copy_matches_source_size() {
        let (_dir, src, dst) = scratch_paths();

        let len: u64 = 4 * MIB;
        make_sparse(&src, len, &[0, 2 * MIB]).unwrap();

        let reported = fast_copy(&src, &dst).unwrap();
        assert_eq!(reported, len);
        assert_eq!(std::fs::metadata(&dst).unwrap().len(), len);
    }

    /// A non-existent source surfaces as `NotFound`.
    #[test]
    fn missing_source_errors() {
        let dir = tempfile::tempdir().unwrap();
        let missing = dir.path().join("nope.bin");
        let target = dir.path().join("dst.bin");

        let err = fast_copy(&missing, &target).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
    }
}