mmap-snapshot 0.3.0

Safe mmap with snapshot isolation and atomic commits
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
#![warn(missing_docs)]
#![warn(clippy::undocumented_unsafe_blocks)]

/*! **Safe** `mmap()` with **snapshot isolation** and **atomic commits**.

([Linux-only](#platform-support), [works best](#performance) on XFS/btrfs.)

## Example

Mmap a file as `&[u8]`:

```rust
# use mmap_snapshot::Mmap;
# fn foo() -> std::io::Result<()> {
# let path = std::path::Path::new("/tmp/doctest_1");
# std::fs::write(&path, b"Hello world!")?;
let mmap = Mmap::open(&path)?;
assert_eq!(mmap.len(), 12);
assert_eq!(&mmap[..], b"Hello world!");
# std::fs::remove_file(path)?;
# Ok(())
# }
```

Mmap a file as `&mut [u8]`, committing the changes back to disk:

```rust
# use mmap_snapshot::MmapMut;
# fn foo() -> std::io::Result<()> {
# let path = std::path::Path::new("/tmp/doctest_2");
# std::fs::write(&path, b"Hello world!")?;
let mut mmap = MmapMut::open(&path)?;
mmap[6..11].copy_from_slice(b"sekai");
mmap.commit()?;
assert_eq!(std::fs::read_to_string(&path)?, "Hello sekai!");
# std::fs::remove_file(path)?;
# Ok(())
# }
```

## Safety

The unsafe thing about mmapping a file is that it gives you _volatile memory_:
when someone modifies the file, the memory changes.  This is not the way a
respectable `&[u8]` (or even `&mut [u8]`) should behave.

So we use a trick: instead of mapping the file directly, we map a private
"snapshot" of the file which doesn't change, even when the file is externally
modified. The *only* way to modify the snapshot is via the mmap, which makes it
valid according to Rust's rules.

See the SAFETY comments in the code for a more thorough explanation.

## Performance

The cost of safety?
On my machine, `Mmap::open()` takes just 0.1 ms longer than `File::open()` -
that's it!
And it doesn't matter how big the file is.
A small price to pay.

But there's a catch:
if the file is on a filesystem which doesn’t support reflinks
then we have to copy the whole file.
Therefore, while the semantics are the same on all filesystems,
the performance characteristics vary wildly.

This table shows whether methods are constant-time or linear-time in
the size of the file:

Method | XFS | btrfs | ext4 | tmpfs
-------|-----|-------|------|-------
[`open()`][`Mmap::open`]                            | O(1) | O(1) | O(n) | O(n)
[`commit()`][`MmapMut::commit`]                     | O(1) | O(1) | O(n) | O(n)
[`commit_and_close()`][`MmapMut::commit_and_close`] | O(1) | O(1) | O(1) | O(1)

See the method docs for more details.

If the file is on a reflink-capable filesystem, the overhead is so tiny that
there's really no reason not to snapshot it.  However, although many distros
now default to reflink-capable filesystems for new installs[^debian], it will
obviously be common to encounter ext4 in the wild for many years to come.  So
be aware that a subset of your users may experience stalls when mmapping large
files.

[^debian]: The major exceptions are Debian and Ubuntu, which select ext4 by
    default in the installer.  This is, frankly, a bad decision. From its
    creation, ext4 was intended as a "stop-gap" to give people more time to
    migrate away from the ext* family of filesystems.  Encouraging its use on
    fresh installs is poor.

## Platform support

We make the snapshot by cloning the original file into a private (unlinked) file.
It's impossible for anyone else to modify this file, which is what makes it safe to mmap.
On Linux we use `O_TMPFILE` for this.
I don't know of a race-free way to create an unlinked file on MacOS/Windows;
if one exists, please open an issue to let me know!

*/

use rustix::{
    fs::{AtFlags, Mode, OFlags, copy_file_range, ftruncate, ioctl_ficlone, linkat, open, rename},
    io::Errno,
    mm::{MapFlags, MremapFlags, MsyncFlags, ProtFlags, mmap, mremap, msync, munmap},
};
use std::{
    ffi::c_void,
    fs::File,
    io,
    ops::{Deref, DerefMut},
    os::fd::AsFd,
    path::{Path, PathBuf},
};

/// Make `fd_out` hold the same contents as the first `len` bytes of `fd_in`.
///
/// A reflink (`FICLONE`) is attempted first.  If that is rejected with
/// `EOPNOTSUPP`, `fd_out` is sized to `len` bytes and the data is physically
/// copied with `copy_file_range()` instead.
///
/// Returns `Ok(false)` if the reflink succeeded and `Ok(true)` if we fell back
/// to the physical copy.  Any other error from the reflink attempt, and any
/// error during the fallback copy, is returned to the caller.  If the source
/// runs out of data before `len` bytes have been copied, the error kind is
/// `UnexpectedEof`.
fn ficlone(fd_out: impl AsFd, fd_in: impl AsFd, len: usize) -> io::Result<bool> {
    match ioctl_ficlone(&fd_out, &fd_in) {
        Ok(()) => return Ok(false),
        // Fall through to the physical copy below.
        Err(Errno::OPNOTSUPP) => {}
        Err(e) => return Err(e.into()),
    }

    let total = len as u64;
    ftruncate(&fd_out, total)?;

    // `copy_file_range()` advances both offsets by the number of bytes copied.
    let (mut off_in, mut off_out) = (0u64, 0u64);
    while off_in < total {
        let remaining = len - off_in as usize;
        let copied = copy_file_range(
            &fd_in,
            Some(&mut off_in),
            &fd_out,
            Some(&mut off_out),
            remaining,
        )?;
        assert_eq!(off_in, off_out);
        assert!(
            copied <= remaining,
            "copy_file_range() copied more bytes than requested"
        );
        if copied == 0 {
            // The source ended before we got `len` bytes out of it.
            return Err(io::ErrorKind::UnexpectedEof.into());
        }
    }
    assert_eq!(off_out, total);
    Ok(true)
}

/// A point-in-time snapshot of a file
///
/// Read the file contents using the `Deref` impl.  The data you see will
/// reflect the state of the file at the time `open()` was called; writes by
/// other processes are not reflected.  In other words, `Mmap` will show you a
/// consistent point-in-time snapshot of the file.
///
/// Data is not loaded eagerly into memory.  It will be read in from disk on demand.
/// For this we rely on the COW capabilities of the underlying filesystem.
pub struct Mmap {
    // Start of the read-only mapping of the private snapshot file.
    ptr: *mut c_void, // null iff len == 0
    // Length of the mapping in bytes; zero means nothing is mapped.
    len: usize,
}

// SAFETY: `ptr`+`len` are just "plain old memory" (that's the point of the
// trick with the private unlinked file.)  Nothing about the mapping is tied
// to the thread that created it, so it can be read from, and dropped on, any
// thread.
unsafe impl Send for Mmap {}
// SAFETY: `ptr`+`len` are just "plain old memory" (that's the point of the
// trick with the private unlinked file.)  `Mmap` only ever hands out shared
// `&[u8]` views of it, so it can be read concurrently from multiple threads.
unsafe impl Sync for Mmap {}

impl Mmap {
    /// Take a snapshot of the file and map it into memory.
    ///
    /// # Performance
    ///
    /// If the filesystem _doesn't_ support reflinks (eg. ext4) then this will
    /// physically duplicate the file on disk.  If the file is large then this
    /// will be slow and consume disk space.  The duplicate will be deleted when
    /// the `Mmap` is dropped.
    ///
    /// If the filesystem _does_ support reflinks (XFS, btrfs) then we simply
    /// mark the file as "copy on write" until the `Mmap` is dropped.  This is
    /// O(1) and fast: on my machine it takes just 0.1 ms longer than a plain
    /// old `File::open()`.  Disk usage will not increase unless the file is
    /// externally modified.
    pub fn open(path: impl AsRef<Path>) -> io::Result<Self> {
        let path = path.as_ref();
        let original = File::open(path)?;
        let len = original.metadata()?.len() as usize;
        if len >= isize::MAX as usize {
            return Err(io::ErrorKind::FileTooLarge.into());
        }
        let dir = path.parent().filter(|x| *x != "").unwrap_or(Path::new("."));
        // Create an unlinked clone of `original`
        let private: File =
            open(dir, OFlags::TMPFILE | OFlags::RDWR, Mode::RUSR | Mode::WUSR)?.into();
        ficlone(&private, &original, len)?;

        let ptr;
        if len == 0 {
            ptr = std::ptr::null_mut();
        } else {
            // SAFETY:
            // > If `ptr` is not null, it must be aligned...
            //
            // `ptr` is null.
            //
            // > If there exist any Rust references referring to the memory region
            //
            // We're letting the kernel pick an unused region so there shouldn't be any.
            //
            // > or if you subsequently create a Rust reference referring to the
            // > resulting region,
            //
            // We will be doing this.
            //
            // > it is your responsibility to ensure that the Rust reference invariants are
            // > preserved, including ensuring that the memory is not mutated in a way that
            // > a Rust reference would not expect.
            //
            // See the safety comment in the Deref impl.
            unsafe {
                ptr = mmap(
                    std::ptr::null_mut(),
                    len,
                    ProtFlags::READ,
                    MapFlags::SHARED,
                    &private,
                    0,
                )?;
            }
        };
        assert!(ptr.is_null() == (len == 0));
        // We drop `original` and `private` here, closing both fds.  The mapping
        // itself keeps `private`'s inode alive.  Unmapping will drop the
        // linkcount on the inode to zero and destroy it.
        Ok(Self { ptr, len })
    }
}

impl Deref for Mmap {
    type Target = [u8];

    /// View the snapshot as a byte slice (empty if nothing is mapped).
    fn deref(&self) -> &[u8] {
        if self.len == 0 {
            // Nothing is mapped and `ptr` is null; hand out a static empty slice.
            return &[];
        }
        // SAFETY:
        // > `ptr` must be non-null
        //
        // `len` is non-zero (checked just above), and `open()` asserts that
        // `ptr` is null only when `len` is zero.
        //
        // > `ptr` must be valid for reads for `len * size_of::<T>()` many bytes
        // > ...
        // > The entire memory range of this slice must be contained within a single allocation!
        //
        // The range is exactly what one `mmap()` call of length `len` gave us.
        //
        // >   * `ptr` must be properly aligned
        //
        // `u8` has alignment 1, so any pointer is aligned.
        //
        // > `ptr` must point to `len` consecutive properly initialized values
        // > of type `u8`.
        //
        // A file-backed mapping is always initialised: files have no
        // uninitialised bytes, and holes in sparse files read as zeroes.
        //
        // > The memory referenced by the returned slice must not be mutated for
        // > the duration of lifetime `'a`, except inside an `UnsafeCell`.
        //
        // We never write to the mapping ourselves, so it could only change
        // through writes to the backing file.  Those can't happen:
        //
        // * The backing file was created with `O_TMPFILE`, so it can't be
        //   opened through the filesystem.
        // * Our fd to it was closed without ever being exposed, so nobody
        //   could have cloned it.
        // * Hence no fd refers to the backing file at all,
        // * and the mapping is the only way to reach its contents.
        //
        // > The total size `len * size_of::<T>()` of the slice must be no
        // > larger than `isize::MAX`
        //
        // `open()` rejects anything that large.
        //
        // > adding `len * size_of::<T>()` to `ptr` must not "wrap around" the
        // > address space.
        //
        // `mmap()` only places a mapping where all of it fits, so
        // `self.ptr.add(self.len)` can't wrap.
        unsafe { core::slice::from_raw_parts(self.ptr.cast_const().cast::<u8>(), self.len) }
    }
}

impl Drop for Mmap {
    /// Unmap the snapshot.  A failure is reported on stderr, since `drop`
    /// can't return it.
    fn drop(&mut self) {
        if self.len == 0 {
            // Nothing was ever mapped.
            return;
        }
        // SAFETY:
        // > `ptr` must be aligned to the applicable page size, and the range of memory
        // > starting at `ptr` and extending for `len` bytes, rounded up to the
        // > applicable page size, must be valid to mutate with `ptr`'s provenance.
        //
        // `self.ptr` is what `mmap()` returned for a mapping of `self.len` bytes.
        //
        // > And there must be no Rust references referring to that memory.
        //
        // References into the mapping are only handed out by the Deref impl,
        // and they borrow the `Mmap`.  We hold `&mut self`, so none of them
        // can still be alive.
        let unmapped = unsafe { munmap(self.ptr, self.len) };
        if let Err(e) = unmapped {
            eprintln!("munmap failed: {e}");
        }
    }
}

/// A mutable snapshot of a file
///
/// The snapshot can be modified and then atomically committed to disk,
/// overwriting the contents of the file.
///
/// ## Reading
///
/// Read the file contents using the `Deref` impl.  The data you see will
/// reflect the state of the file at the time `open()` was called; writes by
/// other processes are not reflected.  In other words, `MmapMut` will show you
/// a consistent point-in-time snapshot of the file.
///
/// Data is not loaded eagerly into memory.  It will be read in from disk on demand.
/// For this we rely on the COW capabilities of the underlying filesystem.
///
/// ## Writing
///
/// Modify the contents using the `DerefMut` impl.  Writes will not be visible
/// to other processes reading the file until you call `commit()`.  Once you
/// call `commit()`, all your modifications will be atomically visible to other
/// readers.  If you drop the `MmapMut` without calling `commit()`, your writes
/// will be lost!
///
/// Modifications are written to disk continuously in the background; `commit()`
/// simply waits for writeback to finish, and then makes the written changes
/// visible.
pub struct MmapMut {
    // How to reach the file that `commit()` overwrites.
    original: OriginalFile,
    private: File, // Unlinked; initially a clone of `original`
    // Start of the read/write mapping of `private`; null iff `len == 0`.
    ptr: *mut c_void,
    // Length of the mapping in bytes; zero means nothing is mapped.
    len: usize,
}

/// How a `MmapMut` refers to the file it was opened from.
///
/// The variant is chosen when the snapshot is taken and decides how a commit
/// publishes the snapshot: by reflinking into the open fd, or by linking a
/// file over the path.
enum OriginalFile {
    /// In this case the file is on a reflink-capable filesystem
    Fd(File),
    /// In this case the file is on a reflink-incapable filesystem
    Path(PathBuf),
}

// SAFETY:
// All members of `MmapMut` implement Send except for `ptr`.  `ptr`+`len` are just
// "plain old memory" (that's the point of the trick with the private unlinked
// file.)  Accessing it from multiple threads is fine.
unsafe impl Send for MmapMut {}
// SAFETY:
// The mapping (`ptr`+`len`) is fine to read concurrently from multiple threads.
// A shared `&MmapMut` only allows reading it (via `Deref`); every public method
// that writes to the mapping or touches `private` needs `&mut self` or takes
// `self` by value.
unsafe impl Sync for MmapMut {}

impl MmapMut {
    /// Take a snapshot of the file and map it into memory.
    ///
    /// Note that changes to the snapshot will be discarded unless you call
    /// [`MmapMut::commit`].
    ///
    /// # Performance
    ///
    /// If the filesystem _doesn't_ support reflinks (eg. ext4) then this
    /// will physically duplicate the file on disk.  If the file is large then
    /// clearly this will be slow and consume I/O bandwidth.  The duplicate will
    /// be deleted when the `MmapMut` is dropped.
    ///
    /// If the filesystem _does_ support reflinks (XFS, btrfs) then we simply
    /// mark the file as "copy on write" until the `MmapMut` is dropped.  This is
    /// O(1) and fast: on my machine it takes just 0.1 ms longer than a plain
    /// old `File::open()`.  Disk usage will not increase until the file is
    /// modified.
    pub fn open(path: impl AsRef<Path>) -> io::Result<Self> {
        let path = path.as_ref();
        let original = File::options().read(true).write(true).open(path)?;
        let len = original.metadata()?.len() as usize;
        if len >= isize::MAX as usize {
            return Err(io::ErrorKind::FileTooLarge.into());
        }
        let dir = path.parent().filter(|x| *x != "").unwrap_or(Path::new("."));
        let private: File =
            open(dir, OFlags::TMPFILE | OFlags::RDWR, Mode::RUSR | Mode::WUSR)?.into();
        let fellback = ficlone(&private, &original, len)?;

        let ptr;
        if len == 0 {
            ptr = std::ptr::null_mut();
        } else {
            // SAFETY:
            // > If `ptr` is not null, it must be aligned...
            //
            // `ptr` is null.
            //
            // > If there exist any Rust references referring to the memory region
            //
            // We're letting the kernel pick an unused region so there shouldn't be any.
            //
            // > or if you subsequently create a Rust reference referring to the
            // > resulting region,
            //
            // We will be doing this.
            //
            // > it is your responsibility to ensure that the Rust reference invariants are
            // > preserved, including ensuring that the memory is not mutated in a way that
            // > a Rust reference would not expect.
            //
            // See the safety comment in the DerefMut impl.
            unsafe {
                ptr = mmap(
                    std::ptr::null_mut(),
                    len,
                    ProtFlags::READ | ProtFlags::WRITE,
                    MapFlags::SHARED,
                    &private,
                    0,
                )?;
            }
        };
        assert!(ptr.is_null() == (len == 0));
        Ok(Self {
            private,
            ptr,
            len,
            original: if fellback {
                OriginalFile::Path(path.to_owned())
            } else {
                OriginalFile::Fd(original)
            },
        })
    }

    /// Create a mapping with zero length.
    ///
    /// Use [`MmapMut::resize`] to increase the length.  The file won't be
    /// created until the first time you call [`MmapMut::commit`].
    pub fn create(path: impl AsRef<Path>) -> io::Result<Self> {
        let path = path.as_ref();
        let original = File::create(path)?;
        let dir = path.parent().filter(|x| *x != "").unwrap_or(Path::new("."));
        let private: File =
            open(dir, OFlags::TMPFILE | OFlags::RDWR, Mode::RUSR | Mode::WUSR)?.into();
        let ptr = std::ptr::null_mut();
        let len = 0;
        // The clone is a no-op of course; we just do this to find out whether
        // reflinks are available
        let fellback = matches!(ioctl_ficlone(&private, &original), Err(Errno::OPNOTSUPP));
        Ok(Self {
            private,
            ptr,
            len,
            original: if fellback {
                OriginalFile::Path(path.to_owned())
            } else {
                OriginalFile::Fd(original)
            },
        })
    }

    /// Atomically replace the original file with the contents of the snapshot.
    ///
    /// You can continue to read/write the mmap after calling `commit()`.
    ///
    /// # Performance
    ///
    /// It's a similar story to [`MmapMut::open()`]: if the filesystem supports
    /// reflinks it'll be a fast O(1) (after waiting for writeback to finish);
    /// otherwise it'll be O(n).
    ///
    /// If you're done with the file you can use [`MmapMut::commit_and_close`],
    /// which is always O(1).
    pub fn commit(&mut self) -> io::Result<()> {
        self.sync()?;
        match &self.original {
            OriginalFile::Fd(original) => ioctl_ficlone(original, &self.private)?,
            OriginalFile::Path(path) => {
                // We can't just copy self.private to self.original, since
                // this would not be atomic. And we need to keep self.private
                // unlinked. So we create a new private file, copy over the
                // contents, and link it.
                let dir = path.parent().filter(|x| *x != "").unwrap_or(Path::new("."));
                let private2: File =
                    open(dir, OFlags::TMPFILE | OFlags::RDWR, Mode::RUSR | Mode::WUSR)?.into();
                // This is non-atomic but that's fine, since we're holding &mut
                // self and therefore `self.private` can't receive modifications
                // while the copy is in-progress
                ficlone(&private2, &self.private, self.len)?;
                link(&private2, path)?;
            }
        }
        Ok(())
    }

    /// Atomically replace the original file with the contents of the snapshot and close it.
    ///
    /// Atomic and O(1).
    pub fn commit_and_close(self) -> io::Result<()> {
        self.sync()?;
        match &self.original {
            OriginalFile::Fd(original) => ioctl_ficlone(original, &self.private)?,
            OriginalFile::Path(path) => {
                // `path` is always on the same filesystem as the original file - it
                // _is_ the original file!  So this is atomic.
                link(&self.private, path)?;
            }
        }
        Ok(())
    }

    /// Link this snapshot to the directory tree at the given path.
    ///
    /// Atomic and O(1) if `path` is on the same filesystem as the original
    /// file.
    pub fn link(self, path: impl AsRef<Path>) -> io::Result<()> {
        self.sync()?;
        link(&self.private, path.as_ref())?;
        Ok(())
    }

    fn sync(&self) -> io::Result<()> {
        if self.len != 0 {
            // SAFETY:
            // > `addr` must be a valid pointer to memory that is appropriate to call
            // > `msync` on.
            //
            // Given that len is non-zero, `self.ptr` is a pointer which
            // came from `mmap()`, and `self.len` is the length we passed to
            // `mmap()`, so together these describe an mmapped region and are
            // safe to pass to `msync()`.
            unsafe {
                msync(self.ptr, self.len, MsyncFlags::SYNC)?;
            }
        }
        Ok(())
    }

    /// Change the size of the file.  If extending, the extension is filled with zeroes.
    pub fn resize(&mut self, new_len: usize) -> io::Result<()> {
        if new_len >= isize::MAX as usize {
            return Err(io::ErrorKind::FileTooLarge.into());
        }
        if new_len == self.len {
            return Ok(());
        }
        ftruncate(&self.private, new_len as u64)?;
        if new_len == 0 {
            // SAFETY: See the Drop impl
            unsafe {
                munmap(self.ptr, self.len)?;
            }
            self.ptr = std::ptr::null_mut();
        } else if self.len == 0 {
            // SAFETY: See MmapMut::open()
            unsafe {
                self.ptr = mmap(
                    std::ptr::null_mut(),
                    new_len,
                    ProtFlags::READ | ProtFlags::WRITE,
                    MapFlags::SHARED,
                    &self.private,
                    0,
                )?;
            }
        } else {
            // SAFETY:
            // > `self.ptr` must be aligned to the applicable page size, and the range of
            // > memory starting at `self.ptr` and extending for `self.len` bytes,
            // > rounded up to the applicable page size, must be valid to mutate with
            // > `self.ptr`'s provenance.
            //
            // `self.ptr` comes from an mmap with length `self.len`, so this should
            // all hold.
            //
            // > If `MremapFlags::MAY_MOVE` is set in `flags`,
            // > there must be no Rust references referring to that the memory.
            //
            // This flag is set, so `mremap()` might move the mapping to a
            // completely new address. The only way to get references to the mapping
            // is via the Deref/DerefMut impls, which take borrows on the `MmapMut`. Since
            // this method takes `&mut self`, we know that no such references are
            // live.
            //
            // > If `new_len` is less than `self.len`, than there must be no Rust
            // > references referring to the memory starting at offset `new_len` and ending
            // > at `self.len`.
            //
            // As per the above, there are no live references at all into the
            // mapping.
            unsafe {
                self.ptr = mremap(self.ptr, self.len, new_len, MremapFlags::MAYMOVE)?;
            }
        }
        self.len = new_len;
        assert!(self.ptr.is_null() == (self.len == 0));
        Ok(())
    }
}

// Now the most annoying part: you can't just link the file to
// `path` because it'll fail if `path` already exists (which it
// does).  So we have to do another little dance, and this one
// is actually racy :-(
// TODO: We can avoid all this if/when Linux ever gets `AtFlags::REPLACE`.
//
// The dance: hard-link `fd` under a temporary name next to `path` (appending
// another ".tmp" for as long as the name is taken), then rename that link over
// `path`.  If the rename fails, the temporary link is removed again so we don't
// leave a stray file behind.
//
// NOTE(review): `linkat()` with `AT_EMPTY_PATH` has historically needed
// elevated privileges on Linux; the usual unprivileged route goes through
// `/proc/self/fd/N` with `AT_SYMLINK_FOLLOW`.  Confirm which kernels we need
// to support.
fn link(fd: &File, path: &Path) -> io::Result<()> {
    let mut tmppath = path.with_added_extension(".tmp");
    loop {
        match linkat(fd, "", rustix::fs::CWD, &tmppath, AtFlags::EMPTY_PATH) {
            Ok(()) => break, // `tmppath` now names our file
            Err(Errno::EXIST) => {
                // Name already taken; try again with a longer one...
                tmppath = tmppath.with_added_extension(".tmp");
            }
            Err(e) => return Err(e.into()),
        }
    }
    if let Err(e) = rename(&tmppath, path) {
        // Best-effort cleanup; the rename error is the one worth reporting.
        let _ = std::fs::remove_file(&tmppath);
        return Err(e.into());
    }
    Ok(()) // we did it!
}

impl Deref for MmapMut {
    type Target = [u8];

    /// View the snapshot as a byte slice (empty if nothing is mapped).
    fn deref(&self) -> &[u8] {
        if self.len == 0 {
            // Nothing is mapped and `ptr` is null; hand out a static empty slice.
            return &[];
        }
        // SAFETY: See the `DerefMut` impl.
        unsafe { core::slice::from_raw_parts(self.ptr.cast_const().cast::<u8>(), self.len) }
    }
}

impl DerefMut for MmapMut {
    /// View the snapshot as a mutable byte slice (empty if nothing is mapped).
    fn deref_mut(&mut self) -> &mut [u8] {
        if self.len == 0 {
            // Nothing is mapped and `ptr` is null; hand out an empty slice.
            return &mut [];
        }
        // SAFETY:
        // > `ptr` must be valid for both reads and writes for `len *
        // >  size_of::<T>()` many bytes ...
        // > The entire memory range of this slice must be contained within a
        // >  single allocation!
        //
        // The range is exactly one mapping of length `len`.
        //
        // > `ptr` must be non-null
        //
        // `len` is non-zero here, and every place that changes `ptr`
        // (`open()` and `resize()`) asserts that it is null only when `len`
        // is zero.
        //
        // > `ptr` must be properly aligned
        //
        // `u8` has alignment 1, so any pointer is aligned.
        //
        // > `ptr` must point to `len` consecutive properly initialized values
        // > of type `u8`.
        //
        // A file-backed mapping is always initialised: files have no
        // uninitialised bytes, and holes in sparse files read as zeroes.
        //
        // > The memory referenced by the returned slice must not be accessed
        // > through any other pointer (not derived from the return value) for
        // > the duration of lifetime `'a`. Both read and write accesses are
        // > forbidden.
        //
        // This is the big one.  I believe it holds provided that:
        //
        // * this `DerefMut` impl is the only way to mutate the memory, and
        // * this impl and the `Deref` impl are the only ways to read it.
        //
        // Besides those two impls, the memory could in principle be reached
        // through operations on the backing file.  But no such operations can
        // happen while the returned borrow is live:
        //
        // * The backing file was created with `O_TMPFILE`, so it can't be
        //   opened through the filesystem.
        // * Our fd is never exposed, so nobody can `clone()` it.
        // * Hence `self.private` is the _only_ fd for the backing file.
        // * Every public method that uses `self.private` takes `&mut self` or
        //   consumes `self`,
        // * so none of them can run while `'a` is live,
        // * and the mapping is the only way to reach the memory.
        //
        // > The total size `len * size_of::<T>()` of the slice must be no
        // > larger than `isize::MAX`, and adding that size to `ptr` must not
        // > "wrap around" the address space. See the safety documentation of
        // > [`pointer::offset`].
        //
        // `mmap()` only places a mapping where all of it fits, so
        // `self.ptr.add(self.len)` can't wrap.  `open()` and `resize()` both
        // refuse lengths of `isize::MAX` or more.
        unsafe { core::slice::from_raw_parts_mut(self.ptr.cast::<u8>(), self.len) }
    }
}

impl Drop for MmapMut {
    /// Unmap the snapshot.  A failure is reported on stderr, since `drop`
    /// can't return it.
    fn drop(&mut self) {
        if self.len == 0 {
            // Nothing is mapped.
            return;
        }
        // SAFETY:
        // > `ptr` must be aligned to the applicable page size, and the range of memory
        // > starting at `ptr` and extending for `len` bytes, rounded up to the
        // > applicable page size, must be valid to mutate with `ptr`'s provenance.
        //
        // `self.ptr` is a mapping of `self.len` bytes obtained from `mmap()`.
        //
        // > And there must be no Rust references referring to that memory.
        //
        // References into the mapping are only handed out by the
        // Deref/DerefMut impls, and they borrow the `MmapMut`.  We hold
        // `&mut self`, so none of them can still be alive.
        let unmapped = unsafe { munmap(self.ptr, self.len) };
        if let Err(e) = unmapped {
            eprintln!("munmap failed: {e}");
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // TODO: These could use some improvement.  Ideally I'd mount a bunch
    // of different filesystems... but that requires root.  Anyway, I should
    // systematically make sure the files used by different tests don't
    // conflict, and that they're cleaned up at the end.

    /// Yields `<base>/mmap-snapshot/<name>` for each of `/tmp` and `/var/tmp`,
    /// creating the `mmap-snapshot` directory as each path is produced.
    fn paths(name: &str) -> impl Iterator<Item = PathBuf> {
        ["/tmp", "/var/tmp"].into_iter().map(move |base| {
            let scratch = Path::new(base).join("mmap-snapshot");
            std::fs::create_dir_all(&scratch).unwrap();
            scratch.join(name)
        })
    }

    /// A read-only snapshot is unaffected by later writes to, or removal of,
    /// the original file.
    #[test]
    fn mmap() -> std::io::Result<()> {
        for path in paths("mmap") {
            std::fs::write(&path, b"Hello world!")?;
            let snapshot = Mmap::open(&path)?;

            std::fs::write(&path, b"Goodbye world!")?;
            assert_eq!(&*snapshot, b"Hello world!");

            std::fs::remove_file(&path)?;
            assert_eq!(&*snapshot, b"Hello world!");
        }
        Ok(())
    }

    /// Writes to a mutable snapshot reach the file only after `commit()`.
    #[test]
    fn mmap_mut() -> std::io::Result<()> {
        for path in paths("mmap_mut") {
            std::fs::write(&path, b"Hello world!")?;
            let mut snapshot = MmapMut::open(&path)?;
            assert_eq!(&*snapshot, b"Hello world!");

            snapshot[6..11].copy_from_slice(b"sekai");
            assert_eq!(&*snapshot, b"Hello sekai!");
            assert_eq!(std::fs::read_to_string(&path)?, "Hello world!");

            snapshot.commit()?;
            drop(snapshot);
            assert_eq!(std::fs::read_to_string(&path)?, "Hello sekai!");

            std::fs::remove_file(&path)?;
        }
        Ok(())
    }

    /// An empty file yields an empty snapshot, before and after removal.
    #[test]
    fn zero_len() -> std::io::Result<()> {
        for path in paths("zero_len") {
            File::create(&path)?;
            let snapshot = Mmap::open(&path)?;
            assert_eq!(&*snapshot, b"");

            std::fs::remove_file(&path)?;
            assert_eq!(&*snapshot, b"");
        }
        Ok(())
    }

    /// An empty mutable snapshot can be grown, written and committed twice.
    #[test]
    fn zero_len_mut() -> std::io::Result<()> {
        for path in paths("zero_len_mut") {
            File::create(&path)?;
            let mut snapshot = MmapMut::open(&path)?;
            assert_eq!(&*snapshot, b"");

            snapshot.resize(12)?;
            snapshot.copy_from_slice(b"Hello world!");
            assert_eq!(std::fs::read_to_string(&path)?, "");
            snapshot.commit()?;
            assert_eq!(std::fs::read_to_string(&path)?, "Hello world!");

            snapshot[6..11].copy_from_slice(b"sekai");
            assert_eq!(&*snapshot, b"Hello sekai!");
            assert_eq!(std::fs::read_to_string(&path)?, "Hello world!");
            snapshot.commit()?;
            drop(snapshot);
            assert_eq!(std::fs::read_to_string(&path)?, "Hello sekai!");

            std::fs::remove_file(&path)?;
        }
        Ok(())
    }
}