Skip to main content

kevy_madvise/
lib.rs

1//! kevy-madvise — thin pure-Rust `madvise` hints.
2//!
3//! Best-effort transparent-huge-page support: tell Linux a region is a
4//! candidate for huge pages (`MADV_HUGEPAGE`). Hand-bound with `unsafe extern
5//! "C"` against glibc — no `libc` crate, no third-party dependency. Off Linux
6//! every entry point is a compile-time no-op.
7//!
8//! Carved out of `kevy-sys` so other library crates (like `kevy-map`) can use
9//! it without dragging the rest of the OS-boundary internals along. Entry points:
10//! [`advise_hugepage`], [`mmap_anon_aligned_2mb`], and [`munmap_2mb`].
11//!
12//! # Safety
13//!
14//! `unsafe` is confined to one `extern "C"` block (`madvise(2)`, `mmap(2)`,
15//! `munmap(2)`) and the wrapper call sites in this file. [`advise_hugepage`]
16//! rounds the request to page boundaries, never reads or writes Rust memory,
17//! and ignores a kernel `EINVAL` — so it is a plain `fn`; [`munmap_2mb`] is `unsafe fn`.
18
19#![forbid(unsafe_op_in_unsafe_fn)]
20
#[cfg(target_os = "linux")]
mod ffi {
    use core::ffi::{c_int, c_void};

    // The three libc symbols this crate binds by hand: `mmap`, `munmap` and
    // `madvise`. All call sites live in this file. No `#[link]` attribute is
    // needed — the symbols resolve through the libc that `std` already links.
    //
    // NOTE(review): `off` is declared as `i64`. C `off_t` is narrower than
    // 64 bits on some 32-bit glibc targets — confirm the ABI before building
    // this crate for a 32-bit Linux target.
    unsafe extern "C" {
        pub fn mmap(
            hint: *mut c_void,
            len: usize,
            prot: c_int,
            flags: c_int,
            fd: c_int,
            off: i64,
        ) -> *mut c_void;
        pub fn munmap(start: *mut c_void, len: usize) -> c_int;
        pub fn madvise(start: *mut c_void, len: usize, advice: c_int) -> c_int;
    }
}
41
42/// Hint the kernel that the region `[ptr, ptr+len)` is a candidate for
43/// transparent huge pages (Linux `MADV_HUGEPAGE`). A best-effort kernel
44/// hint — returns nothing; mis-alignment / unsupported kernels silently
45/// no-op. Off Linux this is a compile-time no-op.
46///
47/// Used by [`kevy-map`](https://crates.io/crates/kevy-map) to drop dTLB-load
48/// misses on the metadata + slot arrays of large keyspace tables. madvise
49/// expects page-aligned `addr` and a page-multiple `length`; we round addr
50/// UP and len DOWN to 4 KiB. If nothing remains, we don't call. Regions
51/// smaller than ~ a few pages are not worth a syscall.
52pub fn advise_hugepage(ptr: *const u8, len: usize) {
53    // Miri cannot execute foreign syscalls; madvise is purely advisory, so
54    // a no-op under miri preserves correctness and lets miri exercise the
55    // rest of the program.
56    if cfg!(miri) {
57        let _ = (ptr, len);
58        return;
59    }
60    #[cfg(target_os = "linux")]
61    {
62        use core::ffi::{c_int, c_void};
63        // 4 KiB base page is universal on x86_64 / aarch64 Linux setups
64        // kevy targets. (On systems using 16 KiB / 64 KiB pages the wider
65        // alignment still happens to be a 4-KiB multiple, so this is
66        // correct, just slightly more conservative.)
67        const PAGE: usize = 4096;
68        if len < PAGE * 2 {
69            return;
70        }
71        let start = ptr as usize;
72        let aligned_start = (start + PAGE - 1) & !(PAGE - 1);
73        let end = start + len;
74        if aligned_start >= end {
75            return;
76        }
77        let aligned_len = (end - aligned_start) & !(PAGE - 1);
78        if aligned_len < PAGE * 2 {
79            return;
80        }
81        // Linux MADV_HUGEPAGE = 14 (mm/madvise.c, asm-generic/mman-common.h).
82        const MADV_HUGEPAGE: c_int = 14;
83        // SAFETY: ffi::madvise is a kernel advise call; it reads no Rust
84        // memory, performs no writes, and is benign on error (EINVAL on
85        // mis-aligned / unsupported kernels is what we want — no-op).
86        unsafe {
87            let _ = ffi::madvise(
88                aligned_start as *mut c_void,
89                aligned_len,
90                MADV_HUGEPAGE,
91            );
92        }
93    }
94    #[cfg(not(target_os = "linux"))]
95    {
96        let _ = (ptr, len);
97    }
98}
99
/// Huge-page granule used for alignment and length rounding: 2 MiB, the
/// x86_64 / aarch64 transparent-huge-page boundary.
#[cfg(target_os = "linux")]
const HUGE_PAGE: usize = 2 << 20;
103
104/// Allocate `len` bytes via anonymous `mmap`, with the returned address
105/// **2 MiB-aligned** AND the mapped length rounded up to a 2 MiB multiple.
106/// Then calls `MADV_HUGEPAGE` on the returned region.
107///
108/// 2 MiB alignment is what transparent huge pages require for the kernel
109/// to promote a region: the global allocator (jemalloc-like chunk
110/// placement) puts even MB-scale allocations at 4 KiB-aligned addresses
111/// inside its arenas, so `khugepaged` cannot find a 2 MiB-aligned
112/// candidate to promote even with `advise_hugepage` set. Allocating
113/// straight from `mmap` and explicitly aligning gives the kernel a
114/// promotion target.
115///
116/// **Linux only**: off Linux this returns `None` (the caller is expected
117/// to fall back to the global allocator). Returns `None` on `mmap`
118/// failure too — the caller should not panic; fall back instead.
119///
120/// The returned pointer must be released via [`munmap_2mb`]; passing it
121/// to `dealloc()` is UB.
122pub fn mmap_anon_aligned_2mb(len: usize) -> Option<core::ptr::NonNull<u8>> {
123    if cfg!(miri) || len == 0 {
124        return None;
125    }
126    #[cfg(target_os = "linux")]
127    {
128        use core::ffi::c_void;
129        // Linux mmap flags (asm-generic/mman.h + sys/mman.h):
130        const PROT_READ: i32 = 0x1;
131        const PROT_WRITE: i32 = 0x2;
132        const MAP_PRIVATE: i32 = 0x2;
133        const MAP_ANONYMOUS: i32 = 0x20;
134        const MAP_FAILED: *mut c_void = !0usize as *mut c_void;
135        let rounded = (len + HUGE_PAGE - 1) & !(HUGE_PAGE - 1);
136        // Over-allocate by one HP so we can trim down to a 2 MiB-aligned
137        // start — mmap returns page-aligned (4 KiB), not HP-aligned.
138        let total = rounded.checked_add(HUGE_PAGE)?;
139        // SAFETY: mmap is the canonical anonymous map; no Rust memory is
140        // read or written. NULL addr lets the kernel pick.
141        let raw = unsafe {
142            ffi::mmap(
143                core::ptr::null_mut(),
144                total,
145                PROT_READ | PROT_WRITE,
146                MAP_PRIVATE | MAP_ANONYMOUS,
147                -1,
148                0,
149            )
150        };
151        if raw == MAP_FAILED {
152            return None;
153        }
154        let raw_addr = raw as usize;
155        let aligned_start = (raw_addr + HUGE_PAGE - 1) & !(HUGE_PAGE - 1);
156        let prefix = aligned_start - raw_addr;
157        let suffix = total - prefix - rounded;
158        // Trim the unaligned prefix.
159        if prefix > 0 {
160            // SAFETY: prefix bytes at `raw` are exactly what we just mapped.
161            unsafe {
162                ffi::munmap(raw, prefix);
163            }
164        }
165        // Trim the trailing slack past the aligned region.
166        if suffix > 0 {
167            // SAFETY: `aligned_start + rounded` is inside the mapping.
168            unsafe {
169                ffi::munmap((aligned_start + rounded) as *mut c_void, suffix);
170            }
171        }
172        // Best-effort huge-page hint. EINVAL on unsupported kernels =
173        // benign — the mapping still works at 4 KiB pages.
174        const MADV_HUGEPAGE: i32 = 14;
175        // SAFETY: `aligned_start..aligned_start+rounded` is fully mapped,
176        // HP-aligned, HP-multiple. madvise reads no Rust memory.
177        unsafe {
178            let _ = ffi::madvise(aligned_start as *mut c_void, rounded, MADV_HUGEPAGE);
179        }
180        return core::ptr::NonNull::new(aligned_start as *mut u8);
181    }
182    #[cfg(not(target_os = "linux"))]
183    {
184        let _ = len;
185        None
186    }
187}
188
189/// Release a buffer previously returned by [`mmap_anon_aligned_2mb`].
190/// `len` must equal the original allocation length (or any value within
191/// the same 2 MiB-rounded total — the function rounds internally to match).
192/// Passing a pointer NOT obtained from [`mmap_anon_aligned_2mb`] is UB.
193///
194/// **Linux only**; on other targets this is a compile-time no-op (the
195/// caller should never have a non-None pointer to free).
196///
197/// # Safety
198/// `ptr` must come from a successful [`mmap_anon_aligned_2mb`] call and
199/// not yet have been munmap'd. `len` must match the original `len` arg.
200pub unsafe fn munmap_2mb(ptr: core::ptr::NonNull<u8>, len: usize) {
201    if cfg!(miri) {
202        let _ = (ptr, len);
203        return;
204    }
205    #[cfg(target_os = "linux")]
206    {
207        use core::ffi::c_void;
208        let rounded = (len + HUGE_PAGE - 1) & !(HUGE_PAGE - 1);
209        // SAFETY: caller guarantees ptr is a live mapping of `rounded`
210        // bytes from this module.
211        unsafe {
212            let _ = ffi::munmap(ptr.as_ptr() as *mut c_void, rounded);
213        }
214    }
215    #[cfg(not(target_os = "linux"))]
216    {
217        let _ = (ptr, len);
218    }
219}
220
#[cfg(test)]
mod tests {
    use super::*;

    // These are smoke tests: `advise_hugepage` returns nothing and there is
    // no syscall hook, so each case only checks that the call comes back
    // without panicking.

    #[test]
    fn no_call_below_two_pages() {
        // 1 KiB is under the two-page (8 KiB) minimum, so the function
        // returns before it would reach the syscall.
        let tiny = [0u8; 1024];
        advise_hugepage(tiny.as_ptr(), tiny.len());
    }

    #[test]
    fn unaligned_buffer_does_not_panic() {
        // Start 7 bytes into a 16 KiB buffer: the wrapper must round
        // inward and either advise the aligned subset or do nothing.
        let backing = vec![0u8; 16 * 1024];
        let skewed_start = backing.as_ptr().wrapping_add(7);
        let skewed_len = backing.len() - 7;
        advise_hugepage(skewed_start, skewed_len);
    }

    #[test]
    fn zero_length_is_noop() {
        let nowhere: *const u8 = core::ptr::null();
        advise_hugepage(nowhere, 0);
    }

    #[test]
    fn large_aligned_region_runs() {
        // 64 KiB is large enough to pass every size guard. On Linux the
        // syscall is issued; elsewhere the body is compiled out.
        let region = vec![0u8; 64 * 1024];
        advise_hugepage(region.as_ptr(), region.len());
    }
}