kevy_madvise/lib.rs
1//! kevy-madvise — thin pure-Rust `madvise` hints.
2//!
//! Best-effort kernel hints for transparent huge pages: tell Linux a region
//! is a candidate for them (`MADV_HUGEPAGE`), and optionally allocate a
//! 2 MiB-aligned anonymous mapping that the kernel can promote. Hand-bound
//! with `unsafe extern "C"` against glibc — no `libc` crate, no third-party
//! dependency. Off Linux every entry point is a compile-time no-op.
7//!
//! Carved out of `kevy-sys` so it can be used by other library crates (like
//! `kevy-map`) without dragging the rest of the OS-boundary internals along. See
//! [`advise_hugepage`] for the hint itself, and [`mmap_anon_aligned_2mb`] /
//! [`munmap_2mb`] for the 2 MiB-aligned allocation pair.
11//!
12//! # Safety
13//!
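//! `unsafe` is confined to one `extern "C"` block declaring `madvise(2)`,
//! `mmap(2)` and `munmap(2)`, plus the wrapper call sites in this file.
//! [`advise_hugepage`] rounds the request to page boundaries, never reads or
//! writes Rust memory, and silently no-ops when the kernel returns `EINVAL`
//! — making it safe to expose as a plain `fn`. [`munmap_2mb`] stays `unsafe`:
//! its caller must pass a live mapping obtained from
//! [`mmap_anon_aligned_2mb`].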
18
19#![forbid(unsafe_op_in_unsafe_fn)]
20
#[cfg(target_os = "linux")]
mod ffi {
    use core::ffi::{c_int, c_void};

    // Hand-written declarations for the three libc symbols kevy-madvise
    // touches (`mmap`, `munmap`, `madvise`); every call site lives in this
    // file. glibc resolves them through `std`'s existing linkage, so no
    // extra link directive is needed.
    unsafe extern "C" {
        /// `mmap(2)`: create a new mapping; returns `MAP_FAILED`
        /// (`(void *) -1`) on error.
        ///
        /// `file_offset` is C's `off_t`, declared here as `i64`.
        /// NOTE(review): that width matches 64-bit Linux targets — confirm
        /// before building for a 32-bit target.
        pub fn mmap(
            hint: *mut c_void,
            len: usize,
            protection: c_int,
            map_flags: c_int,
            file: c_int,
            file_offset: i64,
        ) -> *mut c_void;

        /// `munmap(2)`: remove the mapping covering `[start, start + len)`.
        pub fn munmap(start: *mut c_void, len: usize) -> c_int;

        /// `madvise(2)`: give the kernel advice about `[start, start + len)`.
        pub fn madvise(start: *mut c_void, len: usize, advice: c_int) -> c_int;
    }
}
41
42/// Hint the kernel that the region `[ptr, ptr+len)` is a candidate for
43/// transparent huge pages (Linux `MADV_HUGEPAGE`). A best-effort kernel
44/// hint — returns nothing; mis-alignment / unsupported kernels silently
45/// no-op. Off Linux this is a compile-time no-op.
46///
47/// Used by [`kevy-map`](https://crates.io/crates/kevy-map) to drop dTLB-load
48/// misses on the metadata + slot arrays of large keyspace tables. madvise
49/// expects page-aligned `addr` and a page-multiple `length`; we round addr
50/// UP and len DOWN to 4 KiB. If nothing remains, we don't call. Regions
51/// smaller than ~ a few pages are not worth a syscall.
52pub fn advise_hugepage(ptr: *const u8, len: usize) {
53 // Miri cannot execute foreign syscalls; madvise is purely advisory, so
54 // a no-op under miri preserves correctness and lets miri exercise the
55 // rest of the program.
56 if cfg!(miri) {
57 let _ = (ptr, len);
58 return;
59 }
60 #[cfg(target_os = "linux")]
61 {
62 use core::ffi::{c_int, c_void};
63 // 4 KiB base page is universal on x86_64 / aarch64 Linux setups
64 // kevy targets. (On systems using 16 KiB / 64 KiB pages the wider
65 // alignment still happens to be a 4-KiB multiple, so this is
66 // correct, just slightly more conservative.)
67 const PAGE: usize = 4096;
68 if len < PAGE * 2 {
69 return;
70 }
71 let start = ptr as usize;
72 let aligned_start = (start + PAGE - 1) & !(PAGE - 1);
73 let end = start + len;
74 if aligned_start >= end {
75 return;
76 }
77 let aligned_len = (end - aligned_start) & !(PAGE - 1);
78 if aligned_len < PAGE * 2 {
79 return;
80 }
81 // Linux MADV_HUGEPAGE = 14 (mm/madvise.c, asm-generic/mman-common.h).
82 const MADV_HUGEPAGE: c_int = 14;
83 // SAFETY: ffi::madvise is a kernel advise call; it reads no Rust
84 // memory, performs no writes, and is benign on error (EINVAL on
85 // mis-aligned / unsupported kernels is what we want — no-op).
86 unsafe {
87 let _ = ffi::madvise(
88 aligned_start as *mut c_void,
89 aligned_len,
90 MADV_HUGEPAGE,
91 );
92 }
93 }
94 #[cfg(not(target_os = "linux"))]
95 {
96 let _ = (ptr, len);
97 }
98}
99
100/// 2 MiB — the x86_64 / aarch64 transparent-huge-page boundary.
101#[cfg(target_os = "linux")]
102const HUGE_PAGE: usize = 2 * 1024 * 1024;
103
104/// Allocate `len` bytes via anonymous `mmap`, with the returned address
105/// **2 MiB-aligned** AND the mapped length rounded up to a 2 MiB multiple.
106/// Then calls `MADV_HUGEPAGE` on the returned region.
107///
108/// 2 MiB alignment is what transparent huge pages require for the kernel
109/// to promote a region: the global allocator (jemalloc-like chunk
110/// placement) puts even MB-scale allocations at 4 KiB-aligned addresses
111/// inside its arenas, so `khugepaged` cannot find a 2 MiB-aligned
112/// candidate to promote even with `advise_hugepage` set. Allocating
113/// straight from `mmap` and explicitly aligning gives the kernel a
114/// promotion target.
115///
116/// **Linux only**: off Linux this returns `None` (the caller is expected
117/// to fall back to the global allocator). Returns `None` on `mmap`
118/// failure too — the caller should not panic; fall back instead.
119///
120/// The returned pointer must be released via [`munmap_2mb`]; passing it
121/// to `dealloc()` is UB.
122pub fn mmap_anon_aligned_2mb(len: usize) -> Option<core::ptr::NonNull<u8>> {
123 if cfg!(miri) || len == 0 {
124 return None;
125 }
126 #[cfg(target_os = "linux")]
127 {
128 use core::ffi::c_void;
129 // Linux mmap flags (asm-generic/mman.h + sys/mman.h):
130 const PROT_READ: i32 = 0x1;
131 const PROT_WRITE: i32 = 0x2;
132 const MAP_PRIVATE: i32 = 0x2;
133 const MAP_ANONYMOUS: i32 = 0x20;
134 const MAP_FAILED: *mut c_void = !0usize as *mut c_void;
135 let rounded = (len + HUGE_PAGE - 1) & !(HUGE_PAGE - 1);
136 // Over-allocate by one HP so we can trim down to a 2 MiB-aligned
137 // start — mmap returns page-aligned (4 KiB), not HP-aligned.
138 let total = rounded.checked_add(HUGE_PAGE)?;
139 // SAFETY: mmap is the canonical anonymous map; no Rust memory is
140 // read or written. NULL addr lets the kernel pick.
141 let raw = unsafe {
142 ffi::mmap(
143 core::ptr::null_mut(),
144 total,
145 PROT_READ | PROT_WRITE,
146 MAP_PRIVATE | MAP_ANONYMOUS,
147 -1,
148 0,
149 )
150 };
151 if raw == MAP_FAILED {
152 return None;
153 }
154 let raw_addr = raw as usize;
155 let aligned_start = (raw_addr + HUGE_PAGE - 1) & !(HUGE_PAGE - 1);
156 let prefix = aligned_start - raw_addr;
157 let suffix = total - prefix - rounded;
158 // Trim the unaligned prefix.
159 if prefix > 0 {
160 // SAFETY: prefix bytes at `raw` are exactly what we just mapped.
161 unsafe {
162 ffi::munmap(raw, prefix);
163 }
164 }
165 // Trim the trailing slack past the aligned region.
166 if suffix > 0 {
167 // SAFETY: `aligned_start + rounded` is inside the mapping.
168 unsafe {
169 ffi::munmap((aligned_start + rounded) as *mut c_void, suffix);
170 }
171 }
172 // Best-effort huge-page hint. EINVAL on unsupported kernels =
173 // benign — the mapping still works at 4 KiB pages.
174 const MADV_HUGEPAGE: i32 = 14;
175 // SAFETY: `aligned_start..aligned_start+rounded` is fully mapped,
176 // HP-aligned, HP-multiple. madvise reads no Rust memory.
177 unsafe {
178 let _ = ffi::madvise(aligned_start as *mut c_void, rounded, MADV_HUGEPAGE);
179 }
180 return core::ptr::NonNull::new(aligned_start as *mut u8);
181 }
182 #[cfg(not(target_os = "linux"))]
183 {
184 let _ = len;
185 None
186 }
187}
188
189/// Release a buffer previously returned by [`mmap_anon_aligned_2mb`].
190/// `len` must equal the original allocation length (or any value within
191/// the same 2 MiB-rounded total — the function rounds internally to match).
192/// Passing a pointer NOT obtained from [`mmap_anon_aligned_2mb`] is UB.
193///
194/// **Linux only**; on other targets this is a compile-time no-op (the
195/// caller should never have a non-None pointer to free).
196///
197/// # Safety
198/// `ptr` must come from a successful [`mmap_anon_aligned_2mb`] call and
199/// not yet have been munmap'd. `len` must match the original `len` arg.
200pub unsafe fn munmap_2mb(ptr: core::ptr::NonNull<u8>, len: usize) {
201 if cfg!(miri) {
202 let _ = (ptr, len);
203 return;
204 }
205 #[cfg(target_os = "linux")]
206 {
207 use core::ffi::c_void;
208 let rounded = (len + HUGE_PAGE - 1) & !(HUGE_PAGE - 1);
209 // SAFETY: caller guarantees ptr is a live mapping of `rounded`
210 // bytes from this module.
211 unsafe {
212 let _ = ffi::munmap(ptr.as_ptr() as *mut c_void, rounded);
213 }
214 }
215 #[cfg(not(target_os = "linux"))]
216 {
217 let _ = (ptr, len);
218 }
219}
220
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn no_call_below_two_pages() {
        // Smaller than 2 * 4 KiB: short-circuit, never reaches the syscall.
        // We cannot directly assert "no syscall" without a hook, but the
        // function must at least return cleanly on a tiny buffer.
        let buf = [0u8; 1024];
        advise_hugepage(buf.as_ptr(), buf.len());
    }

    #[test]
    fn unaligned_buffer_does_not_panic() {
        // 16 KiB unaligned buffer; the wrapper rounds inward and either
        // calls madvise on the aligned subset or no-ops. Either way, no
        // panic, no UB.
        let buf = vec![0u8; 16 * 1024];
        advise_hugepage(buf.as_ptr().wrapping_add(7), buf.len() - 7);
    }

    #[test]
    fn zero_length_is_noop() {
        advise_hugepage(core::ptr::null(), 0);
    }

    #[test]
    fn large_aligned_region_runs() {
        // 64 KiB region — enough to clear all the page-alignment guards.
        // On Linux this issues the syscall; on macOS it's compile-time
        // out. We only assert the function completes.
        let buf = vec![0u8; 64 * 1024];
        advise_hugepage(buf.as_ptr(), buf.len());
    }

    #[test]
    fn mmap_zero_length_is_none() {
        // A zero-byte request is rejected on every target.
        assert!(mmap_anon_aligned_2mb(0).is_none());
    }

    #[test]
    fn mmap_round_trip_is_2mb_aligned_and_writable() {
        const TWO_MIB: usize = 2 * 1024 * 1024;
        // 3 MiB is not a 2 MiB multiple, so this also exercises the
        // length rounding on both the map and the unmap side.
        let len = 3 * 1024 * 1024;
        // `None` is legitimate off Linux, under Miri, or when mmap fails;
        // callers are expected to fall back, so there is nothing to check.
        let Some(region) = mmap_anon_aligned_2mb(len) else {
            return;
        };
        assert_eq!(region.as_ptr() as usize % TWO_MIB, 0);
        // SAFETY: `region` is a live read/write mapping of at least `len`
        // bytes, so its first and last requested bytes are accessible; it
        // is released exactly once with the length it was requested with.
        unsafe {
            region.as_ptr().write(0xA5);
            region.as_ptr().add(len - 1).write(0x5A);
            assert_eq!(region.as_ptr().read(), 0xA5);
            assert_eq!(region.as_ptr().add(len - 1).read(), 0x5A);
            munmap_2mb(region, len);
        }
    }
}
256}