1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
//! NUMA-aware thread pinning for multi-socket servers.
//!
//! On multi-socket servers, memory access latency can roughly double when a
//! thread touches memory attached to a different NUMA node. This module
//! detects the NUMA topology at runtime and pins each worker thread to the
//! CPUs of a single node, keeping memory accesses local.
//!
//! # Linux
//! Uses `sched_setaffinity(2)` via `libc` FFI to bind the calling thread
//! to CPUs within a single NUMA node. Topology is read from
//! `/sys/devices/system/node/`.
//!
//! # Non-Linux
//! Compiles to no-ops — NUMA pinning is a Linux-specific optimisation.
use std::sync::atomic::{AtomicBool, Ordering};
/// Whether NUMA topology was successfully detected.
///
/// Starts out `false`. The Linux `init_numa` stores `true` here once
/// `detect_topology` has returned a topology (which it only does for two or
/// more nodes). Nothing in the code visible here ever stores `false` again.
static NUMA_DETECTED: AtomicBool = AtomicBool::new(false);
/// Whether NUMA pinning is enabled and available.
///
/// Returns the current value of `NUMA_DETECTED`, read with relaxed ordering.
/// The result is `false` until `init_numa` has run and found a multi-node
/// topology; on non-Linux targets `init_numa` is a no-op, so it stays `false`.
#[inline]
pub fn is_numa_available() -> bool {
NUMA_DETECTED.load(Ordering::Relaxed)
}
// ─── Linux implementation ───────────────────────────────────────────────────
#[cfg(target_os = "linux")]
mod inner {
use super::*;
use std::fs;
use std::path::Path;
/// A NUMA node with its associated CPU set.
///
/// Instances are built by `detect_topology` from the entries of
/// `/sys/devices/system/node/`; nodes whose CPU list is empty are never
/// constructed.
#[derive(Debug, Clone)]
pub struct NumaNode {
/// Node id (0, 1, …), parsed from the numeric suffix of the sysfs
/// `nodeN` directory name.
pub id: u32,
/// CPU ids belonging to this node, in the order they appear in the node's
/// `cpulist` file (ranges expanded).
pub cpus: Vec<u32>,
}
/// Cached NUMA topology.
///
/// Unset until `init_numa` is first called; after that it holds either the
/// detected nodes (sorted by id) or an empty `Vec` when detection returned
/// `None`. Being a `OnceLock`, it is written at most once per process.
static TOPOLOGY: std::sync::OnceLock<Vec<NumaNode>> = std::sync::OnceLock::new();
/// Expand a Linux CPU list (e.g. `"0-3,8-11"`) into the individual CPU ids.
///
/// The input is split on commas. Each non-empty, trimmed token is either a
/// single id (`"5"`) or an inclusive range (`"0-3"`, whitespace around the
/// bounds is tolerated). Tokens that do not parse are ignored, and a range
/// whose start exceeds its end contributes nothing. Ids are returned in input
/// order, without de-duplication.
fn parse_cpu_list(s: &str) -> Vec<u32> {
    let mut ids = Vec::new();
    let tokens = s.split(',').map(str::trim).filter(|tok| !tok.is_empty());
    for tok in tokens {
        match tok.split_once('-') {
            // Range form: both bounds must parse, otherwise the token is dropped.
            Some((lo, hi)) => {
                let first = lo.trim().parse::<u32>().ok();
                let last = hi.trim().parse::<u32>().ok();
                if let Some((first, last)) = first.zip(last) {
                    ids.extend(first..=last);
                }
            }
            // Single-id form: `Option` yields zero or one element.
            None => ids.extend(tok.parse::<u32>().ok()),
        }
    }
    ids
}
/// Detect NUMA topology from sysfs.
///
/// Scans `/sys/devices/system/node/` for `node<N>` directories and reads each
/// one's `cpulist` file. Entries that cannot be used are skipped individually
/// rather than aborting the whole detection: names without a numeric suffix
/// after `node`, an unreadable `cpulist`, or a node with no CPUs.
///
/// Returns the usable nodes sorted by id, or `None` when the directory cannot
/// be read or fewer than two usable nodes were found (NUMA pinning would be
/// pointless on a single node).
fn detect_topology() -> Option<Vec<NumaNode>> {
    // A missing or unreadable directory simply means "no NUMA information".
    let entries = fs::read_dir(Path::new("/sys/devices/system/node")).ok()?;

    let mut nodes: Vec<NumaNode> = entries
        .flatten()
        .filter_map(|entry| {
            let name = entry.file_name();
            // Every `?` below is scoped to this closure, so it skips only the
            // current entry.
            let id = name.to_str()?.strip_prefix("node")?.parse::<u32>().ok()?;
            let cpulist = fs::read_to_string(entry.path().join("cpulist")).ok()?;
            let cpus = parse_cpu_list(&cpulist);
            // Nodes without CPUs cannot host a pinned thread.
            (!cpus.is_empty()).then_some(NumaNode { id, cpus })
        })
        .collect();

    // Sort by node id for deterministic assignment; `read_dir` order is unspecified.
    nodes.sort_by_key(|node| node.id);

    // Only useful on multi-node systems.
    (nodes.len() >= 2).then_some(nodes)
}
/// Run NUMA detection and cache the outcome.
///
/// The first call populates `TOPOLOGY` via `detect_topology`; later calls
/// find the cell already filled and do nothing. When a topology is found it
/// is logged and `NUMA_DETECTED` is set; otherwise an empty topology is
/// cached and pinning stays disabled.
pub fn init_numa() {
    TOPOLOGY.get_or_init(|| {
        let Some(nodes) = detect_topology() else {
            log::debug!("Single NUMA node or sysfs unavailable — pinning disabled");
            return Vec::new();
        };

        log::info!(
            "NUMA topology detected: {} nodes, {} total CPUs",
            nodes.len(),
            nodes.iter().map(|n| n.cpus.len()).sum::<usize>()
        );
        nodes
            .iter()
            .for_each(|n| log::debug!(" node {}: cpus {:?}", n.id, n.cpus));

        NUMA_DETECTED.store(true, Ordering::Relaxed);
        nodes
    });
}
/// Borrow the cached topology.
///
/// Yields an empty slice when `init_numa` has not run yet or when detection
/// found no multi-node topology.
pub fn topology() -> &'static [NumaNode] {
    match TOPOLOGY.get() {
        Some(nodes) => nodes.as_slice(),
        None => &[],
    }
}
/// Returns the number of NUMA nodes detected.
///
/// This is the length of `topology()`, so it is 0 when `init_numa` has not
/// been called or when no multi-node topology was found.
pub fn node_count() -> usize {
topology().len()
}
/// Pin the **calling** thread to the CPUs of one detected NUMA node.
///
/// `node_id` is a position in the slice returned by `topology()` (nodes
/// sorted by id), not necessarily the kernel's node number.
///
/// # Errors
/// Propagates the OS error from `pin_thread_to_cpus`. An out-of-range
/// `node_id`, or an empty topology, is not an error: the call does nothing
/// and returns `Ok(())`.
pub fn pin_thread_to_node(node_id: usize) -> Result<(), std::io::Error> {
    match topology().get(node_id) {
        Some(node) => pin_thread_to_cpus(&node.cpus),
        None => Ok(()),
    }
}
/// Pin the **calling** thread to a specific set of CPU ids.
///
/// Builds a `cpu_set_t` containing exactly `cpus` and applies it with
/// `sched_setaffinity(2)` using pid 0 (the calling thread). An empty `cpus`
/// slice is a no-op that returns `Ok(())`.
///
/// # Errors
/// * `InvalidInput` if any id is `>= libc::CPU_SETSIZE`. Such an id does not
///   fit in the fixed-size `cpu_set_t`, and handing it to `libc::CPU_SET`
///   would index past the end of the mask.
/// * The OS error reported by `sched_setaffinity` if the call fails.
pub fn pin_thread_to_cpus(cpus: &[u32]) -> Result<(), std::io::Error> {
    if cpus.is_empty() {
        return Ok(());
    }

    // Validate up front so the affinity mask is never built from bad input.
    let capacity = libc::CPU_SETSIZE as usize;
    if let Some(&bad) = cpus.iter().find(|&&cpu| cpu as usize >= capacity) {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!("CPU id {bad} does not fit in cpu_set_t (capacity {capacity})"),
        ));
    }

    // SAFETY: `cpu_set_t` is a plain bit mask; the all-zero pattern is a
    // valid value representing the empty set.
    let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
    for &cpu in cpus {
        // SAFETY: `cpu < CPU_SETSIZE` was checked above, so the bit lies
        // inside the mask.
        unsafe { libc::CPU_SET(cpu as usize, &mut cpuset) };
    }

    // SAFETY: `cpuset` is a live, initialised `cpu_set_t` and the size passed
    // is exactly its size. Pid 0 targets the calling thread.
    let ret = unsafe {
        libc::sched_setaffinity(0, std::mem::size_of::<libc::cpu_set_t>(), &cpuset)
    };
    if ret == 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}
/// Pin the calling thread to a NUMA node chosen round-robin from
/// `worker_index`.
///
/// The node is `worker_index % number_of_nodes`. With an empty topology the
/// call returns immediately. A pinning failure is logged as a warning and
/// never propagated, so this is safe to call from `on_thread_start`
/// callbacks.
pub fn pin_worker(worker_index: usize) {
    let node_total = topology().len();
    if node_total == 0 {
        return;
    }

    let node_id = worker_index % node_total;
    match pin_thread_to_node(node_id) {
        Ok(()) => {
            log::trace!("Worker {} pinned to NUMA node {}", worker_index, node_id);
        }
        Err(e) => {
            log::warn!(
                "Failed to pin worker {} to NUMA node {}: {}",
                worker_index,
                node_id,
                e
            );
        }
    }
}
// Unit tests for the Linux implementation.
//
// The `parse_cpu_list` tests are pure. The remaining tests depend on the
// host's real sysfs contents and therefore only check that the calls complete
// without panicking, not any particular topology.
#[cfg(test)]
mod tests {
use super::*;
// A single inclusive range expands to every id it covers.
#[test]
fn test_parse_cpu_list_range() {
assert_eq!(parse_cpu_list("0-3"), vec![0, 1, 2, 3]);
}
// Ranges and single ids can be mixed; output keeps input order.
#[test]
fn test_parse_cpu_list_mixed() {
assert_eq!(parse_cpu_list("0-2,5,8-9"), vec![0, 1, 2, 5, 8, 9]);
}
// A lone id with no separators.
#[test]
fn test_parse_cpu_list_single() {
assert_eq!(parse_cpu_list("7"), vec![7]);
}
// Empty input yields no CPUs.
#[test]
fn test_parse_cpu_list_empty() {
assert!(parse_cpu_list("").is_empty());
}
// Whitespace around tokens and around range bounds is ignored.
#[test]
fn test_parse_cpu_list_whitespace() {
assert_eq!(parse_cpu_list(" 1 - 3 , 5 "), vec![1, 2, 3, 5]);
}
// Calling `init_numa` repeatedly must not panic.
#[test]
fn test_init_numa_idempotent() {
init_numa();
init_numa(); // second call should be no-op
}
// `topology()` is callable after initialisation regardless of host shape.
#[test]
fn test_topology_returns_slice() {
init_numa();
let topo = topology();
// On single-socket dev machines this is empty; on multi-socket it's >1.
// Either way it shouldn't panic.
let _ = topo.len();
}
// NOTE: on a multi-node host this really changes the affinity of the thread
// running the test, because `pin_worker` pins the calling thread.
#[test]
fn test_pin_worker_no_panic() {
init_numa();
// Should not panic even on single-node systems.
pin_worker(0);
pin_worker(999);
}
// An index past the end of the topology is ignored rather than reported.
#[test]
fn test_pin_thread_to_node_invalid() {
init_numa();
// Invalid node id should silently succeed.
assert!(pin_thread_to_node(9999).is_ok());
}
// An empty CPU slice returns early without calling into the OS.
#[test]
fn test_pin_thread_to_cpus_empty() {
assert!(pin_thread_to_cpus(&[]).is_ok());
}
// `node_count()` is callable after initialisation regardless of host shape.
#[test]
fn test_node_count() {
init_numa();
// Just check it doesn't panic.
let _ = node_count();
}
}
}
// ─── Non-Linux stub ─────────────────────────────────────────────────────────
#[cfg(not(target_os = "linux"))]
mod inner {
    /// A NUMA node with its associated CPU set.
    ///
    /// Mirrors the Linux definition so that code naming this type compiles on
    /// every platform. It is never constructed by this stub.
    #[derive(Debug, Clone)]
    pub struct NumaNode {
        /// Node id (0, 1, …).
        pub id: u32,
        /// CPU ids belonging to this node.
        pub cpus: Vec<u32>,
    }

    /// Initialise NUMA detection (no-op on non-Linux).
    #[inline]
    pub fn init_numa() {}

    /// Get the detected topology; always empty on non-Linux platforms.
    ///
    /// Provided so callers of the Linux `topology()` build unchanged here.
    #[inline]
    pub fn topology() -> &'static [NumaNode] {
        &[]
    }

    /// Returns 0 on non-Linux platforms.
    #[inline]
    pub fn node_count() -> usize {
        0
    }

    /// No-op on non-Linux.
    #[inline]
    pub fn pin_worker(_worker_index: usize) {}

    /// No-op on non-Linux; always returns `Ok(())`.
    #[inline]
    pub fn pin_thread_to_node(_node_id: usize) -> Result<(), std::io::Error> {
        Ok(())
    }

    /// No-op on non-Linux; always returns `Ok(())`.
    #[inline]
    pub fn pin_thread_to_cpus(_cpus: &[u32]) -> Result<(), std::io::Error> {
        Ok(())
    }
}
// Re-export the platform-specific implementation.
pub use inner::*;