solti_exec/utils/cgroups.rs
1//! # Cgroups: cgroup v2 resource limits for subprocess runners.
2//!
3//! [`CgroupLimits`] applies cgroup v2 resource constraints (CPU, memory, PIDs) to child processes spawned by subprocess runners.
4//!
5//! **Linux (cgroup v2):**
6//! - Zero heap allocation in the child (closure captures only `Copy` types)
7//! - Two-phase lifecycle: `prepare` (before fork) + `attach` (pre_exec hook)
8//! - Phase 1 creates a cgroup directory and writes limit files via `std::fs`
9//! - Phase 2 joins the child PID to the cgroup via raw libc syscalls
10//!
11//! **Other platforms:**
12//! - `tracing::warn` and no-op.
13//!
14//! ## Also
15//!
16//! - [`SubprocessBackendConfig`](crate::subprocess::SubprocessBackendConfig) builder that consumes `CgroupLimits`.
17//! - [`RlimitConfig`](super::RlimitConfig) complementary POSIX rlimits.
18//!
19//! ## Two-phase lifecycle
20//! ```text
21//! parent process (async context)
22//! │
23//! Phase 1: prepare_cgroup(name, limits)
24//! │
25//! ├──► is_cgroup_v2? (check cgroup.controllers)
//! │ └──► no → warn + return Ok(false) (Err instead when `fail_on_error` is set)
27//! │
28//! ├──► mkdir /sys/fs/cgroup/{name}
29//! │
30//! ├──► write limit files:
31//! │ ├──► cpu.max ← "50000 100000\n"
32//! │ ├──► memory.max ← "134217728\n"
33//! │ └──► pids.max ← "32\n"
34//! │
35//! └──► return Ok(true)
36//! │
37//! Phase 2: attach_cgroup(&mut cmd, name, limits)
38//! │
39//! └──► install pre_exec closure
40//! │
41//! fork()
42//! │
43//! ┌────────────────────┼───────────────────────────────┐
44//! │ child process │
45//! │ │
46//! │ ┌── pre_exec hook ─────────────────────────────┐ │
47//! │ │ 1. open /sys/fs/cgroup/{name}/cgroup.procs │ │
48//! │ │ 2. getpid() → format to stack buf │ │
49//! │ │ 3. write PID to fd │ │
50//! │ │ 4. close fd │ │
51//! │ └──────────────────────────────────────────────┘ │
52//! │ │
53//! │ execve("echo", ["hello"]) │
54//! │ (runs inside cgroup with limits) │
55//! └────────────────────────────────────────────────────┘
56//! │
//! Cleanup: process exits → cgroup is empty but still exists → explicit cleanup_cgroup(name) removes it (best-effort rmdir)
58//! ```
59//!
60//! ## join_cgroup_raw: step by step
61//! ```text
62//! join_cgroup_raw(procs_path, fail_on_error)
63//! │
64//! ├──► libc::open(procs_path, O_WRONLY)
65//! │ └──► fail? → log + Err if strict, Ok(()) if best-effort
66//! │
67//! ├──► libc::getpid() → format_pid(pid, &mut [u8; 24])
68//! │ └──► stack-only int→ASCII, appends '\n'
69//! │
70//! ├──► libc::write(fd, pid_bytes)
71//! │ └──► save errno BEFORE close (close can clobber it)
72//! │
73//! ├──► libc::close(fd)
74//! │
75//! └──► write failed? → log + Err if strict, Ok(()) if best-effort
76//! ```
77//!
78//! ## Configuration
79//!
80//! | Field | cgroup file | What it does | If it fails |
81//! |-----------------|----------------|-------------------------------------|---------------------------------|
82//! | `cpu` | `cpu.max` | quota/period CPU time window | depends on `fail_on_error` |
83//! | `memory` | `memory.max` | memory limit in bytes | depends on `fail_on_error` |
84//! | `pids` | `pids.max` | max number of processes | depends on `fail_on_error` |
85//! | `fail_on_error` | — | strict mode: abort spawn on failure | — |
86//!
87//! ## Async-signal safety
88//!
89//! Phase 2 (`pre_exec` hook) runs **between `fork()` and `execve()`**.
90//!
91//! | What we call | Why it's safe |
92//! |------------------------------|--------------------------------------|
93//! | `libc::open()` | async-signal-safe per POSIX |
94//! | `libc::write()` | async-signal-safe per POSIX |
95//! | `libc::close()` | async-signal-safe per POSIX |
96//! | `libc::getpid()` | async-signal-safe per POSIX |
97//! | `io::Error::last_os_error()` | reads `errno`, no heap (Rust ≥ 1.74) |
98//!
99//! The closure captures **only `Copy` types** (`ProcsPath`: `[u8; 256]` + `usize`, + `bool`).
100//!
101//! ## Rules
102//! - Phase 1 (`prepare`) runs in normal async context - `std::fs` is safe
103//! - Phase 2 (`attach`) runs in `pre_exec` - only raw libc syscalls
//! - cgroup v2 does not remove empty cgroups on its own; call `cleanup_cgroup` (best-effort `rmdir`) once the child has exited
105//! - `fail_on_error = false` (default): cgroup failures are **non-fatal** (best-effort)
106//! - `fail_on_error = true`: cgroup failures **abort spawn**
107//! - `CgroupLimits::is_empty()` → no cgroup created, zero overhead
108use tokio::process::Command;
109
110use crate::ExecError;
111
/// CPU bandwidth limit for the cgroup v2 `cpu.max` file.
///
/// The file takes `<quota> <period>`: the amount of CPU time (quota) allowed in each time window (period).
#[derive(Debug, Clone, Copy)]
pub struct CpuMax {
    /// Microseconds of CPU time allowed per period; `None` means unlimited.
    pub quota: Option<u64>,
    /// Length of one period in microseconds (usually 100_000 = 100ms).
    pub period: u64,
}

impl Default for CpuMax {
    /// Unlimited quota over a 100ms period.
    fn default() -> Self {
        const DEFAULT_PERIOD_US: u64 = 100_000;

        CpuMax {
            quota: None,
            period: DEFAULT_PERIOD_US,
        }
    }
}
130
131/// Declarative cgroup limits for a child process.
132///
133/// All fields are optional. `None` means "no limit".
134#[derive(Debug, Clone, Default)]
135pub struct CgroupLimits {
136 /// CPU limit.
137 pub cpu: Option<CpuMax>,
138 /// Memory limit in bytes.
139 pub memory: Option<u64>,
140 /// Max number of processes (pids).
141 pub pids: Option<u64>,
142 /// If `true`, cgroup setup failures abort the subprocess spawn.
143 /// If `false` (default), failures are logged and the process runs without cgroup isolation.
144 pub fail_on_error: bool,
145}
146
147impl CgroupLimits {
148 /// Returns `true` if all limits are `None`.
149 #[inline]
150 pub fn is_empty(&self) -> bool {
151 self.cpu.is_none() && self.memory.is_none() && self.pids.is_none()
152 }
153}
154
155/// Prepare cgroup v2 limits: create cgroup directory and write limit files.
156///
157/// Returns `Ok(true)` if the cgroup was created and limits applied (the command needs a `pre_exec` hook to join the cgroup).
158/// Returns `Ok(false)` if cgroups are unavailable or the platform is not Linux.
159///
160/// # Cgroup lifecycle
161/// - Kernel auto-removes empty cgroups when all processes exit
162/// - Use [`cleanup_cgroup`] to explicitly remove a cgroup (best-effort)
163pub(crate) fn prepare_cgroup(cgroup_name: &str, limits: &CgroupLimits) -> Result<bool, ExecError> {
164 if limits.is_empty() {
165 return Ok(false);
166 }
167
168 #[cfg(target_os = "linux")]
169 {
170 linux_impl::prepare(cgroup_name, limits)
171 }
172 #[cfg(not(target_os = "linux"))]
173 {
174 tracing::warn!(
175 "cgroup v2 limits requested for '{}', but OS={} does not support them; limits will be ignored",
176 cgroup_name,
177 std::env::consts::OS
178 );
179 Ok(false)
180 }
181}
182
183/// Attach cgroup v2 join hook to a `tokio::process::Command`.
184///
185/// The `pre_exec` hook only writes the child PID to `cgroup.procs` using raw libc syscalls — fully async-signal-safe.
186///
187/// Must be called after [`prepare_cgroup`] succeeds with `Ok(true)`.
188pub(crate) fn attach_cgroup(
189 cmd: &mut Command,
190 cgroup_name: &str,
191 limits: &CgroupLimits,
192) -> Result<(), ExecError> {
193 if limits.is_empty() {
194 return Ok(());
195 }
196
197 #[cfg(target_os = "linux")]
198 {
199 linux_impl::attach_join_hook(cmd, cgroup_name, limits.fail_on_error);
200 }
201 #[cfg(not(target_os = "linux"))]
202 {
203 let _ = (&cmd, cgroup_name, limits);
204 }
205 Ok(())
206}
207
208/// Best-effort cgroup cleanup: attempt to remove the cgroup directory.
209///
210/// Cgroup removal can fail for many reasons (permission denied, busy, not found, read-only cgroupfs in containers, etc.).
211/// All failures are logged and swallowed because the kernel auto-removes empty cgroups when all member processes exit.
212#[cfg(target_os = "linux")]
213pub fn cleanup_cgroup(cgroup_name: &str) {
214 use std::path::Path;
215
216 let full_path = Path::new("/sys/fs/cgroup").join(cgroup_name);
217
218 match std::fs::remove_dir(&full_path) {
219 Ok(()) => {
220 tracing::debug!("removed cgroup: {}", cgroup_name);
221 }
222 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
223 tracing::trace!("cgroup '{}' not found (already removed)", cgroup_name);
224 }
225 Err(e) => {
226 tracing::debug!(
227 "cgroup '{}' cleanup skipped: {} (errno={:?})",
228 cgroup_name,
229 e,
230 e.raw_os_error(),
231 );
232 }
233 }
234}
235
236/// No-op on non-Linux platforms.
237#[cfg(not(target_os = "linux"))]
238pub fn cleanup_cgroup(_cgroup_name: &str) {}
239
/// Assemble a cgroup name from its identifying components.
///
/// Format: `{runner_tag}-{slot}-{seq:x}-{timestamp:x}`; `seq` and `timestamp` are rendered as
/// lowercase hexadecimal without padding. `runner_tag` and `slot` are inserted verbatim.
pub fn build_cgroup_name(runner_tag: &str, slot: &str, seq: u64, timestamp: u64) -> String {
    format!("{runner_tag}-{slot}-{seq:x}-{timestamp:x}")
}
246
247#[cfg(target_os = "linux")]
248mod linux_impl {
249 use super::{CgroupLimits, CpuMax};
250 use crate::utils::log::{pre_exec_log, pre_exec_log_errno};
251
252 use std::{
253 fs, io,
254 path::{Path, PathBuf},
255 };
256
257 use tokio::process::Command;
258
259 const CONTROLLERS_FILE: &str = "cgroup.controllers";
260 const CGROUP_ROOT: &str = "/sys/fs/cgroup";
261 const CGROUP_PROCS_SUFFIX: &str = "/cgroup.procs";
262
263 /// Phase 1: Create cgroup directory and write limit files.
264 /// Runs before spawn in normal async context — safe to use std::fs.
265 pub fn prepare(cgroup_name: &str, limits: &CgroupLimits) -> Result<bool, crate::ExecError> {
266 if !is_cgroup_v2(Path::new(CGROUP_ROOT)) {
267 tracing::warn!("cgroup v2 not detected at /sys/fs/cgroup; limits will be ignored");
268 return if limits.fail_on_error {
269 Err(crate::ExecError::InvalidRunnerConfig(
270 "cgroup v2 not available".into(),
271 ))
272 } else {
273 Ok(false)
274 };
275 }
276
277 let cg_dir = Path::new(CGROUP_ROOT).join(cgroup_name);
278 fs::create_dir_all(&cg_dir).map_err(|e| {
279 crate::ExecError::Io(io::Error::other(format!(
280 "failed to create cgroup directory '{}': {e}",
281 cg_dir.display()
282 )))
283 })?;
284 apply_limits(&cg_dir, limits).map_err(|e| {
285 crate::ExecError::Io(io::Error::other(format!(
286 "failed to apply cgroup limits for '{}': {e}",
287 cg_dir.display()
288 )))
289 })?;
290 Ok(true)
291 }
292
293 /// Max path length for cgroup.procs:
294 /// `/sys/fs/cgroup/` (15) + cgroup_name + `/cgroup.procs` (13) + NUL (1).
295 const MAX_PROCS_PATH: usize = 256;
296
297 /// Stack-only buffer for the cgroup.procs path.
298 #[derive(Clone, Copy)]
299 struct ProcsPath {
300 buf: [u8; MAX_PROCS_PATH],
301 len: usize,
302 }
303
304 impl ProcsPath {
305 /// Build `/sys/fs/cgroup/{name}/cgroup.procs\0` into a stack buffer.
306 ///
307 /// Returns `None` if the path exceeds `MAX_PROCS_PATH`.
308 fn build(cgroup_name: &str) -> Option<Self> {
309 let total = CGROUP_ROOT.len() + 1 + cgroup_name.len() + CGROUP_PROCS_SUFFIX.len() + 1;
310 if total > MAX_PROCS_PATH {
311 return None;
312 }
313 let mut buf = [0u8; MAX_PROCS_PATH];
314 let mut pos = 0;
315 let parts: &[&[u8]] = &[
316 CGROUP_ROOT.as_bytes(),
317 b"/",
318 cgroup_name.as_bytes(),
319 CGROUP_PROCS_SUFFIX.as_bytes(),
320 b"\0",
321 ];
322 for part in parts {
323 buf[pos..pos + part.len()].copy_from_slice(part);
324 pos += part.len();
325 }
326 Some(Self { buf, len: pos })
327 }
328
329 /// Returns the null-terminated path as a byte slice.
330 fn as_bytes(&self) -> &[u8] {
331 &self.buf[..self.len]
332 }
333 }
334
335 /// Phase 2: pre_exec hook that only writes the child PID to cgroup.procs.
336 /// Uses raw libc syscalls — fully async-signal-safe.
337 pub fn attach_join_hook(cmd: &mut Command, cgroup_name: &str, fail_on_error: bool) {
338 let procs_path = match ProcsPath::build(cgroup_name) {
339 Some(p) => p,
340 None => {
341 pre_exec_log(b"solti-exec: cgroup path exceeds 256 bytes, skipping join\n");
342 return;
343 }
344 };
345
346 // SAFETY:
347 // The pre_exec closure runs between fork() and execve().
348 // It uses only libc::open, libc::write, libc::close, libc::getpid: all async-signal-safe per POSIX.
349 //
350 // The closure captures only Copy types (ProcsPath: [u8; 256] + usize, bool): zero heap allocation in the child.
351 unsafe {
352 cmd.pre_exec(move || join_cgroup_raw(procs_path.as_bytes(), fail_on_error));
353 }
354 }
355
356 fn is_cgroup_v2(root: &Path) -> bool {
357 root.join(CONTROLLERS_FILE).is_file()
358 }
359
360 fn apply_limits(dir: &Path, limits: &CgroupLimits) -> io::Result<()> {
361 if let Some(cpu) = limits.cpu {
362 write_cpu_max(dir.join("cpu.max"), cpu)?;
363 }
364 if let Some(mem) = limits.memory {
365 write_limit(dir.join("memory.max"), mem)?;
366 }
367 if let Some(pids) = limits.pids {
368 write_limit(dir.join("pids.max"), pids)?;
369 }
370 Ok(())
371 }
372
373 fn write_cpu_max(path: PathBuf, limit: CpuMax) -> io::Result<()> {
374 let content = match limit.quota {
375 None => format!("max {}\n", limit.period),
376 Some(q) => format!("{q} {}\n", limit.period),
377 };
378 fs::write(path, content)
379 }
380
381 fn write_limit(path: PathBuf, val: u64) -> io::Result<()> {
382 fs::write(path, format!("{val}\n"))
383 }
384
385 /// Write PID to cgroup.procs using raw libc syscalls only.
386 /// Fully async-signal-safe — no heap allocation, no mutexes.
387 fn join_cgroup_raw(procs_path_cstr: &[u8], fail_on_error: bool) -> io::Result<()> {
388 // SAFETY:
389 // procs_path_cstr is a null-terminated byte string built before fork().
390 // libc::open is async-signal-safe per POSIX.
391 let fd = unsafe {
392 libc::open(
393 procs_path_cstr.as_ptr() as *const libc::c_char,
394 libc::O_WRONLY,
395 )
396 };
397 if fd < 0 {
398 let e = io::Error::last_os_error();
399 pre_exec_log(b"solti-exec: failed to open cgroup.procs: ");
400 if let Some(code) = e.raw_os_error() {
401 pre_exec_log_errno(code);
402 }
403 return if fail_on_error { Err(e) } else { Ok(()) };
404 }
405
406 // Format PID into a stack buffer (no allocation).
407 // SAFETY:
408 // getpid() is async-signal-safe, always succeeds.
409 let pid = unsafe { libc::getpid() };
410 let mut buf = [0u8; 24];
411 let pid_str = super::format_pid(pid, &mut buf);
412
413 // SAFETY:
414 // fd is a valid open file descriptor. pid_str is a valid byte slice.
415 // libc::write is async-signal-safe per POSIX.
416 let written =
417 unsafe { libc::write(fd, pid_str.as_ptr() as *const libc::c_void, pid_str.len()) };
418
419 // Capture errno BEFORE close (close can clobber it).
420 let write_err = if written < 0 {
421 Some(io::Error::last_os_error())
422 } else {
423 None
424 };
425
426 // SAFETY:
427 // libc::close is async-signal-safe.
428 unsafe { libc::close(fd) };
429
430 if let Some(e) = write_err {
431 pre_exec_log(b"solti-exec: failed to write PID to cgroup.procs: ");
432 if let Some(code) = e.raw_os_error() {
433 pre_exec_log_errno(code);
434 }
435 return if fail_on_error { Err(e) } else { Ok(()) };
436 }
437
438 Ok(())
439 }
440}
441
/// Render a PID (positive `i32`) as `"<pid>\n"` at the tail of a caller-provided stack buffer.
///
/// Returns the sub-slice of `buf` holding the decimal digits followed by the newline.
/// Pure arithmetic: no allocation and no platform dependencies, so it is testable everywhere.
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
fn format_pid(pid: i32, buf: &mut [u8; 24]) -> &[u8] {
    let mut remaining = pid as u32;

    // Fill the buffer from the back: newline first, then digits from least significant upwards.
    let mut start = buf.len() - 1;
    buf[start] = b'\n';

    // Runs at least once, so zero is emitted as a single '0' digit.
    loop {
        start -= 1;
        buf[start] = b'0' + (remaining % 10) as u8;
        remaining /= 10;
        if remaining == 0 {
            break;
        }
    }

    &buf[start..]
}
462
#[cfg(test)]
mod tests {
    use super::*;

    /// Limits with every resource set and `fail_on_error` left at its default.
    fn all_limits(memory: u64, pids: u64) -> CgroupLimits {
        CgroupLimits {
            cpu: Some(CpuMax::default()),
            memory: Some(memory),
            pids: Some(pids),
            ..Default::default()
        }
    }

    /// Run `format_pid` on a fresh buffer and return the result as an owned string.
    fn fmt_pid(pid: i32) -> String {
        let mut scratch = [0u8; 24];
        String::from_utf8_lossy(format_pid(pid, &mut scratch)).into_owned()
    }

    // Default limits are empty and attaching them succeeds.
    #[test]
    fn empty_limits_are_noop() {
        let no_limits = CgroupLimits::default();
        assert!(no_limits.is_empty());

        let mut command = Command::new("sh");
        assert!(attach_cgroup(&mut command, "test-cgroup", &no_limits).is_ok());
    }

    // The four components are joined with '-' and the numbers are hex-encoded.
    #[test]
    fn build_cgroup_name_simple_case() {
        let name = build_cgroup_name("runner", "slot", 42, 1000);
        assert_eq!(name, "runner-slot-2a-3e8");

        let fields: Vec<&str> = name.split('-').collect();
        assert_eq!(fields.len(), 4);
        assert_eq!(fields[0], "runner");
        assert_eq!(fields[1], "slot");

        let decode = |hex: &str| u64::from_str_radix(hex, 16).unwrap();
        assert_eq!(decode(fields[2]), 42);
        assert_eq!(decode(fields[3]), 1000);
    }

    // Dashes inside the tag and slot are kept as-is.
    #[test]
    fn build_cgroup_name_with_dashes() {
        let name = build_cgroup_name("prod-runner", "demo-task", 42, 1733045913);
        let expected_suffix = format!("-{}", format!("{:x}", 1733045913u64));

        assert!(name.starts_with("prod-runner-"));
        assert!(name.contains("-demo-task-"));
        assert!(name.contains("-2a-"));
        assert!(name.ends_with(&expected_suffix));
    }

    // Hex rendering is lowercase and unpadded.
    #[test]
    fn build_cgroup_name_hex_values() {
        let cases: [(u64, u64, &str); 3] = [
            (0, 0, "r-s-0-0"),
            (255, 255, "r-s-ff-ff"),
            (4096, 65536, "r-s-1000-10000"),
        ];
        for (seq, timestamp, expected) in cases {
            assert_eq!(build_cgroup_name("r", "s", seq, timestamp), expected);
        }
    }

    // Installing the join hook does not require the cgroup to exist.
    #[cfg(target_os = "linux")]
    #[test]
    fn attach_with_limits_does_not_error() {
        let limits = all_limits(128 * 1024 * 1024, 32);
        let name = build_cgroup_name("test", "slot", 1, 1733045913);

        let mut command = Command::new("true");
        assert!(attach_cgroup(&mut command, &name, &limits).is_ok());
    }

    #[cfg(not(target_os = "linux"))]
    #[test]
    fn non_linux_platforms_ignore_limits() {
        let limits = all_limits(1, 1);

        let mut command = Command::new("true");
        let outcome = attach_cgroup(&mut command, "test-cgroup", &limits);
        assert!(
            outcome.is_ok(),
            "non-Linux must ignore limits but still return Ok"
        );
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn cleanup_nonexistent_cgroup_does_not_panic() {
        // Best-effort cleanup of a name that was never created must simply return.
        let name = build_cgroup_name("test", "nonexistent", 999, 1733045913);
        cleanup_cgroup(&name);
    }

    #[test]
    fn format_pid_one() {
        assert_eq!(fmt_pid(1), "1\n");
    }

    #[test]
    fn format_pid_single_digit() {
        assert_eq!(fmt_pid(9), "9\n");
    }

    #[test]
    fn format_pid_two_digits() {
        for (pid, expected) in [(10, "10\n"), (99, "99\n")] {
            assert_eq!(fmt_pid(pid), expected);
        }
    }

    #[test]
    fn format_pid_typical() {
        assert_eq!(fmt_pid(32768), "32768\n");
    }

    #[test]
    fn format_pid_large() {
        assert_eq!(fmt_pid(4_194_304), "4194304\n");
    }

    #[test]
    fn format_pid_zero() {
        assert_eq!(fmt_pid(0), "0\n");
    }
}
586}