Skip to main content

solti_exec/utils/
cgroups.rs

1//! # Cgroups: cgroup v2 resource limits for subprocess runners.
2//!
3//! [`CgroupLimits`] applies cgroup v2 resource constraints (CPU, memory, PIDs) to child processes spawned by subprocess runners.
4//!
5//! **Linux (cgroup v2):**
6//! - Zero heap allocation in the child (closure captures only `Copy` types)
7//! - Two-phase lifecycle: `prepare` (before fork) + `attach` (pre_exec hook)
8//! - Phase 1 creates a cgroup directory and writes limit files via `std::fs`
9//! - Phase 2 joins the child PID to the cgroup via raw libc syscalls
10//!
11//! **Other platforms:**
12//! - `tracing::warn` and no-op.
13//!
14//! ## Also
15//!
16//! - [`SubprocessBackendConfig`](crate::subprocess::SubprocessBackendConfig) builder that consumes `CgroupLimits`.
17//! - [`RlimitConfig`](super::RlimitConfig) complementary POSIX rlimits.
18//!
19//! ## Two-phase lifecycle
20//! ```text
21//!                      parent process (async context)
22//!                               │
23//!                  Phase 1: prepare_cgroup(name, limits)
24//!                               │
25//!                ├──► is_cgroup_v2? (check cgroup.controllers)
26//!                │     └──► no → warn + return Ok(false)
27//!                │
28//!                ├──► mkdir /sys/fs/cgroup/{name}
29//!                │
30//!                ├──► write limit files:
31//!                │     ├──► cpu.max    ← "50000 100000\n"
32//!                │     ├──► memory.max ← "134217728\n"
33//!                │     └──► pids.max   ← "32\n"
34//!                │
35//!                └──► return Ok(true)
36//!                               │
37//!              Phase 2: attach_cgroup(&mut cmd, name, limits)
38//!                               │
39//!                └──► install pre_exec closure
40//!                               │
41//!                             fork()
42//!                               │
43//!          ┌────────────────────┼───────────────────────────────┐
44//!          │              child process                         │
45//!          │                                                    │
46//!          │  ┌── pre_exec hook ─────────────────────────────┐  │
47//!          │  │  1. open /sys/fs/cgroup/{name}/cgroup.procs  │  │
48//!          │  │  2. getpid() → format to stack buf           │  │
49//!          │  │  3. write PID to fd                          │  │
50//!          │  │  4. close fd                                 │  │
51//!          │  └──────────────────────────────────────────────┘  │
52//!          │                                                    │
53//!          │           execve("echo", ["hello"])                │
54//!          │       (runs inside cgroup with limits)             │
55//!          └────────────────────────────────────────────────────┘
56//!                               │
57//! Cleanup: process exits → kernel auto-removes empty cgroup or explicit cleanup_cgroup(name)
58//! ```
59//!
60//! ## join_cgroup_raw: step by step
61//! ```text
62//! join_cgroup_raw(procs_path, fail_on_error)
63//!     │
64//!     ├──► libc::open(procs_path, O_WRONLY)
65//!     │     └──► fail? → log + Err if strict, Ok(()) if best-effort
66//!     │
67//!     ├──► libc::getpid() → format_pid(pid, &mut [u8; 24])
68//!     │     └──► stack-only int→ASCII, appends '\n'
69//!     │
70//!     ├──► libc::write(fd, pid_bytes)
71//!     │     └──► save errno BEFORE close (close can clobber it)
72//!     │
73//!     ├──► libc::close(fd)
74//!     │
75//!     └──► write failed? → log + Err if strict, Ok(()) if best-effort
76//! ```
77//!
78//! ## Configuration
79//!
80//! | Field           | cgroup file    | What it does                        | If it fails                     |
81//! |-----------------|----------------|-------------------------------------|---------------------------------|
82//! | `cpu`           | `cpu.max`      | quota/period CPU time window        | depends on `fail_on_error`      |
83//! | `memory`        | `memory.max`   | memory limit in bytes               | depends on `fail_on_error`      |
84//! | `pids`          | `pids.max`     | max number of processes             | depends on `fail_on_error`      |
85//! | `fail_on_error` | —              | strict mode: abort spawn on failure | —                               |
86//!
87//! ## Async-signal safety
88//!
89//! Phase 2 (`pre_exec` hook) runs **between `fork()` and `execve()`**.
90//!
91//! | What we call                 | Why it's safe                        |
92//! |------------------------------|--------------------------------------|
93//! | `libc::open()`               | async-signal-safe per POSIX          |
94//! | `libc::write()`              | async-signal-safe per POSIX          |
95//! | `libc::close()`              | async-signal-safe per POSIX          |
96//! | `libc::getpid()`             | async-signal-safe per POSIX          |
97//! | `io::Error::last_os_error()` | reads `errno`, no heap (Rust ≥ 1.74) |
98//!
99//! The closure captures **only `Copy` types** (`ProcsPath`: `[u8; 256]` + `usize`, + `bool`).
100//!
101//! ## Rules
102//! - Phase 1 (`prepare`) runs in normal async context - `std::fs` is safe
103//! - Phase 2 (`attach`) runs in `pre_exec` - only raw libc syscalls
104//! - Kernel auto-removes empty cgroups; `cleanup_cgroup` is best-effort convenience
105//! - `fail_on_error = false` (default): cgroup failures are **non-fatal** (best-effort)
106//! - `fail_on_error = true`: cgroup failures **abort spawn**
107//! - `CgroupLimits::is_empty()` → no cgroup created, zero overhead
108use tokio::process::Command;
109
110use crate::ExecError;
111
/// CPU bandwidth limit written to the cgroup v2 `cpu.max` file.
///
/// The file content has the form `<quota> <period>`, i.e. a quota of CPU time per period window.
#[derive(Clone, Copy, Debug)]
pub struct CpuMax {
    /// CPU time budget per period, in microseconds. `None` means unlimited.
    pub quota: Option<u64>,
    /// Length of the period window in microseconds (typically 100_000 = 100ms).
    pub period: u64,
}

impl Default for CpuMax {
    /// Unlimited quota over a 100ms period.
    fn default() -> Self {
        CpuMax {
            period: 100_000,
            quota: None,
        }
    }
}
130
131/// Declarative cgroup limits for a child process.
132///
133/// All fields are optional. `None` means "no limit".
134#[derive(Debug, Clone, Default)]
135pub struct CgroupLimits {
136    /// CPU limit.
137    pub cpu: Option<CpuMax>,
138    /// Memory limit in bytes.
139    pub memory: Option<u64>,
140    /// Max number of processes (pids).
141    pub pids: Option<u64>,
142    /// If `true`, cgroup setup failures abort the subprocess spawn.
143    /// If `false` (default), failures are logged and the process runs without cgroup isolation.
144    pub fail_on_error: bool,
145}
146
147impl CgroupLimits {
148    /// Returns `true` if all limits are `None`.
149    #[inline]
150    pub fn is_empty(&self) -> bool {
151        self.cpu.is_none() && self.memory.is_none() && self.pids.is_none()
152    }
153}
154
155/// Prepare cgroup v2 limits: create cgroup directory and write limit files.
156///
157/// Returns `Ok(true)` if the cgroup was created and limits applied (the command needs a `pre_exec` hook to join the cgroup).
158/// Returns `Ok(false)` if cgroups are unavailable or the platform is not Linux.
159///
160/// # Cgroup lifecycle
161/// - Kernel auto-removes empty cgroups when all processes exit
162/// - Use [`cleanup_cgroup`] to explicitly remove a cgroup (best-effort)
163pub(crate) fn prepare_cgroup(cgroup_name: &str, limits: &CgroupLimits) -> Result<bool, ExecError> {
164    if limits.is_empty() {
165        return Ok(false);
166    }
167
168    #[cfg(target_os = "linux")]
169    {
170        linux_impl::prepare(cgroup_name, limits)
171    }
172    #[cfg(not(target_os = "linux"))]
173    {
174        tracing::warn!(
175            "cgroup v2 limits requested for '{}', but OS={} does not support them; limits will be ignored",
176            cgroup_name,
177            std::env::consts::OS
178        );
179        Ok(false)
180    }
181}
182
183/// Attach cgroup v2 join hook to a `tokio::process::Command`.
184///
185/// The `pre_exec` hook only writes the child PID to `cgroup.procs` using raw libc syscalls — fully async-signal-safe.
186///
187/// Must be called after [`prepare_cgroup`] succeeds with `Ok(true)`.
188pub(crate) fn attach_cgroup(
189    cmd: &mut Command,
190    cgroup_name: &str,
191    limits: &CgroupLimits,
192) -> Result<(), ExecError> {
193    if limits.is_empty() {
194        return Ok(());
195    }
196
197    #[cfg(target_os = "linux")]
198    {
199        linux_impl::attach_join_hook(cmd, cgroup_name, limits.fail_on_error);
200    }
201    #[cfg(not(target_os = "linux"))]
202    {
203        let _ = (&cmd, cgroup_name, limits);
204    }
205    Ok(())
206}
207
208/// Best-effort cgroup cleanup: attempt to remove the cgroup directory.
209///
210/// Cgroup removal can fail for many reasons (permission denied, busy, not found, read-only cgroupfs in containers, etc.).
211/// All failures are logged and swallowed because the kernel auto-removes empty cgroups when all member processes exit.
212#[cfg(target_os = "linux")]
213pub fn cleanup_cgroup(cgroup_name: &str) {
214    use std::path::Path;
215
216    let full_path = Path::new("/sys/fs/cgroup").join(cgroup_name);
217
218    match std::fs::remove_dir(&full_path) {
219        Ok(()) => {
220            tracing::debug!("removed cgroup: {}", cgroup_name);
221        }
222        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
223            tracing::trace!("cgroup '{}' not found (already removed)", cgroup_name);
224        }
225        Err(e) => {
226            tracing::debug!(
227                "cgroup '{}' cleanup skipped: {} (errno={:?})",
228                cgroup_name,
229                e,
230                e.raw_os_error(),
231            );
232        }
233    }
234}
235
/// Non-Linux counterpart of `cleanup_cgroup`: does nothing.
#[cfg(not(target_os = "linux"))]
pub fn cleanup_cgroup(_cgroup_name: &str) {
    // Intentionally empty.
}
239
/// Assemble a cgroup name from its components.
///
/// Format: `{runner_tag}-{slot}-{seq:x}-{timestamp:x}` — `seq` and `timestamp` are rendered as
/// lowercase hexadecimal without padding. `runner_tag` and `slot` are inserted verbatim.
pub fn build_cgroup_name(runner_tag: &str, slot: &str, seq: u64, timestamp: u64) -> String {
    format!("{runner_tag}-{slot}-{seq:x}-{timestamp:x}")
}
246
247#[cfg(target_os = "linux")]
248mod linux_impl {
249    use super::{CgroupLimits, CpuMax};
250    use crate::utils::log::{pre_exec_log, pre_exec_log_errno};
251
252    use std::{
253        fs, io,
254        path::{Path, PathBuf},
255    };
256
257    use tokio::process::Command;
258
259    const CONTROLLERS_FILE: &str = "cgroup.controllers";
260    const CGROUP_ROOT: &str = "/sys/fs/cgroup";
261    const CGROUP_PROCS_SUFFIX: &str = "/cgroup.procs";
262
263    /// Phase 1: Create cgroup directory and write limit files.
264    /// Runs before spawn in normal async context — safe to use std::fs.
265    pub fn prepare(cgroup_name: &str, limits: &CgroupLimits) -> Result<bool, crate::ExecError> {
266        if !is_cgroup_v2(Path::new(CGROUP_ROOT)) {
267            tracing::warn!("cgroup v2 not detected at /sys/fs/cgroup; limits will be ignored");
268            return if limits.fail_on_error {
269                Err(crate::ExecError::InvalidRunnerConfig(
270                    "cgroup v2 not available".into(),
271                ))
272            } else {
273                Ok(false)
274            };
275        }
276
277        let cg_dir = Path::new(CGROUP_ROOT).join(cgroup_name);
278        fs::create_dir_all(&cg_dir).map_err(|e| {
279            crate::ExecError::Io(io::Error::other(format!(
280                "failed to create cgroup directory '{}': {e}",
281                cg_dir.display()
282            )))
283        })?;
284        apply_limits(&cg_dir, limits).map_err(|e| {
285            crate::ExecError::Io(io::Error::other(format!(
286                "failed to apply cgroup limits for '{}': {e}",
287                cg_dir.display()
288            )))
289        })?;
290        Ok(true)
291    }
292
293    /// Max path length for cgroup.procs:
294    /// `/sys/fs/cgroup/` (15) + cgroup_name + `/cgroup.procs` (13) + NUL (1).
295    const MAX_PROCS_PATH: usize = 256;
296
297    /// Stack-only buffer for the cgroup.procs path.
298    #[derive(Clone, Copy)]
299    struct ProcsPath {
300        buf: [u8; MAX_PROCS_PATH],
301        len: usize,
302    }
303
304    impl ProcsPath {
305        /// Build `/sys/fs/cgroup/{name}/cgroup.procs\0` into a stack buffer.
306        ///
307        /// Returns `None` if the path exceeds `MAX_PROCS_PATH`.
308        fn build(cgroup_name: &str) -> Option<Self> {
309            let total = CGROUP_ROOT.len() + 1 + cgroup_name.len() + CGROUP_PROCS_SUFFIX.len() + 1;
310            if total > MAX_PROCS_PATH {
311                return None;
312            }
313            let mut buf = [0u8; MAX_PROCS_PATH];
314            let mut pos = 0;
315            let parts: &[&[u8]] = &[
316                CGROUP_ROOT.as_bytes(),
317                b"/",
318                cgroup_name.as_bytes(),
319                CGROUP_PROCS_SUFFIX.as_bytes(),
320                b"\0",
321            ];
322            for part in parts {
323                buf[pos..pos + part.len()].copy_from_slice(part);
324                pos += part.len();
325            }
326            Some(Self { buf, len: pos })
327        }
328
329        /// Returns the null-terminated path as a byte slice.
330        fn as_bytes(&self) -> &[u8] {
331            &self.buf[..self.len]
332        }
333    }
334
335    /// Phase 2: pre_exec hook that only writes the child PID to cgroup.procs.
336    /// Uses raw libc syscalls — fully async-signal-safe.
337    pub fn attach_join_hook(cmd: &mut Command, cgroup_name: &str, fail_on_error: bool) {
338        let procs_path = match ProcsPath::build(cgroup_name) {
339            Some(p) => p,
340            None => {
341                pre_exec_log(b"solti-exec: cgroup path exceeds 256 bytes, skipping join\n");
342                return;
343            }
344        };
345
346        // SAFETY:
347        // The pre_exec closure runs between fork() and execve().
348        // It uses only libc::open, libc::write, libc::close, libc::getpid: all async-signal-safe per POSIX.
349        //
350        // The closure captures only Copy types (ProcsPath: [u8; 256] + usize, bool): zero heap allocation in the child.
351        unsafe {
352            cmd.pre_exec(move || join_cgroup_raw(procs_path.as_bytes(), fail_on_error));
353        }
354    }
355
356    fn is_cgroup_v2(root: &Path) -> bool {
357        root.join(CONTROLLERS_FILE).is_file()
358    }
359
360    fn apply_limits(dir: &Path, limits: &CgroupLimits) -> io::Result<()> {
361        if let Some(cpu) = limits.cpu {
362            write_cpu_max(dir.join("cpu.max"), cpu)?;
363        }
364        if let Some(mem) = limits.memory {
365            write_limit(dir.join("memory.max"), mem)?;
366        }
367        if let Some(pids) = limits.pids {
368            write_limit(dir.join("pids.max"), pids)?;
369        }
370        Ok(())
371    }
372
373    fn write_cpu_max(path: PathBuf, limit: CpuMax) -> io::Result<()> {
374        let content = match limit.quota {
375            None => format!("max {}\n", limit.period),
376            Some(q) => format!("{q} {}\n", limit.period),
377        };
378        fs::write(path, content)
379    }
380
381    fn write_limit(path: PathBuf, val: u64) -> io::Result<()> {
382        fs::write(path, format!("{val}\n"))
383    }
384
385    /// Write PID to cgroup.procs using raw libc syscalls only.
386    /// Fully async-signal-safe — no heap allocation, no mutexes.
387    fn join_cgroup_raw(procs_path_cstr: &[u8], fail_on_error: bool) -> io::Result<()> {
388        // SAFETY:
389        // procs_path_cstr is a null-terminated byte string built before fork().
390        // libc::open is async-signal-safe per POSIX.
391        let fd = unsafe {
392            libc::open(
393                procs_path_cstr.as_ptr() as *const libc::c_char,
394                libc::O_WRONLY,
395            )
396        };
397        if fd < 0 {
398            let e = io::Error::last_os_error();
399            pre_exec_log(b"solti-exec: failed to open cgroup.procs: ");
400            if let Some(code) = e.raw_os_error() {
401                pre_exec_log_errno(code);
402            }
403            return if fail_on_error { Err(e) } else { Ok(()) };
404        }
405
406        // Format PID into a stack buffer (no allocation).
407        // SAFETY:
408        // getpid() is async-signal-safe, always succeeds.
409        let pid = unsafe { libc::getpid() };
410        let mut buf = [0u8; 24];
411        let pid_str = super::format_pid(pid, &mut buf);
412
413        // SAFETY:
414        // fd is a valid open file descriptor. pid_str is a valid byte slice.
415        // libc::write is async-signal-safe per POSIX.
416        let written =
417            unsafe { libc::write(fd, pid_str.as_ptr() as *const libc::c_void, pid_str.len()) };
418
419        // Capture errno BEFORE close (close can clobber it).
420        let write_err = if written < 0 {
421            Some(io::Error::last_os_error())
422        } else {
423            None
424        };
425
426        // SAFETY:
427        // libc::close is async-signal-safe.
428        unsafe { libc::close(fd) };
429
430        if let Some(e) = write_err {
431            pre_exec_log(b"solti-exec: failed to write PID to cgroup.procs: ");
432            if let Some(code) = e.raw_os_error() {
433                pre_exec_log_errno(code);
434            }
435            return if fail_on_error { Err(e) } else { Ok(()) };
436        }
437
438        Ok(())
439    }
440}
441
/// Render a PID (expected to be a non-negative `i32`) as `"<pid>\n"` at the end of `buf`.
///
/// Digits are produced right-to-left, so the returned slice is the tail of `buf`.
/// No allocation and no platform dependencies — testable everywhere.
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
fn format_pid(pid: i32, buf: &mut [u8; 24]) -> &[u8] {
    let mut remaining = pid as u32;
    let mut start = buf.len() - 1;
    buf[start] = b'\n';

    // Always emit at least one digit so that 0 is rendered as "0".
    loop {
        start -= 1;
        buf[start] = b'0' + (remaining % 10) as u8;
        remaining /= 10;
        if remaining == 0 {
            break;
        }
    }
    &buf[start..]
}
462
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty limits must leave the command untouched and still report success.
    #[test]
    fn empty_limits_are_noop() {
        let no_limits = CgroupLimits::default();
        assert!(no_limits.is_empty());

        let mut shell = Command::new("sh");
        let outcome = attach_cgroup(&mut shell, "test-cgroup", &no_limits);
        assert!(outcome.is_ok());
    }

    /// Dash-free components split cleanly into four fields, the numeric ones in hex.
    #[test]
    fn build_cgroup_name_simple_case() {
        let name = build_cgroup_name("runner", "slot", 42, 1000);
        assert_eq!(name, "runner-slot-2a-3e8");

        let fields: Vec<&str> = name.split('-').collect();
        assert_eq!(fields.len(), 4);
        assert_eq!(fields[0], "runner");
        assert_eq!(fields[1], "slot");

        let decode = |hex: &str| u64::from_str_radix(hex, 16).unwrap();
        assert_eq!(decode(fields[2]), 42);
        assert_eq!(decode(fields[3]), 1000);
    }

    /// Dashes inside the tag or slot are kept as-is.
    #[test]
    fn build_cgroup_name_with_dashes() {
        let name = build_cgroup_name("prod-runner", "demo-task", 42, 1733045913);
        let expected_suffix = format!("-{}", format!("{:x}", 1733045913u64));

        assert!(name.starts_with("prod-runner-"));
        assert!(name.contains("-demo-task-"));
        assert!(name.contains("-2a-"));
        assert!(name.ends_with(&expected_suffix));
    }

    /// Hex rendering has no padding and uses lowercase digits.
    #[test]
    fn build_cgroup_name_hex_values() {
        let cases = [
            (0, 0, "r-s-0-0"),
            (255, 255, "r-s-ff-ff"),
            (4096, 65536, "r-s-1000-10000"),
        ];
        for (seq, timestamp, expected) in cases {
            assert_eq!(build_cgroup_name("r", "s", seq, timestamp), expected);
        }
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn attach_with_limits_does_not_error() {
        let limits = CgroupLimits {
            cpu: Some(CpuMax::default()),
            memory: Some(128 * 1024 * 1024),
            pids: Some(32),
            ..Default::default()
        };
        let cgroup = build_cgroup_name("test", "slot", 1, 1733045913);
        let mut cmd = Command::new("true");
        assert!(attach_cgroup(&mut cmd, &cgroup, &limits).is_ok());
    }

    #[cfg(not(target_os = "linux"))]
    #[test]
    fn non_linux_platforms_ignore_limits() {
        let limits = CgroupLimits {
            cpu: Some(CpuMax::default()),
            memory: Some(1),
            pids: Some(1),
            ..Default::default()
        };
        let mut cmd = Command::new("true");
        let outcome = attach_cgroup(&mut cmd, "test-cgroup", &limits);
        assert!(
            outcome.is_ok(),
            "non-Linux must ignore limits but still return Ok"
        );
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn cleanup_nonexistent_cgroup_does_not_panic() {
        // Best-effort cleanup: a missing cgroup must not panic.
        let missing = build_cgroup_name("test", "nonexistent", 999, 1733045913);
        cleanup_cgroup(&missing);
    }

    /// Run `format_pid` on a scratch buffer and return the result as a `String`.
    fn fmt_pid(pid: i32) -> String {
        let mut scratch = [0u8; 24];
        String::from_utf8_lossy(format_pid(pid, &mut scratch)).into_owned()
    }

    #[test]
    fn format_pid_one() {
        assert_eq!(fmt_pid(1), "1\n");
    }

    #[test]
    fn format_pid_single_digit() {
        assert_eq!(fmt_pid(9), "9\n");
    }

    #[test]
    fn format_pid_two_digits() {
        for (pid, expected) in [(10, "10\n"), (99, "99\n")] {
            assert_eq!(fmt_pid(pid), expected);
        }
    }

    #[test]
    fn format_pid_typical() {
        assert_eq!(fmt_pid(32768), "32768\n");
    }

    #[test]
    fn format_pid_large() {
        assert_eq!(fmt_pid(4_194_304), "4194304\n");
    }

    #[test]
    fn format_pid_zero() {
        assert_eq!(fmt_pid(0), "0\n");
    }
}