Skip to main content

synwire_sandbox/platform/linux/
cgroup.rs

1//! cgroup v2 resource manager.
2//!
//! Discovers the calling process's own cgroup path via `/proc/self/cgroup`
//! and creates per-agent sub-cgroups under a `synwire/agents/` subtree that
//! is a sibling of the process cgroup:
5//!
6//! ```text
7//! <process-cgroup-parent>/
8//!   <process-scope>/          ← synwire process lives here
9//!   synwire/
10//!     agents/<agent-uuid>/    ← agent processes live here
11//! ```
12//!
13//! Placing agent cgroups under the process cgroup's **parent** avoids the
14//! cgroup-v2 "no internal processes" constraint (the process itself must not
15//! be in a cgroup that also enables subtree controllers), while keeping the
16//! hierarchy as close to the running synwire process as possible.
17//!
18//! No root privileges are required — systemd already delegates the entire
19//! `user@<uid>.service/` subtree at login and the calling process's parent
20//! cgroup is writeable.
21//!
22//! # Enabling delegation
23//!
24//! On most systemd-based distributions (Fedora, Ubuntu 22.04+, Arch),
25//! delegation works out of the box. If controllers (cpu, memory, pids) are
26//! not available in your user subtree, configure systemd to delegate them:
27//!
28//! ```bash
29//! sudo mkdir -p /etc/systemd/system/user@.service.d
30//! cat <<'EOF' | sudo tee /etc/systemd/system/user@.service.d/delegate.conf
31//! [Service]
32//! Delegate=cpu cpuset io memory pids
33//! EOF
34//! sudo systemctl daemon-reload
35//! ```
36//!
37//! Log out and back in (or `sudo systemctl restart user@$(id -u).service`)
38//! for changes to take effect.
39//!
40//! **WSL2**: add `systemd=true` under `[boot]` in `/etc/wsl.conf` and
41//! restart WSL to enable systemd (required for user cgroup delegation).
42//!
43//! See the [Process Sandboxing](https://randomvariable.github.io/synwire/how-to/process-sandbox.html)
44//! guide for full setup instructions including namespace isolation.
45
46use std::path::PathBuf;
47
48use tokio::fs;
49use tracing::{debug, warn};
50use uuid::Uuid;
51
52use crate::SandboxError;
53
/// Resource usage snapshot taken from an agent's cgroup v2 interface files.
#[derive(Clone, Debug)]
pub struct CgroupStats {
    /// Total CPU time consumed so far, in nanoseconds. Derived from the
    /// `usage_usec` field of `cpu.stat` (microseconds converted to nanoseconds).
    pub cpu_usage_ns: u64,
    /// Memory usage at the time of the read, in bytes, as reported by
    /// `memory.current`.
    pub memory_current_bytes: u64,
}
62
/// Handle on the cgroup v2 directory that belongs to one agent.
///
/// Each agent instance gets its own `CgroupV2Manager`. The managed cgroup
/// lives under a `synwire/agents/` subtree created next to the calling
/// process's cgroup, and is used for resource accounting, limit enforcement,
/// and forcibly terminating the agent's processes.
#[derive(Debug)]
pub struct CgroupV2Manager {
    /// Absolute filesystem path of the agent's cgroup directory.
    base_path: PathBuf,
}
73
74impl CgroupV2Manager {
75    /// Check whether cgroup v2 is available on this system.
76    ///
77    /// Returns `true` if `/sys/fs/cgroup/cgroup.controllers` exists.
78    pub async fn is_available() -> bool {
79        fs::metadata("/sys/fs/cgroup/cgroup.controllers")
80            .await
81            .is_ok()
82    }
83
84    /// Discover the parent of the calling process's own cgroup.
85    ///
86    /// Parses the `0::` entry (unified hierarchy) from `/proc/self/cgroup` to
87    /// obtain the process's current cgroup path, then returns its parent.
88    /// Agent sub-cgroups are created there, making them siblings of the
89    /// process's cgroup and enabling resource controllers without violating
90    /// the cgroup-v2 "no internal processes" constraint.
91    ///
92    /// Falls back to the process's own cgroup if it has no parent (e.g.,
93    /// running directly under the cgroup root — rare in practice).
94    ///
95    /// # Errors
96    ///
97    /// Returns [`SandboxError::CgroupParseFailed`] if `/proc/self/cgroup`
98    /// cannot be read or the `0::` entry is absent.
99    pub async fn discover_cgroup_parent() -> Result<PathBuf, SandboxError> {
100        let contents = fs::read_to_string("/proc/self/cgroup")
101            .await
102            .map_err(|e| SandboxError::CgroupParseFailed(format!("read /proc/self/cgroup: {e}")))?;
103
104        // Find the unified hierarchy line: "0::<path>"
105        let cgroup_rel = contents
106            .lines()
107            .find_map(|line| {
108                let mut parts = line.splitn(3, ':');
109                let hier = parts.next()?;
110                let _ = parts.next(); // controllers field
111                let path = parts.next()?;
112                if hier == "0" {
113                    Some(path.trim().to_string())
114                } else {
115                    None
116                }
117            })
118            .ok_or_else(|| {
119                SandboxError::CgroupParseFailed(
120                    "no unified hierarchy (0::) entry in /proc/self/cgroup".into(),
121                )
122            })?;
123
124        // Construct the absolute path of the process's own cgroup.
125        let process_cgroup =
126            PathBuf::from("/sys/fs/cgroup").join(cgroup_rel.trim_start_matches('/'));
127
128        // Use the parent so agent sub-cgroups are siblings of the process cgroup.
129        // This satisfies cgroup-v2 NIP: no processes are directly in the parent.
130        let parent = process_cgroup
131            .parent()
132            .unwrap_or(&process_cgroup)
133            .to_path_buf();
134
135        debug!(?process_cgroup, ?parent, "discovered process cgroup");
136        Ok(parent)
137    }
138
139    /// Create a new cgroup manager for the given agent UUID.
140    ///
141    /// Discovers the process cgroup's parent, creates the agent sub-cgroup
142    /// directory, enables required controllers on the parent and on the
143    /// `synwire/` intermediate cgroup, and applies resource limits if provided.
144    ///
145    /// Falls back gracefully (returns error, caller should log and disable
146    /// cgroup tracking) if cgroup v2 is not available or the path is not
147    /// writable.
148    ///
149    /// # Errors
150    ///
151    /// Returns a [`SandboxError`] variant if cgroup setup fails.
152    pub async fn new(
153        agent_id: Uuid,
154        resources: Option<&synwire_core::agents::sandbox::ResourceLimits>,
155    ) -> Result<Self, SandboxError> {
156        let cgroup_parent = Self::discover_cgroup_parent().await?;
157        let synwire_root = cgroup_parent.join("synwire");
158        let agents_root = synwire_root.join("agents");
159        let base_path = agents_root.join(agent_id.to_string());
160
161        // Enable controllers in the parent cgroup so the `synwire/` subtree
162        // can use cpu/memory/pids controllers.
163        let parent_control = cgroup_parent.join("cgroup.subtree_control");
164        let _ = fs::write(&parent_control, "+cpu +memory +pids").await;
165
166        // Create the `synwire/` intermediate cgroup if needed.
167        if !synwire_root.exists() {
168            fs::create_dir_all(&synwire_root)
169                .await
170                .map_err(SandboxError::CgroupIo)?;
171        }
172
173        // Propagate controllers into the `synwire/agents/` subtree.
174        let synwire_control = synwire_root.join("cgroup.subtree_control");
175        let _ = fs::write(&synwire_control, "+cpu +memory +pids").await;
176
177        // Verify writability by probing with a temporary directory.
178        let test_path = agents_root.join("_write_test");
179        match fs::create_dir_all(&test_path).await {
180            Ok(()) => {
181                let _ = fs::remove_dir(&test_path).await;
182            }
183            Err(_) => {
184                return Err(SandboxError::CgroupNotWritable {
185                    path: agents_root.display().to_string(),
186                });
187            }
188        }
189
190        fs::create_dir_all(&base_path)
191            .await
192            .map_err(SandboxError::CgroupIo)?;
193
194        let mgr = Self { base_path };
195
196        if let Some(limits) = resources {
197            mgr.apply_limits(limits).await?;
198        }
199
200        Ok(mgr)
201    }
202
203    /// Absolute path to this agent's cgroup directory.
204    #[must_use]
205    pub fn base_path(&self) -> &std::path::Path {
206        &self.base_path
207    }
208
209    /// Move a process into this agent's cgroup.
210    ///
211    /// # Errors
212    ///
213    /// Returns [`SandboxError::CgroupIo`] if writing to `cgroup.procs` fails.
214    pub async fn move_pid(&self, pid: u32) -> Result<(), SandboxError> {
215        let procs_path = self.base_path.join("cgroup.procs");
216        fs::write(&procs_path, pid.to_string())
217            .await
218            .map_err(SandboxError::CgroupIo)
219    }
220
221    /// Read live CPU and memory stats for this cgroup.
222    ///
223    /// Returns `None` if either file is missing or unparseable (non-fatal).
224    pub async fn read_stats(&self) -> Option<CgroupStats> {
225        let cpu_ns = read_cpu_usage_ns(&self.base_path).await;
226        let memory_bytes = read_memory_current(&self.base_path).await;
227        match (cpu_ns, memory_bytes) {
228            (Some(cpu_usage_ns), Some(memory_current_bytes)) => Some(CgroupStats {
229                cpu_usage_ns,
230                memory_current_bytes,
231            }),
232            _ => None,
233        }
234    }
235
236    /// Forcibly kill all processes in this cgroup.
237    ///
238    /// Tries `cgroup.kill` (Linux 5.14+); falls back to reading `cgroup.procs`
239    /// and sending `SIGKILL` to each PID via nix.
240    ///
241    /// # Errors
242    ///
243    /// Returns [`SandboxError::CgroupIo`] if both kill mechanisms fail.
244    pub async fn kill_all(&self) -> Result<(), SandboxError> {
245        let kill_path = self.base_path.join("cgroup.kill");
246        if fs::write(&kill_path, "1").await.is_ok() {
247            return Ok(());
248        }
249
250        // Fallback: read cgroup.procs and SIGKILL each PID.
251        let procs_path = self.base_path.join("cgroup.procs");
252        let contents = fs::read_to_string(&procs_path)
253            .await
254            .map_err(SandboxError::CgroupIo)?;
255
256        for line in contents.lines() {
257            let Ok(pid_raw) = line.trim().parse::<i32>() else {
258                continue;
259            };
260            let pid = nix::unistd::Pid::from_raw(pid_raw);
261            let _ = nix::sys::signal::kill(pid, nix::sys::signal::Signal::SIGKILL);
262        }
263        Ok(())
264    }
265
266    /// Remove this agent's cgroup directory.
267    ///
268    /// Should only be called after all processes have exited. Logs a warning
269    /// if removal fails (e.g., lingering processes).
270    pub async fn destroy(&self) {
271        if let Err(e) = fs::remove_dir(&self.base_path).await {
272            warn!(path = %self.base_path.display(), error = %e, "failed to remove agent cgroup");
273        }
274    }
275
276    /// Apply resource limits to this cgroup.
277    async fn apply_limits(
278        &self,
279        limits: &synwire_core::agents::sandbox::ResourceLimits,
280    ) -> Result<(), SandboxError> {
281        if let Some(mem_bytes) = limits.memory_bytes {
282            fs::write(self.base_path.join("memory.max"), mem_bytes.to_string())
283                .await
284                .map_err(SandboxError::CgroupIo)?;
285        }
286
287        if let Some(cpu_quota) = limits.cpu_quota {
288            // cpu.max format: "<quota> <period>" where quota and period are in µs.
289            // A period of 100ms = 100_000 µs is conventional.
290            let period_us = 100_000u64;
291            #[allow(
292                clippy::cast_precision_loss,
293                clippy::cast_possible_truncation,
294                clippy::cast_sign_loss
295            )]
296            let quota_us = (f64::from(cpu_quota) * period_us as f64) as u64;
297            let content = format!("{quota_us} {period_us}");
298            fs::write(self.base_path.join("cpu.max"), content)
299                .await
300                .map_err(SandboxError::CgroupIo)?;
301        }
302
303        if let Some(max_pids) = limits.max_pids {
304            fs::write(self.base_path.join("pids.max"), max_pids.to_string())
305                .await
306                .map_err(SandboxError::CgroupIo)?;
307        }
308
309        Ok(())
310    }
311}
312
313impl Drop for CgroupV2Manager {
314    fn drop(&mut self) {
315        // Best-effort: kill all processes and remove the cgroup directory.
316        // Uses synchronous std::fs — Drop cannot be async.
317
318        // Try cgroup.kill (Linux 5.14+) first.
319        let kill_path = self.base_path.join("cgroup.kill");
320        if std::fs::write(&kill_path, "1").is_err() {
321            // Fallback: read cgroup.procs and SIGKILL each PID individually.
322            if let Ok(contents) = std::fs::read_to_string(self.base_path.join("cgroup.procs")) {
323                for line in contents.lines() {
324                    if let Ok(pid_raw) = line.trim().parse::<i32>() {
325                        let pid = nix::unistd::Pid::from_raw(pid_raw);
326                        let _ = nix::sys::signal::kill(pid, nix::sys::signal::Signal::SIGKILL);
327                    }
328                }
329            }
330        }
331
332        // Try to remove the now-empty cgroup directory.
333        if let Err(e) = std::fs::remove_dir(&self.base_path) {
334            warn!(path = %self.base_path.display(), error = %e, "failed to remove agent cgroup on drop");
335        }
336    }
337}
338
339// ── helpers ───────────────────────────────────────────────────────────────────
340
341/// Parse `usage_usec` from `cpu.stat` and return nanoseconds.
342async fn read_cpu_usage_ns(base: &std::path::Path) -> Option<u64> {
343    let content = fs::read_to_string(base.join("cpu.stat")).await.ok()?;
344    content.lines().find_map(|line| {
345        let mut parts = line.splitn(2, ' ');
346        if parts.next()? == "usage_usec" {
347            parts.next()?.trim().parse::<u64>().ok().map(|us| us * 1000)
348        } else {
349            None
350        }
351    })
352}
353
354/// Read `memory.current` and return bytes.
355async fn read_memory_current(base: &std::path::Path) -> Option<u64> {
356    let content = fs::read_to_string(base.join("memory.current")).await.ok()?;
357    content.trim().parse::<u64>().ok()
358}
359
360// ── tests ─────────────────────────────────────────────────────────────────────
361
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Resolve the absolute cgroup directory named by the `0::` line of a
    /// `/proc/self/cgroup`-style string, using the same `0::` line parsing
    /// as `discover_cgroup_parent`. Panics if no such line is present.
    fn parse_process_cgroup(content: &str) -> PathBuf {
        let rel = content
            .lines()
            .filter_map(|line| {
                let mut fields = line.splitn(3, ':');
                match (fields.next(), fields.next(), fields.next()) {
                    (Some("0"), _, Some(path)) => Some(path.trim()),
                    _ => None,
                }
            })
            .next()
            .unwrap();
        PathBuf::from("/sys/fs/cgroup").join(rel.trim_start_matches('/'))
    }

    /// One directory up from `cgroup`, or `cgroup` itself when it has no parent.
    fn parent_of(cgroup: &std::path::Path) -> PathBuf {
        match cgroup.parent() {
            Some(dir) => dir.to_path_buf(),
            None => cgroup.to_path_buf(),
        }
    }

    #[test]
    fn parse_cgroup_line_unified_hierarchy() {
        // A v1-style line precedes the unified entry and must be skipped.
        let content = "12:cpuset:/\n0::/user.slice/user-1000.slice/user@1000.service/app.slice\n";
        let expected =
            PathBuf::from("/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/app.slice");
        assert_eq!(parse_process_cgroup(content), expected);
    }

    #[test]
    fn cgroup_parent_is_one_level_up() {
        // Typical case: the process lives in a scope inside app.slice.
        let content = "0::/user.slice/user-1000.slice/user@1000.service/app.slice/code.scope\n";
        let parent = parent_of(&parse_process_cgroup(content));
        assert_eq!(
            parent,
            PathBuf::from("/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/app.slice")
        );

        // The agent cgroup is then placed under `synwire/agents/` in that parent.
        let agent_cgroup = parent.join("synwire").join("agents").join("test-uuid");
        assert_eq!(
            agent_cgroup,
            PathBuf::from(
                "/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/app.slice/synwire/agents/test-uuid"
            )
        );
    }

    #[test]
    fn cgroup_parent_fallback_at_root_level() {
        // Edge case: the process is at the cgroup root (`0::/`).
        // The helper joins an empty relative path onto /sys/fs/cgroup, which
        // denotes /sys/fs/cgroup itself; `Path::parent` on that yields
        // /sys/fs. This test only pins down that path arithmetic.
        let content = "0::/\n";
        let parent = parent_of(&parse_process_cgroup(content));
        // `parent` is one level above /sys/fs/cgroup.
        assert!(parent.starts_with("/sys"));
    }
}