Skip to main content

hpc_node/
cgroup.rs

1//! cgroup v2 conventions and management trait.
2//!
3//! Defines the shared cgroup hierarchy layout that both pact and lattice use,
4//! regardless of which system creates it.
5
6use serde::{Deserialize, Serialize};
7
8/// cgroup slice ownership — who has exclusive write access.
9///
10/// Invariant RI1: each slice subtree is owned by exactly one system.
11/// No system writes to another's slice except during declared emergency (RI3).
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
13pub enum SliceOwner {
14    /// pact-agent owns this slice (system services).
15    Pact,
16    /// lattice-node-agent owns this slice (workloads).
17    Workload,
18}
19
/// Canonical cgroup slice paths.
///
/// pact and lattice both refer to these constants, so the hierarchy looks the
/// same no matter which of the two systems ends up creating the slices. The
/// values are written without a `/sys/fs/cgroup` mount prefix.
pub mod slices {
    // --- top-level slices ---

    /// Top-level slice for pact-managed system services.
    pub const PACT_ROOT: &str = "pact.slice";
    /// Top-level slice for lattice-managed workload allocations.
    pub const WORKLOAD_ROOT: &str = "workload.slice";

    // --- sub-slices of `pact.slice` ---

    /// Infrastructure services: chronyd, dbus-daemon, rasdaemon.
    pub const PACT_INFRA: &str = "pact.slice/infra.slice";
    /// Network services: `cxi_rh` instances.
    pub const PACT_NETWORK: &str = "pact.slice/network.slice";
    /// GPU services: nvidia-persistenced, nv-hostengine.
    pub const PACT_GPU: &str = "pact.slice/gpu.slice";
    /// Audit services: auditd, audit-forwarder (regulated vClusters only).
    pub const PACT_AUDIT: &str = "pact.slice/audit.slice";
}
38
39/// Returns the owner of a given cgroup path.
40///
41/// Returns `None` for paths outside the known hierarchy (e.g., root cgroup).
42#[must_use]
43pub fn slice_owner(path: &str) -> Option<SliceOwner> {
44    if path.starts_with(slices::PACT_ROOT) {
45        Some(SliceOwner::Pact)
46    } else if path.starts_with(slices::WORKLOAD_ROOT) {
47        Some(SliceOwner::Workload)
48    } else {
49        None
50    }
51}
52
53/// Resource limits for a cgroup scope.
54///
55/// Applied when creating a scope for a service or allocation.
56#[derive(Debug, Clone, Default, Serialize, Deserialize)]
57pub struct ResourceLimits {
58    /// Memory limit in bytes (maps to `memory.max`). `None` = unlimited.
59    pub memory_max: Option<u64>,
60    /// CPU weight (1–10000, maps to `cpu.weight`). `None` = default (100).
61    pub cpu_weight: Option<u16>,
62    /// IO max in bytes/sec. `None` = unlimited.
63    pub io_max: Option<u64>,
64}
65
/// Opaque handle to a created cgroup scope.
///
/// Returned by [`CgroupManager::create_scope`] and passed to process spawn
/// for placement. Implementers store whatever is needed to reference the scope
/// (typically the cgroup path).
///
/// Handles compare equal when their paths are equal, and they are hashable,
/// so they can be used as keys in a `HashMap`/`HashSet`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CgroupHandle {
    /// Full cgroup path (e.g., `/sys/fs/cgroup/pact.slice/gpu.slice/nvidia-persistenced`).
    pub path: String,
}
76
77/// Metrics read from a cgroup.
78#[derive(Debug, Clone, Default, Serialize, Deserialize)]
79pub struct CgroupMetrics {
80    /// Current memory usage in bytes (`memory.current`).
81    pub memory_current: u64,
82    /// Memory limit in bytes (`memory.max`). `None` if unlimited.
83    pub memory_max: Option<u64>,
84    /// Total CPU usage in microseconds (`cpu.stat` → `usage_usec`).
85    pub cpu_usage_usec: u64,
86    /// Number of processes in the cgroup (`cgroup.procs` line count).
87    pub nr_processes: u32,
88}
89
90/// Trait for cgroup hierarchy management.
91///
92/// Both pact (direct cgroup v2 filesystem) and lattice (standalone mode)
93/// implement this. The trait defines the contract; ownership enforcement
94/// (RI1) and emergency override (RI3) are the implementer's responsibility.
95///
96/// # Invariants enforced
97///
98/// - RI2: every supervised process has a scope (caller must use `create_scope` before spawn)
99/// - RI5: callback on failure (caller must call `destroy_scope` on spawn failure)
100/// - RI6: shared read (any path readable via `read_metrics`)
101pub trait CgroupManager: Send + Sync {
102    /// Create the top-level slice hierarchy.
103    ///
104    /// Called once at boot. Idempotent — safe to call if hierarchy already exists.
105    /// Creates `pact.slice/` and `workload.slice/` with their sub-slices.
106    fn create_hierarchy(&self) -> Result<(), CgroupError>;
107
108    /// Create a scoped cgroup for a service or allocation.
109    ///
110    /// Returns a handle for process placement. The scope is created under
111    /// `parent_slice` with the given `name` and resource limits applied.
112    ///
113    /// # Errors
114    ///
115    /// Returns [`CgroupError::CreationFailed`] if the scope cannot be created.
116    /// Returns [`CgroupError::PermissionDenied`] if the caller doesn't own the parent slice.
117    fn create_scope(
118        &self,
119        parent_slice: &str,
120        name: &str,
121        limits: &ResourceLimits,
122    ) -> Result<CgroupHandle, CgroupError>;
123
124    /// Kill all processes in a scope and release it.
125    ///
126    /// Uses `cgroup.kill` (Linux 5.14+) for immediate cleanup. No grace period
127    /// for child processes (PS3). Falls back to iterating `cgroup.procs` + SIGKILL
128    /// on older kernels.
129    ///
130    /// # Errors
131    ///
132    /// Returns [`CgroupError::KillFailed`] if processes cannot be killed (e.g., D-state).
133    /// The scope should be marked as zombie in this case (F30).
134    fn destroy_scope(&self, handle: &CgroupHandle) -> Result<(), CgroupError>;
135
136    /// Read metrics from any cgroup path.
137    ///
138    /// Shared read access across all slices (RI6) — no ownership check.
139    fn read_metrics(&self, path: &str) -> Result<CgroupMetrics, CgroupError>;
140
141    /// Check if a scope is empty (no processes).
142    ///
143    /// Used by the supervision loop to detect completed allocations (WI5).
144    fn is_scope_empty(&self, handle: &CgroupHandle) -> Result<bool, CgroupError>;
145}
146
147/// Errors from cgroup operations.
148#[derive(Debug, thiserror::Error)]
149pub enum CgroupError {
150    #[error("cgroup creation failed: {reason}")]
151    CreationFailed { reason: String },
152
153    #[error("cgroup.kill failed for {path}: {reason}")]
154    KillFailed { path: String, reason: String },
155
156    #[error("cgroup path not found: {path}")]
157    NotFound { path: String },
158
159    #[error("permission denied: {path} owned by {owner:?}")]
160    PermissionDenied { path: String, owner: SliceOwner },
161
162    #[error("cgroup I/O error: {0}")]
163    Io(#[from] std::io::Error),
164}
165
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every path in `paths` is classified as `expected`.
    fn assert_owner(paths: &[&str], expected: Option<SliceOwner>) {
        for path in paths {
            assert_eq!(slice_owner(path), expected, "path: {path:?}");
        }
    }

    #[test]
    fn slice_owner_pact() {
        assert_owner(
            &[
                slices::PACT_ROOT,
                slices::PACT_INFRA,
                slices::PACT_GPU,
                slices::PACT_NETWORK,
                slices::PACT_AUDIT,
            ],
            Some(SliceOwner::Pact),
        );
    }

    #[test]
    fn slice_owner_workload() {
        assert_owner(
            &[slices::WORKLOAD_ROOT, "workload.slice/alloc-42"],
            Some(SliceOwner::Workload),
        );
    }

    #[test]
    fn slice_owner_unknown() {
        assert_owner(&["system.slice", "", "/sys/fs/cgroup"], None);
    }

    #[test]
    fn slice_owner_nested_paths() {
        // Deep nesting still resolves to the root owner.
        assert_owner(
            &["pact.slice/infra.slice/chronyd.scope"],
            Some(SliceOwner::Pact),
        );
        assert_owner(
            &["workload.slice/alloc-42/task-1.scope"],
            Some(SliceOwner::Workload),
        );
    }

    #[test]
    fn slice_owner_name_must_be_at_start() {
        // The slice name appearing later in the string is not enough.
        assert_owner(&["not-pact.slice/foo"], None);
    }

    #[test]
    fn slice_owner_is_plain_prefix_match() {
        // Pins current behaviour: matching uses `starts_with`, so a sibling
        // name that merely shares the prefix is classified like the slice.
        assert_owner(&["workload.slice-extra"], Some(SliceOwner::Workload));
    }

    #[test]
    fn slice_owner_serialization() {
        for owner in [SliceOwner::Pact, SliceOwner::Workload] {
            let json = serde_json::to_string(&owner).unwrap();
            let deser: SliceOwner = serde_json::from_str(&json).unwrap();
            assert_eq!(deser, owner);
        }
    }

    #[test]
    fn resource_limits_default() {
        let limits = ResourceLimits::default();
        assert!(limits.memory_max.is_none());
        assert!(limits.cpu_weight.is_none());
        assert!(limits.io_max.is_none());
    }

    #[test]
    fn resource_limits_with_values() {
        let limits = ResourceLimits {
            memory_max: Some(512 * 1024 * 1024), // 512 MB
            cpu_weight: Some(200),
            io_max: Some(100_000_000),
        };
        assert_eq!(limits.memory_max, Some(536_870_912));
        assert_eq!(limits.cpu_weight, Some(200));
        assert_eq!(limits.io_max, Some(100_000_000));
    }

    #[test]
    fn resource_limits_serialization_roundtrip() {
        let limits = ResourceLimits {
            memory_max: Some(1024),
            cpu_weight: Some(500),
            io_max: None,
        };
        let json = serde_json::to_string(&limits).unwrap();
        let deser: ResourceLimits = serde_json::from_str(&json).unwrap();
        assert_eq!(deser.memory_max, Some(1024));
        assert_eq!(deser.cpu_weight, Some(500));
        assert!(deser.io_max.is_none());
    }

    #[test]
    fn cgroup_handle_path() {
        let handle = CgroupHandle {
            path: "/sys/fs/cgroup/pact.slice/gpu.slice/nvidia-persistenced".to_string(),
        };
        assert!(handle.path.contains("pact.slice"));
    }

    #[test]
    fn cgroup_metrics_default() {
        let metrics = CgroupMetrics::default();
        assert_eq!(metrics.memory_current, 0);
        assert!(metrics.memory_max.is_none());
        assert_eq!(metrics.cpu_usage_usec, 0);
        assert_eq!(metrics.nr_processes, 0);
    }

    #[test]
    fn cgroup_error_display() {
        // Every variant is checked against its exact rendered message.
        let err = CgroupError::CreationFailed {
            reason: "no space".to_string(),
        };
        assert_eq!(err.to_string(), "cgroup creation failed: no space");

        let err = CgroupError::KillFailed {
            path: "/sys/fs/cgroup/test".to_string(),
            reason: "D-state".to_string(),
        };
        assert_eq!(
            err.to_string(),
            "cgroup.kill failed for /sys/fs/cgroup/test: D-state"
        );

        let err = CgroupError::NotFound {
            path: "workload.slice/missing".to_string(),
        };
        assert_eq!(
            err.to_string(),
            "cgroup path not found: workload.slice/missing"
        );

        let err = CgroupError::PermissionDenied {
            path: "workload.slice".to_string(),
            owner: SliceOwner::Workload,
        };
        assert_eq!(
            err.to_string(),
            "permission denied: workload.slice owned by Workload"
        );
    }

    #[test]
    fn cgroup_error_from_io() {
        // `#[from]` must provide the `From<std::io::Error>` conversion.
        let io = std::io::Error::new(std::io::ErrorKind::Other, "disk gone");
        let err: CgroupError = io.into();
        assert!(matches!(err, CgroupError::Io(_)));
        assert_eq!(err.to_string(), "cgroup I/O error: disk gone");
    }

    // Mock implementation to verify the trait is implementable.
    struct MockCgroupManager;

    impl CgroupManager for MockCgroupManager {
        fn create_hierarchy(&self) -> Result<(), CgroupError> {
            Ok(())
        }

        fn create_scope(
            &self,
            parent_slice: &str,
            name: &str,
            _limits: &ResourceLimits,
        ) -> Result<CgroupHandle, CgroupError> {
            Ok(CgroupHandle {
                path: format!("{parent_slice}/{name}.scope"),
            })
        }

        fn destroy_scope(&self, _handle: &CgroupHandle) -> Result<(), CgroupError> {
            Ok(())
        }

        fn read_metrics(&self, _path: &str) -> Result<CgroupMetrics, CgroupError> {
            Ok(CgroupMetrics::default())
        }

        fn is_scope_empty(&self, _handle: &CgroupHandle) -> Result<bool, CgroupError> {
            Ok(true)
        }
    }

    #[test]
    fn mock_cgroup_manager_lifecycle() {
        let mgr = MockCgroupManager;
        mgr.create_hierarchy().unwrap();

        let handle = mgr
            .create_scope(
                slices::PACT_GPU,
                "nvidia-persistenced",
                &ResourceLimits::default(),
            )
            .unwrap();
        assert_eq!(
            handle.path,
            "pact.slice/gpu.slice/nvidia-persistenced.scope"
        );

        assert!(mgr.is_scope_empty(&handle).unwrap());

        let metrics = mgr.read_metrics(&handle.path).unwrap();
        assert_eq!(metrics.nr_processes, 0);

        mgr.destroy_scope(&handle).unwrap();
    }

    #[test]
    fn mock_cgroup_manager_permission_denied() {
        // A manager acting as pact: it refuses any slice with a known owner
        // other than `SliceOwner::Pact`.
        struct StrictMockCgroupManager;

        impl CgroupManager for StrictMockCgroupManager {
            fn create_hierarchy(&self) -> Result<(), CgroupError> {
                Ok(())
            }

            fn create_scope(
                &self,
                parent_slice: &str,
                _name: &str,
                _limits: &ResourceLimits,
            ) -> Result<CgroupHandle, CgroupError> {
                match slice_owner(parent_slice) {
                    Some(owner) if owner != SliceOwner::Pact => {
                        Err(CgroupError::PermissionDenied {
                            path: parent_slice.to_string(),
                            owner,
                        })
                    }
                    _ => Ok(CgroupHandle {
                        path: format!("{parent_slice}/test.scope"),
                    }),
                }
            }

            fn destroy_scope(&self, _handle: &CgroupHandle) -> Result<(), CgroupError> {
                Ok(())
            }

            fn read_metrics(&self, _path: &str) -> Result<CgroupMetrics, CgroupError> {
                Ok(CgroupMetrics::default())
            }

            fn is_scope_empty(&self, _handle: &CgroupHandle) -> Result<bool, CgroupError> {
                Ok(true)
            }
        }

        let mgr = StrictMockCgroupManager;

        // Pact-owned slice: allowed.
        let handle = mgr
            .create_scope(slices::PACT_INFRA, "test", &ResourceLimits::default())
            .unwrap();
        assert_eq!(handle.path, "pact.slice/infra.slice/test.scope");

        // Workload-owned slice: denied (RI1), and the error names the slice
        // and its owner.
        let err = mgr
            .create_scope(slices::WORKLOAD_ROOT, "test", &ResourceLimits::default())
            .unwrap_err();
        match err {
            CgroupError::PermissionDenied { path, owner } => {
                assert_eq!(path, slices::WORKLOAD_ROOT);
                assert_eq!(owner, SliceOwner::Workload);
            }
            other => panic!("expected PermissionDenied, got {other:?}"),
        }
    }
}
405            .create_scope(slices::WORKLOAD_ROOT, "test", &ResourceLimits::default())
406            .unwrap_err();
407        assert!(matches!(err, CgroupError::PermissionDenied { .. }));
408    }
409}