Skip to main content

libcgroups/v2/
manager.rs

1use std::fs::{self};
2use std::os::unix::fs::PermissionsExt;
3use std::path::Component::RootDir;
4use std::path::{Path, PathBuf};
5use std::time::Duration;
6
7use nix::errno::Errno;
8use nix::unistd::Pid;
9
10use super::controller::Controller;
11use super::controller_type::{
12    CONTROLLER_TYPES, ControllerType, PSEUDO_CONTROLLER_TYPES, PseudoControllerType,
13};
14use super::cpu::{Cpu, V2CpuControllerError, V2CpuStatsError};
15use super::cpuset::CpuSet;
16#[cfg(feature = "cgroupsv2_devices")]
17use super::devices::Devices;
18use super::freezer::{Freezer, V2FreezerError};
19use super::hugetlb::{HugeTlb, V2HugeTlbControllerError, V2HugeTlbStatsError};
20use super::io::{Io, V2IoControllerError, V2IoStatsError};
21use super::memory::{Memory, V2MemoryControllerError, V2MemoryStatsError};
22use super::pids::Pids;
23use super::unified::{Unified, V2UnifiedError};
24use super::util::{self, CGROUP_SUBTREE_CONTROL, V2UtilError};
25use crate::common::{
26    self, AnyCgroupManager, CGROUP_PROCS, CgroupManager, ControllerOpt, FreezerState,
27    JoinSafelyError, PathBufExt, WrapIoResult, WrappedIoError,
28};
29use crate::stats::{PidStatsError, Stats, StatsProvider};
30
/// File name of the cgroup v2 `cgroup.kill` interface file.
///
/// `Manager::remove` writes `"1"` to this file when it exists inside the
/// cgroup directory; when it does not exist, `remove` falls back to sending
/// `SIGKILL` to every pid listed in `cgroup.procs`.
pub const CGROUP_KILL: &str = "cgroup.kill";
32
/// Errors returned by the cgroup v2 [`Manager`].
///
/// Variants marked `#[from]` are produced implicitly by `?`. The variants
/// without it (`CpuSetController`, `PidsController`, `PidsStats`) must be
/// constructed explicitly, e.g. with `map_err`.
#[derive(thiserror::Error, Debug)]
pub enum V2ManagerError {
    /// An I/O operation on a cgroup file or directory failed.
    #[error("io error: {0}")]
    WrappedIo(#[from] WrappedIoError),
    /// Joining the root path and the cgroup path failed.
    #[error("while joining paths: {0}")]
    JoinSafely(#[from] JoinSafelyError),
    /// Error reported by the helpers in the `util` module.
    #[error(transparent)]
    Util(#[from] V2UtilError),

    /// Error reported by the cpu controller.
    #[error(transparent)]
    CpuController(#[from] V2CpuControllerError),
    /// Error reported by the cpuset controller. It wraps the same type as
    /// `WrappedIo`, so it cannot also be `#[from]`.
    #[error(transparent)]
    CpuSetController(WrappedIoError),
    /// Error reported by the hugetlb controller.
    #[error(transparent)]
    HugeTlbController(#[from] V2HugeTlbControllerError),
    /// Error reported by the io controller.
    #[error(transparent)]
    IoController(#[from] V2IoControllerError),
    /// Error reported by the memory controller.
    #[error(transparent)]
    MemoryController(#[from] V2MemoryControllerError),
    /// Error reported by the pids controller. It wraps the same type as
    /// `WrappedIo`, so it cannot also be `#[from]`.
    #[error(transparent)]
    PidsController(WrappedIoError),
    /// Error reported by the unified pseudo controller.
    #[error(transparent)]
    UnifiedController(#[from] V2UnifiedError),
    /// Error reported by the freezer.
    #[error(transparent)]
    FreezerController(#[from] V2FreezerError),
    /// Error reported by the devices controller (only with the
    /// `cgroupsv2_devices` feature).
    #[cfg(feature = "cgroupsv2_devices")]
    #[error(transparent)]
    DevicesController(#[from] super::devices::controller::DevicesControllerError),

    /// Collecting cpu statistics failed.
    #[error(transparent)]
    CpuStats(#[from] V2CpuStatsError),
    /// Collecting hugetlb statistics failed.
    #[error(transparent)]
    HugeTlbStats(#[from] V2HugeTlbStatsError),
    /// Collecting pids statistics failed.
    #[error(transparent)]
    PidsStats(PidStatsError),
    /// Collecting memory statistics failed.
    #[error(transparent)]
    MemoryStats(#[from] V2MemoryStatsError),
    /// Collecting io statistics failed.
    #[error(transparent)]
    IoStats(#[from] V2IoStatsError),
}
73
/// Represents a management interface for a cgroup located at `{root_path}/{cgroup_path}`
///
/// This struct does not have ownership of the cgroup
pub struct Manager {
    /// Mount point of the cgroup v2 filesystem.
    root_path: PathBuf,
    /// Path of the managed cgroup, relative to `root_path`.
    cgroup_path: PathBuf,
    /// `root_path` joined with `cgroup_path` via `join_safely`; computed once
    /// in `Manager::new`.
    full_path: PathBuf,
}
82
83impl Manager {
84    /// Constructs a new cgroup manager with root path being the mount point
85    /// of a cgroup v2 fs and cgroup path being a relative path from the root
86    pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result<Self, V2ManagerError> {
87        let full_path = root_path.join_safely(&cgroup_path)?;
88
89        Ok(Self {
90            root_path,
91            cgroup_path,
92            full_path,
93        })
94    }
95
96    /// Creates a unified cgroup at `self.full_path` and attaches a process to it
97    fn create_unified_cgroup(&self, pid: Pid) -> Result<(), V2ManagerError> {
98        let controllers: Vec<String> = util::get_available_controllers(&self.root_path)?
99            .iter()
100            .map(|c| format!("+{c}"))
101            .collect();
102
103        // Note: we intentionally do NOT write controllers to `self.root_path` here.
104        // In nested scenarios (running inside a container where the host's root
105        // cgroup is owned by host systemd), writing to the root's
106        // `cgroup.subtree_control` fails because the file is on a read-only
107        // view from our PoV or owned by another manager. Any ancestor up to
108        // root must already have the relevant controllers enabled — otherwise
109        // our process could not be executing inside that cgroup hierarchy in
110        // the first place. We only enable controllers on path components we
111        // ourselves create; for path components that pre-existed our process
112        // we tolerate per-controller write failures via
113        // `is_subtree_control_per_controller_failure`.
114
115        let mut current_path = self.root_path.clone();
116        let mut components = self
117            .cgroup_path
118            .components()
119            .filter(|c| c.ne(&RootDir))
120            .peekable();
121        while let Some(component) = components.next() {
122            current_path = current_path.join(component);
123            let we_created = if !current_path.exists() {
124                fs::create_dir(&current_path).wrap_create_dir(&current_path)?;
125                fs::metadata(&current_path)
126                    .wrap_other(&current_path)?
127                    .permissions()
128                    .set_mode(0o755);
129                true
130            } else {
131                false
132            };
133
134            // last component cannot have subtree_control enabled due to internal process constraint
135            // if this were set, writing to the cgroups.procs file will fail with Erno 16 (device or resource busy)
136            if components.peek().is_some() {
137                // When `we_created=true`, we own the cgroup and any failure is
138                // a real bug. When `we_created=false`, the cgroup predates us
139                // (host systemd, outer container runtime) and per-controller
140                // failures on its subtree_control are expected and tolerable
141                // — see `is_subtree_control_per_controller_failure` for the
142                // errno set we silently skip. This matches the behavior of
143                // both runc (opencontainers/cgroups fs2/create.go
144                // CreateCgroupPath) and crun (containers/crun
145                // libcrun/cgroup-utils.c enable_controllers).
146                Self::write_controllers(&current_path, &controllers, /*strict=*/ we_created)?;
147            }
148        }
149
150        common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
151        Ok(())
152    }
153
154    /// Returns true if the wrapped IO error indicates that a single
155    /// `+controller` write to a `cgroup.subtree_control` file should be
156    /// silently skipped when the caller does not own the cgroup. Mirrors the
157    /// errno allowlist used by crun's `enable_controllers`
158    /// (containers/crun libcrun/cgroup-utils.c) and the unconditional
159    /// per-controller swallow in runc's `CreateCgroupPath`
160    /// (opencontainers/cgroups fs2/create.go).
161    ///
162    ///   * `EROFS`      — read-only cgroupfs view (`cgroupns=private` inside
163    ///     a container whose root cgroup is host-owned).
164    ///   * `EACCES`     — DAC owner is another user (e.g. root-owned ancestor
165    ///     slice under a rootless user session).
166    ///   * `ENOENT`     — the controller is not present in this cgroup's own
167    ///     `cgroup.controllers` (e.g. systemd did not delegate `hugetlb` to
168    ///     `user@.service`).
169    ///   * `EPERM`      — capability missing (similar to `EACCES`, errno
170    ///     varies by kernel path).
171    ///   * `EOPNOTSUPP` — controller exists but isn't supported in this
172    ///     hierarchy/configuration.
173    ///   * `EBUSY`      — controller is temporarily contended by another
174    ///     manager (transient; the parent manager will resolve).
175    fn is_subtree_control_per_controller_failure(err: &WrappedIoError) -> bool {
176        matches!(
177            err.inner().raw_os_error().map(Errno::from_raw),
178            Some(Errno::EROFS)
179                | Some(Errno::EACCES)
180                | Some(Errno::ENOENT)
181                | Some(Errno::EPERM)
182                | Some(Errno::EOPNOTSUPP)
183                | Some(Errno::EBUSY)
184        )
185    }
186
187    /// Writes a list of controllers to the `{path}/cgroup.subtree_control`
188    /// file, one at a time, so a single unsupported controller doesn't abort
189    /// the whole list.
190    ///
191    /// When `strict=true`, any per-controller failure is returned to the
192    /// caller. When `strict=false`, per-controller failures whose errno is in
193    /// the tolerated set (see
194    /// [`Self::is_subtree_control_per_controller_failure`]) are logged at
195    /// debug level and silently skipped. This matches the behavior of both
196    /// runc (opencontainers/cgroups fs2/create.go `CreateCgroupPath`) and
197    /// crun (containers/crun libcrun/cgroup-utils.c `enable_controllers`):
198    /// the kernel will reject controllers an ancestor cgroup doesn't itself
199    /// have enabled, and trying to enable a controller on a cgroup managed
200    /// by another manager (host systemd, outer container runtime) is normal
201    /// and not an error.
202    fn write_controllers(
203        path: &Path,
204        controllers: &[String],
205        strict: bool,
206    ) -> Result<(), WrappedIoError> {
207        for controller in controllers {
208            match common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller) {
209                Ok(()) => {}
210                Err(e) if !strict && Self::is_subtree_control_per_controller_failure(&e) => {
211                    tracing::debug!(
212                        path = ?path,
213                        controller = %controller,
214                        errno = ?e.inner().raw_os_error(),
215                        "skipping unsupported controller on pre-existing ancestor cgroup",
216                    );
217                }
218                Err(e) => return Err(e),
219            }
220        }
221
222        Ok(())
223    }
224
    /// Consumes this manager and wraps it in the `AnyCgroupManager::V2`
    /// variant.
    pub fn any(self) -> AnyCgroupManager {
        AnyCgroupManager::V2(self)
    }
228}
229
230impl CgroupManager for Manager {
231    type Error = V2ManagerError;
232
233    fn add_task(&self, pid: Pid) -> Result<(), Self::Error> {
234        if self.full_path.exists() {
235            common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
236            return Ok(());
237        }
238        self.create_unified_cgroup(pid)?;
239        Ok(())
240    }
241
242    fn apply(&self, controller_opt: &ControllerOpt) -> Result<(), Self::Error> {
243        for controller in CONTROLLER_TYPES {
244            match controller {
245                ControllerType::Cpu => Cpu::apply(controller_opt, &self.full_path)?,
246                ControllerType::CpuSet => CpuSet::apply(controller_opt, &self.full_path)?,
247                ControllerType::HugeTlb => HugeTlb::apply(controller_opt, &self.full_path)?,
248                ControllerType::Io => Io::apply(controller_opt, &self.full_path)?,
249                ControllerType::Memory => Memory::apply(controller_opt, &self.full_path)?,
250                ControllerType::Pids => Pids::apply(controller_opt, &self.full_path)?,
251            }
252        }
253
254        #[cfg(feature = "cgroupsv2_devices")]
255        Devices::apply(controller_opt, &self.full_path)?;
256
257        for pseudoctlr in PSEUDO_CONTROLLER_TYPES {
258            if let PseudoControllerType::Unified = pseudoctlr {
259                Unified::apply(
260                    controller_opt,
261                    &self.full_path,
262                    util::get_available_controllers(&self.root_path)?,
263                )?;
264            }
265        }
266
267        Ok(())
268    }
269
270    fn remove(&self) -> Result<(), Self::Error> {
271        if self.full_path.exists() {
272            tracing::debug!("remove cgroup {:?}", self.full_path);
273            let kill_file = self.full_path.join(CGROUP_KILL);
274            if kill_file.exists() {
275                fs::write(&kill_file, "1").wrap_write(&kill_file, "1")?;
276            } else {
277                let procs_path = self.full_path.join(CGROUP_PROCS);
278                let procs = fs::read_to_string(&procs_path).wrap_read(&procs_path)?;
279
280                for line in procs.lines() {
281                    let pid: i32 = line
282                        .parse()
283                        .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))
284                        .wrap_other(&procs_path)?;
285                    let _ = nix::sys::signal::kill(Pid::from_raw(pid), nix::sys::signal::SIGKILL);
286                }
287            }
288
289            common::delete_with_retry(&self.full_path, 4, Duration::from_millis(100))?;
290        }
291
292        Ok(())
293    }
294
295    fn freeze(&self, state: FreezerState) -> Result<(), Self::Error> {
296        let controller_opt = ControllerOpt {
297            resources: &Default::default(),
298            freezer_state: Some(state),
299            oom_score_adj: None,
300            disable_oom_killer: false,
301        };
302        Ok(Freezer::apply(&controller_opt, &self.full_path)?)
303    }
304
305    fn stats(&self) -> Result<Stats, Self::Error> {
306        let mut stats = Stats::default();
307
308        for subsystem in CONTROLLER_TYPES {
309            match subsystem {
310                ControllerType::Cpu => stats.cpu = Cpu::stats(&self.full_path)?,
311                ControllerType::HugeTlb => stats.hugetlb = HugeTlb::stats(&self.full_path)?,
312                ControllerType::Pids => {
313                    stats.pids = Pids::stats(&self.full_path).map_err(V2ManagerError::PidsStats)?
314                }
315                ControllerType::Memory => stats.memory = Memory::stats(&self.full_path)?,
316                ControllerType::Io => stats.blkio = Io::stats(&self.full_path)?,
317                _ => continue,
318            }
319        }
320
321        Ok(stats)
322    }
323
324    fn get_all_pids(&self) -> Result<Vec<Pid>, Self::Error> {
325        Ok(common::get_all_pids(&self.full_path)?)
326    }
327}
328
329#[cfg(test)]
330mod tests {
331    use std::fs;
332
333    use super::*;
334    use crate::test::set_fixture;
335    use crate::v2::util::CGROUP_CONTROLLERS;
336
337    /// `is_subtree_control_per_controller_failure` must match the full
338    /// crun-parity errno allowlist (EROFS, EACCES, ENOENT, EPERM,
339    /// EOPNOTSUPP, EBUSY) and must reject every other errno so we never
340    /// silently swallow legitimate write failures.
341    #[test]
342    fn is_subtree_control_per_controller_failure_matches() {
343        fn wrap(errno: Errno) -> WrappedIoError {
344            WrappedIoError::Write {
345                err: std::io::Error::from_raw_os_error(errno as i32),
346                path: PathBuf::from("/some/cgroup/cgroup.subtree_control"),
347                data: "+cpu".into(),
348            }
349        }
350
351        // Tolerated errnos — all must match.
352        assert!(Manager::is_subtree_control_per_controller_failure(&wrap(
353            Errno::EROFS
354        )));
355        assert!(Manager::is_subtree_control_per_controller_failure(&wrap(
356            Errno::EACCES
357        )));
358        assert!(Manager::is_subtree_control_per_controller_failure(&wrap(
359            Errno::ENOENT
360        )));
361        assert!(Manager::is_subtree_control_per_controller_failure(&wrap(
362            Errno::EPERM
363        )));
364        assert!(Manager::is_subtree_control_per_controller_failure(&wrap(
365            Errno::EOPNOTSUPP
366        )));
367        assert!(Manager::is_subtree_control_per_controller_failure(&wrap(
368            Errno::EBUSY
369        )));
370
371        // Untolerated errnos — must NOT match.
372        assert!(!Manager::is_subtree_control_per_controller_failure(&wrap(
373            Errno::ENOSPC
374        )));
375        assert!(!Manager::is_subtree_control_per_controller_failure(&wrap(
376            Errno::EINVAL
377        )));
378        assert!(!Manager::is_subtree_control_per_controller_failure(&wrap(
379            Errno::EIO
380        )));
381    }
382
383    /// End-to-end happy-path exercise of `create_unified_cgroup` against a
384    /// fully-writable fake cgroupfs in a tempdir. This guards against the
385    /// regression where removing the unconditional `write_controllers` on the
386    /// root path would have broken nested setups: the old code would have
387    /// required `root_path/cgroup.subtree_control` to exist and be writable,
388    /// while the new code intentionally skips that write.
389    ///
390    /// The fake layout is:
391    ///     <root>/cgroup.controllers             -> "cpu memory pids"
392    ///     <root>/parent/cgroup.subtree_control  -> "" (pre-existing)
393    ///     <root>/parent/leaf/                   -> created by create_unified_cgroup
394    ///     <root>/parent/leaf/cgroup.procs       -> pre-created so write succeeds
395    ///
396    /// Note we *do not* create `<root>/cgroup.subtree_control`; the old code
397    /// would have aborted with ENOENT trying to write it.
398    #[test]
399    fn create_unified_cgroup_skips_root_subtree_control_write() {
400        let tmp = tempfile::tempdir().expect("create temp dir");
401        let root = tmp.path();
402
403        // `get_available_controllers` reads this file.
404        set_fixture(root, CGROUP_CONTROLLERS, "cpu memory pids").expect("write cgroup.controllers");
405
406        // Pre-existing parent ancestor with a writable subtree_control file.
407        let parent = root.join("parent");
408        fs::create_dir(&parent).expect("create parent dir");
409        set_fixture(&parent, CGROUP_SUBTREE_CONTROL, "").expect("write parent subtree_control");
410
411        // We do *not* pre-create the leaf directory; create_unified_cgroup
412        // must mkdir it. However its `cgroup.procs` needs to exist for the
413        // final `write_cgroup_file` call to open it (create=false).
414        //
415        // Pre-creating the file before mkdir is impossible, so instead we
416        // wedge open by pre-creating the leaf dir + procs file (which
417        // means we exercise the `current_path.exists()` true branch for
418        // the leaf — that's fine because the leaf has no subtree_control
419        // write gate).
420        let leaf = parent.join("leaf");
421        fs::create_dir(&leaf).expect("create leaf dir");
422        set_fixture(&leaf, CGROUP_PROCS, "").expect("write leaf cgroup.procs");
423
424        let manager = Manager::new(root.to_path_buf(), PathBuf::from("/parent/leaf"))
425            .expect("construct manager");
426
427        // Pid 0 is fine for the test; we just need write_cgroup_file to
428        // round-trip the bytes into the file.
429        manager
430            .create_unified_cgroup(Pid::from_raw(0))
431            .expect("create_unified_cgroup succeeds when root subtree_control is absent");
432
433        // Sanity: the pid we wrote should be in the procs file.
434        let procs = fs::read_to_string(leaf.join(CGROUP_PROCS)).expect("read cgroup.procs");
435        assert_eq!(procs.trim(), "0");
436    }
437
438    /// `write_controllers` with `strict=true` (we own the cgroup) must
439    /// propagate any underlying write failure. `strict=false` (pre-existing
440    /// ancestor) must tolerate tolerated errnos. We can't inject specific
441    /// errnos through the tempfile-backed write path easily, but we CAN
442    /// verify the happy path round-trips in both strict modes against a
443    /// writable subtree_control file.
444    #[test]
445    fn write_controllers_happy_path_both_modes() {
446        let tmp = tempfile::tempdir().expect("create temp dir");
447        let dir = tmp.path();
448        set_fixture(dir, CGROUP_SUBTREE_CONTROL, "").expect("write subtree_control fixture");
449
450        let controllers = vec!["+cpu".to_string(), "+memory".to_string()];
451
452        // strict=true succeeds against a writable file.
453        Manager::write_controllers(dir, &controllers, /*strict=*/ true)
454            .expect("strict write_controllers succeeds on writable subtree_control");
455
456        // strict=false also succeeds against a writable file (happy path
457        // tolerance does not regress correctness).
458        Manager::write_controllers(dir, &controllers, /*strict=*/ false)
459            .expect("non-strict write_controllers succeeds on writable subtree_control");
460    }
461}