Skip to main content

libcgroups/v2/
manager.rs

1use std::fs::{self};
2use std::os::unix::fs::PermissionsExt;
3use std::path::Component::RootDir;
4use std::path::{Path, PathBuf};
5use std::time::Duration;
6
7use nix::errno::Errno;
8use nix::unistd::Pid;
9
10use super::controller::Controller;
11use super::controller_type::{
12    CONTROLLER_TYPES, ControllerType, PSEUDO_CONTROLLER_TYPES, PseudoControllerType,
13};
14use super::cpu::{Cpu, V2CpuControllerError, V2CpuStatsError};
15use super::cpuset::CpuSet;
16#[cfg(feature = "cgroupsv2_devices")]
17use super::devices::Devices;
18use super::freezer::{Freezer, V2FreezerError};
19use super::hugetlb::{HugeTlb, V2HugeTlbControllerError, V2HugeTlbStatsError};
20use super::io::{Io, V2IoControllerError, V2IoStatsError};
21use super::memory::{Memory, V2MemoryControllerError, V2MemoryStatsError};
22use super::pids::Pids;
23use super::unified::{Unified, V2UnifiedError};
24use super::util::{self, CGROUP_SUBTREE_CONTROL, V2UtilError};
25use crate::common::{
26    self, AnyCgroupManager, CGROUP_PROCS, CgroupManager, ControllerOpt, FreezerState,
27    JoinSafelyError, PathBufExt, WrapIoResult, WrappedIoError,
28};
29use crate::stats::{PidStatsError, Stats, StatsProvider};
30
/// File name, relative to a cgroup directory, of the kill control file.
///
/// When this file exists in the managed cgroup, `Manager::remove` writes `1`
/// to it instead of signalling each pid listed in `cgroup.procs` individually.
pub const CGROUP_KILL: &str = "cgroup.kill";
32
/// Error type returned by the cgroup v2 [`Manager`].
///
/// Most variants wrap the error type of one controller or stats provider and
/// are produced automatically through `?` via their `#[from]` conversion.
/// Variants without `#[from]` must be constructed explicitly by the caller.
#[derive(thiserror::Error, Debug)]
pub enum V2ManagerError {
    /// A filesystem operation failed; the wrapped error carries the details.
    #[error("io error: {0}")]
    WrappedIo(#[from] WrappedIoError),
    /// Joining the cgroup path onto the root path was rejected.
    #[error("while joining paths: {0}")]
    JoinSafely(#[from] JoinSafelyError),
    /// Failure reported by the v2 utility helpers.
    #[error(transparent)]
    Util(#[from] V2UtilError),

    // --- controller errors ---
    /// Failure reported by the cpu controller.
    #[error(transparent)]
    CpuController(#[from] V2CpuControllerError),
    /// I/O failure attributed to the cpuset controller. No `From` conversion
    /// is derived, so this variant has to be constructed explicitly.
    #[error(transparent)]
    CpuSetController(WrappedIoError),
    /// Failure reported by the hugetlb controller.
    #[error(transparent)]
    HugeTlbController(#[from] V2HugeTlbControllerError),
    /// Failure reported by the io controller.
    #[error(transparent)]
    IoController(#[from] V2IoControllerError),
    /// Failure reported by the memory controller.
    #[error(transparent)]
    MemoryController(#[from] V2MemoryControllerError),
    /// I/O failure attributed to the pids controller. No `From` conversion
    /// is derived, so this variant has to be constructed explicitly.
    #[error(transparent)]
    PidsController(WrappedIoError),
    /// Failure reported by the unified pseudo controller.
    #[error(transparent)]
    UnifiedController(#[from] V2UnifiedError),
    /// Failure reported by the freezer.
    #[error(transparent)]
    FreezerController(#[from] V2FreezerError),
    /// Failure reported by the devices controller (only with the
    /// `cgroupsv2_devices` feature).
    #[cfg(feature = "cgroupsv2_devices")]
    #[error(transparent)]
    DevicesController(#[from] super::devices::controller::DevicesControllerError),

    // --- stats errors ---
    /// Failure while collecting cpu statistics.
    #[error(transparent)]
    CpuStats(#[from] V2CpuStatsError),
    /// Failure while collecting hugetlb statistics.
    #[error(transparent)]
    HugeTlbStats(#[from] V2HugeTlbStatsError),
    /// Failure while collecting pids statistics. No `From` conversion is
    /// derived; `Manager::stats` maps into this variant explicitly.
    #[error(transparent)]
    PidsStats(PidStatsError),
    /// Failure while collecting memory statistics.
    #[error(transparent)]
    MemoryStats(#[from] V2MemoryStatsError),
    /// Failure while collecting io statistics.
    #[error(transparent)]
    IoStats(#[from] V2IoStatsError),
}
73
/// Represents a management interface for a cgroup located at `{root_path}/{cgroup_path}`
///
/// This struct does not have ownership of the cgroup
pub struct Manager {
    /// Mount point of the cgroup v2 filesystem, as passed to `Manager::new`.
    root_path: PathBuf,
    /// Path of the managed cgroup relative to `root_path`, as passed to
    /// `Manager::new`.
    cgroup_path: PathBuf,
    /// Result of safely joining `cgroup_path` onto `root_path`; computed once
    /// in `Manager::new` and used for every control-file access.
    full_path: PathBuf,
}
82
83impl Manager {
84    /// Constructs a new cgroup manager with root path being the mount point
85    /// of a cgroup v2 fs and cgroup path being a relative path from the root
86    pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result<Self, V2ManagerError> {
87        let full_path = root_path.join_safely(&cgroup_path)?;
88
89        Ok(Self {
90            root_path,
91            cgroup_path,
92            full_path,
93        })
94    }
95
96    /// Creates a unified cgroup at `self.full_path` and attaches a process to it
97    fn create_unified_cgroup(&self, pid: Pid) -> Result<(), V2ManagerError> {
98        let controllers: Vec<String> = util::get_available_controllers(&self.root_path)?
99            .iter()
100            .map(|c| format!("+{c}"))
101            .collect();
102
103        // Note: we intentionally do NOT write controllers to `self.root_path` here.
104        // In nested scenarios (running inside a container where the host's root
105        // cgroup is owned by host systemd), writing to the root's
106        // `cgroup.subtree_control` fails with EROFS because the file is on a
107        // read-only view from our PoV. Any ancestor up to root must already
108        // have the relevant controllers enabled — otherwise our process could
109        // not be executing inside that cgroup hierarchy in the first place. We
110        // only enable controllers on path components we ourselves create.
111
112        let mut current_path = self.root_path.clone();
113        let mut components = self
114            .cgroup_path
115            .components()
116            .filter(|c| c.ne(&RootDir))
117            .peekable();
118        while let Some(component) = components.next() {
119            current_path = current_path.join(component);
120            let we_created = if !current_path.exists() {
121                fs::create_dir(&current_path).wrap_create_dir(&current_path)?;
122                fs::metadata(&current_path)
123                    .wrap_other(&current_path)?
124                    .permissions()
125                    .set_mode(0o755);
126                true
127            } else {
128                false
129            };
130
131            // last component cannot have subtree_control enabled due to internal process constraint
132            // if this were set, writing to the cgroups.procs file will fail with Erno 16 (device or resource busy)
133            if components.peek().is_some() {
134                match Self::write_controllers(&current_path, &controllers) {
135                    Ok(()) => {}
136                    Err(e) if !we_created && Self::is_inherited_ancestor_unwritable(&e) => {
137                        // Pre-existing ancestor owned by a parent cgroup
138                        // manager (e.g. host systemd, the outer container's
139                        // runtime). The write can fail in two distinct ways
140                        // depending on how that ancestor is exposed:
141                        //   * EROFS  — nested in a container whose cgroupfs
142                        //     view is read-only (cgroupns=private, the host's
143                        //     root cgroup is owned by host systemd).
144                        //   * EACCES — running as a regular user under
145                        //     systemd, where the ancestor cgroup directory
146                        //     (e.g. /sys/fs/cgroup/user.slice) is root-owned
147                        //     and its subtree_control file is mode 0644.
148                        // In both cases the ancestor predates this process; if
149                        // its subtree_control genuinely needed updating we
150                        // could not do it anyway, and our PARENT cgroup would
151                        // already have failed if controllers were truly
152                        // missing — because we could not be running in this
153                        // hierarchy in the first place. Skip silently.
154                        tracing::debug!(
155                            path = ?current_path,
156                            "skipping subtree_control write on pre-existing unwritable ancestor",
157                        );
158                    }
159                    Err(e) => return Err(e.into()),
160                }
161            }
162        }
163
164        common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
165        Ok(())
166    }
167
168    /// Returns true if the wrapped IO error indicates the target cgroup file
169    /// is owned by a parent cgroup manager that this process cannot modify.
170    ///
171    /// Two distinct errnos express this condition:
172    ///   * `EROFS`  — the file lives on a read-only view of cgroupfs (typical
173    ///     of `cgroupns=private` containers whose root cgroup is owned by the
174    ///     host).
175    ///   * `EACCES` — the file is writable in principle but its DAC owner is
176    ///     someone else (typical of rootless invocations under systemd where
177    ///     ancestor slices like `user.slice` are root-owned mode 0644).
178    ///
179    /// Callers use this to swallow `cgroup.subtree_control` write failures on
180    /// ancestor cgroup directories that pre-existed our process — controllers
181    /// in those ancestors are already enabled by whatever manager owns them,
182    /// otherwise we could not be running inside this hierarchy at all.
183    fn is_inherited_ancestor_unwritable(err: &WrappedIoError) -> bool {
184        matches!(
185            err.inner().raw_os_error().map(Errno::from_raw),
186            Some(Errno::EROFS) | Some(Errno::EACCES)
187        )
188    }
189
190    /// Writes a list of controllers to the `{path}/cgroup.subtree_control` file
191    fn write_controllers(path: &Path, controllers: &[String]) -> Result<(), WrappedIoError> {
192        for controller in controllers {
193            common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?;
194        }
195
196        Ok(())
197    }
198
199    pub fn any(self) -> AnyCgroupManager {
200        AnyCgroupManager::V2(self)
201    }
202}
203
204impl CgroupManager for Manager {
205    type Error = V2ManagerError;
206
207    fn add_task(&self, pid: Pid) -> Result<(), Self::Error> {
208        if self.full_path.exists() {
209            common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
210            return Ok(());
211        }
212        self.create_unified_cgroup(pid)?;
213        Ok(())
214    }
215
216    fn apply(&self, controller_opt: &ControllerOpt) -> Result<(), Self::Error> {
217        for controller in CONTROLLER_TYPES {
218            match controller {
219                ControllerType::Cpu => Cpu::apply(controller_opt, &self.full_path)?,
220                ControllerType::CpuSet => CpuSet::apply(controller_opt, &self.full_path)?,
221                ControllerType::HugeTlb => HugeTlb::apply(controller_opt, &self.full_path)?,
222                ControllerType::Io => Io::apply(controller_opt, &self.full_path)?,
223                ControllerType::Memory => Memory::apply(controller_opt, &self.full_path)?,
224                ControllerType::Pids => Pids::apply(controller_opt, &self.full_path)?,
225            }
226        }
227
228        #[cfg(feature = "cgroupsv2_devices")]
229        Devices::apply(controller_opt, &self.full_path)?;
230
231        for pseudoctlr in PSEUDO_CONTROLLER_TYPES {
232            if let PseudoControllerType::Unified = pseudoctlr {
233                Unified::apply(
234                    controller_opt,
235                    &self.full_path,
236                    util::get_available_controllers(&self.root_path)?,
237                )?;
238            }
239        }
240
241        Ok(())
242    }
243
244    fn remove(&self) -> Result<(), Self::Error> {
245        if self.full_path.exists() {
246            tracing::debug!("remove cgroup {:?}", self.full_path);
247            let kill_file = self.full_path.join(CGROUP_KILL);
248            if kill_file.exists() {
249                fs::write(&kill_file, "1").wrap_write(&kill_file, "1")?;
250            } else {
251                let procs_path = self.full_path.join(CGROUP_PROCS);
252                let procs = fs::read_to_string(&procs_path).wrap_read(&procs_path)?;
253
254                for line in procs.lines() {
255                    let pid: i32 = line
256                        .parse()
257                        .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))
258                        .wrap_other(&procs_path)?;
259                    let _ = nix::sys::signal::kill(Pid::from_raw(pid), nix::sys::signal::SIGKILL);
260                }
261            }
262
263            common::delete_with_retry(&self.full_path, 4, Duration::from_millis(100))?;
264        }
265
266        Ok(())
267    }
268
269    fn freeze(&self, state: FreezerState) -> Result<(), Self::Error> {
270        let controller_opt = ControllerOpt {
271            resources: &Default::default(),
272            freezer_state: Some(state),
273            oom_score_adj: None,
274            disable_oom_killer: false,
275        };
276        Ok(Freezer::apply(&controller_opt, &self.full_path)?)
277    }
278
279    fn stats(&self) -> Result<Stats, Self::Error> {
280        let mut stats = Stats::default();
281
282        for subsystem in CONTROLLER_TYPES {
283            match subsystem {
284                ControllerType::Cpu => stats.cpu = Cpu::stats(&self.full_path)?,
285                ControllerType::HugeTlb => stats.hugetlb = HugeTlb::stats(&self.full_path)?,
286                ControllerType::Pids => {
287                    stats.pids = Pids::stats(&self.full_path).map_err(V2ManagerError::PidsStats)?
288                }
289                ControllerType::Memory => stats.memory = Memory::stats(&self.full_path)?,
290                ControllerType::Io => stats.blkio = Io::stats(&self.full_path)?,
291                _ => continue,
292            }
293        }
294
295        Ok(stats)
296    }
297
298    fn get_all_pids(&self) -> Result<Vec<Pid>, Self::Error> {
299        Ok(common::get_all_pids(&self.full_path)?)
300    }
301}
302
#[cfg(test)]
mod tests {
    use std::fs;

    use super::*;
    use crate::test::set_fixture;
    use crate::v2::util::CGROUP_CONTROLLERS;

    /// Only EROFS (read-only cgroupfs view in a nested container) and EACCES
    /// (root-owned ancestor slice in a rootless setup) may be classified as
    /// "inherited ancestor is unwritable". Any other errno has to be reported
    /// as a real failure, regardless of which `WrappedIoError` variant
    /// carries it.
    #[test]
    fn is_inherited_ancestor_unwritable_matches_both_errnos() {
        let subtree_control = || PathBuf::from("/some/cgroup/cgroup.subtree_control");
        let write_failure = |errno: Errno| WrappedIoError::Write {
            err: std::io::Error::from_raw_os_error(errno as i32),
            path: subtree_control(),
            data: "+cpu".into(),
        };

        // Tolerated errnos.
        assert!(Manager::is_inherited_ancestor_unwritable(&write_failure(
            Errno::EROFS
        )));
        assert!(Manager::is_inherited_ancestor_unwritable(&write_failure(
            Errno::EACCES
        )));

        // Everything else must not be swallowed.
        assert!(!Manager::is_inherited_ancestor_unwritable(&write_failure(
            Errno::ENOENT
        )));

        let open_failure = WrappedIoError::Open {
            err: std::io::Error::from_raw_os_error(Errno::EBUSY as i32),
            path: subtree_control(),
        };
        assert!(!Manager::is_inherited_ancestor_unwritable(&open_failure));
    }

    /// Happy-path run of `create_unified_cgroup` against a writable fake
    /// cgroupfs inside a temporary directory.
    ///
    /// Regression guard: `create_unified_cgroup` must not write to the root's
    /// `cgroup.subtree_control`. The fixture deliberately omits
    /// `<root>/cgroup.subtree_control`, so an unconditional write to it would
    /// abort with ENOENT.
    ///
    /// Fixture layout (everything is pre-created by the test):
    ///     <root>/cgroup.controllers             -> "cpu memory pids"
    ///     <root>/parent/cgroup.subtree_control  -> ""
    ///     <root>/parent/leaf/cgroup.procs       -> ""
    #[test]
    fn create_unified_cgroup_skips_root_subtree_control_write() {
        let scratch = tempfile::tempdir().expect("create temp dir");
        let cgroup_root = scratch.path();
        let parent_dir = cgroup_root.join("parent");
        let leaf_dir = parent_dir.join("leaf");

        // Source of the controller list read by `get_available_controllers`.
        set_fixture(cgroup_root, CGROUP_CONTROLLERS, "cpu memory pids")
            .expect("write cgroup.controllers");

        // Existing ancestor whose subtree_control file accepts writes.
        fs::create_dir(&parent_dir).expect("create parent dir");
        set_fixture(&parent_dir, CGROUP_SUBTREE_CONTROL, "").expect("write parent subtree_control");

        // The final `cgroup.procs` write opens an existing file, and a file
        // cannot exist before its directory does. The leaf is therefore
        // created up front as well, which sends the leaf through the
        // "already exists" branch. That is acceptable: no subtree_control
        // write is attempted on the last path component.
        fs::create_dir(&leaf_dir).expect("create leaf dir");
        set_fixture(&leaf_dir, CGROUP_PROCS, "").expect("write leaf cgroup.procs");

        let manager = Manager::new(cgroup_root.to_path_buf(), PathBuf::from("/parent/leaf"))
            .expect("construct manager");

        // Pid 0 is sufficient here: only the bytes landing in the file matter.
        let outcome = manager.create_unified_cgroup(Pid::from_raw(0));
        outcome.expect("create_unified_cgroup succeeds when root subtree_control is absent");

        // The pid handed to the manager must now be the file's content.
        let written = fs::read_to_string(leaf_dir.join(CGROUP_PROCS)).expect("read cgroup.procs");
        assert_eq!(written.trim(), "0");
    }
}