Skip to main content

libcgroups/v2/
manager.rs

1use std::fs::{self};
2use std::os::unix::fs::PermissionsExt;
3use std::path::Component::RootDir;
4use std::path::{Path, PathBuf};
5use std::time::Duration;
6
7use nix::errno::Errno;
8use nix::unistd::Pid;
9
10use super::controller::Controller;
11use super::controller_type::{
12    CONTROLLER_TYPES, ControllerType, PSEUDO_CONTROLLER_TYPES, PseudoControllerType,
13};
14use super::cpu::{Cpu, V2CpuControllerError, V2CpuStatsError};
15use super::cpuset::CpuSet;
16#[cfg(feature = "cgroupsv2_devices")]
17use super::devices::Devices;
18use super::freezer::{Freezer, V2FreezerError};
19use super::hugetlb::{HugeTlb, V2HugeTlbControllerError, V2HugeTlbStatsError};
20use super::io::{Io, V2IoControllerError, V2IoStatsError};
21use super::memory::{Memory, V2MemoryControllerError, V2MemoryStatsError};
22use super::pids::Pids;
23use super::unified::{Unified, V2UnifiedError};
24use super::util::{self, CGROUP_SUBTREE_CONTROL, V2UtilError};
25use crate::common::{
26    self, AnyCgroupManager, CGROUP_PROCS, CgroupManager, ControllerOpt, FreezerState,
27    JoinSafelyError, PathBufExt, WrapIoResult, WrappedIoError,
28};
29use crate::stats::{PidStatsError, Stats, StatsProvider};
30
/// File name of the per-cgroup kill interface. `Manager::remove` writes `1` to this
/// file when it exists, and falls back to signalling each pid individually otherwise.
pub const CGROUP_KILL: &str = "cgroup.kill";
32
/// Errors returned by the cgroup v2 [`Manager`].
///
/// Variants carrying `#[from]` are produced implicitly when the corresponding error is
/// propagated with `?`. Variants without `#[from]` (`CpuSetController`, `PidsController`,
/// `PidsStats`) have to be constructed explicitly, e.g. via `map_err`.
#[derive(thiserror::Error, Debug)]
pub enum V2ManagerError {
    // --- generic failures -------------------------------------------------
    /// An I/O operation failed. This is the only `#[from]` conversion for
    /// `WrappedIoError`, so a bare `?` on such an error lands here.
    #[error("io error: {0}")]
    WrappedIo(#[from] WrappedIoError),
    /// Joining the root path and the cgroup path failed.
    #[error("while joining paths: {0}")]
    JoinSafely(#[from] JoinSafelyError),
    /// Error reported by the v2 `util` helpers.
    #[error(transparent)]
    Util(#[from] V2UtilError),

    // --- controller failures ----------------------------------------------
    /// Error reported by the cpu controller.
    #[error(transparent)]
    CpuController(#[from] V2CpuControllerError),
    /// I/O error attributed to the cpuset controller (no implicit conversion).
    #[error(transparent)]
    CpuSetController(WrappedIoError),
    /// Error reported by the hugetlb controller.
    #[error(transparent)]
    HugeTlbController(#[from] V2HugeTlbControllerError),
    /// Error reported by the io controller.
    #[error(transparent)]
    IoController(#[from] V2IoControllerError),
    /// Error reported by the memory controller.
    #[error(transparent)]
    MemoryController(#[from] V2MemoryControllerError),
    /// I/O error attributed to the pids controller (no implicit conversion).
    #[error(transparent)]
    PidsController(WrappedIoError),
    /// Error reported by the unified pseudo controller.
    #[error(transparent)]
    UnifiedController(#[from] V2UnifiedError),
    /// Error reported by the freezer.
    #[error(transparent)]
    FreezerController(#[from] V2FreezerError),
    /// Error reported by the devices controller; only present when the
    /// `cgroupsv2_devices` feature is enabled.
    #[cfg(feature = "cgroupsv2_devices")]
    #[error(transparent)]
    DevicesController(#[from] super::devices::controller::DevicesControllerError),

    // --- statistics failures ----------------------------------------------
    /// Collecting cpu statistics failed.
    #[error(transparent)]
    CpuStats(#[from] V2CpuStatsError),
    /// Collecting hugetlb statistics failed.
    #[error(transparent)]
    HugeTlbStats(#[from] V2HugeTlbStatsError),
    /// Collecting pids statistics failed (no implicit conversion).
    #[error(transparent)]
    PidsStats(PidStatsError),
    /// Collecting memory statistics failed.
    #[error(transparent)]
    MemoryStats(#[from] V2MemoryStatsError),
    /// Collecting io statistics failed.
    #[error(transparent)]
    IoStats(#[from] V2IoStatsError),
}
73
/// Represents a management interface for a cgroup located at `{root_path}/{cgroup_path}`
///
/// This struct does not have ownership of the cgroup
pub struct Manager {
    /// Mount point of the cgroup v2 filesystem.
    root_path: PathBuf,
    /// Path of the managed cgroup, interpreted relative to `root_path`.
    cgroup_path: PathBuf,
    /// `root_path` joined with `cgroup_path`; computed once in `Manager::new`.
    full_path: PathBuf,
}
82
83impl Manager {
84    /// Constructs a new cgroup manager with root path being the mount point
85    /// of a cgroup v2 fs and cgroup path being a relative path from the root
86    pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result<Self, V2ManagerError> {
87        let full_path = root_path.join_safely(&cgroup_path)?;
88
89        Ok(Self {
90            root_path,
91            cgroup_path,
92            full_path,
93        })
94    }
95
96    /// Creates a unified cgroup at `self.full_path` and attaches a process to it
97    fn create_unified_cgroup(&self, pid: Pid) -> Result<(), V2ManagerError> {
98        let controllers: Vec<String> = util::get_available_controllers(&self.root_path)?
99            .iter()
100            .map(|c| format!("+{c}"))
101            .collect();
102
103        // Note: we intentionally do NOT write controllers to `self.root_path` here.
104        // In nested scenarios (running inside a container where the host's root
105        // cgroup is owned by host systemd), writing to the root's
106        // `cgroup.subtree_control` fails with EROFS because the file is on a
107        // read-only view from our PoV. Any ancestor up to root must already
108        // have the relevant controllers enabled — otherwise our process could
109        // not be executing inside that cgroup hierarchy in the first place. We
110        // only enable controllers on path components we ourselves create.
111
112        let mut current_path = self.root_path.clone();
113        let mut components = self
114            .cgroup_path
115            .components()
116            .filter(|c| c.ne(&RootDir))
117            .peekable();
118        while let Some(component) = components.next() {
119            current_path = current_path.join(component);
120            let we_created = if !current_path.exists() {
121                fs::create_dir(&current_path).wrap_create_dir(&current_path)?;
122                fs::metadata(&current_path)
123                    .wrap_other(&current_path)?
124                    .permissions()
125                    .set_mode(0o755);
126                true
127            } else {
128                false
129            };
130
131            // last component cannot have subtree_control enabled due to internal process constraint
132            // if this were set, writing to the cgroups.procs file will fail with Erno 16 (device or resource busy)
133            if components.peek().is_some() {
134                match Self::write_controllers(&current_path, &controllers) {
135                    Ok(()) => {}
136                    Err(e) if !we_created && Self::is_erofs(&e) => {
137                        // Pre-existing ancestor owned by a parent cgroup
138                        // manager (e.g. host systemd, the outer container's
139                        // runtime). Controllers are presumed already enabled —
140                        // otherwise we could not be running here. Skip.
141                        tracing::debug!(
142                            path = ?current_path,
143                            "skipping subtree_control write on pre-existing read-only ancestor",
144                        );
145                    }
146                    Err(e) => return Err(e.into()),
147                }
148            }
149        }
150
151        common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
152        Ok(())
153    }
154
155    /// Returns true if the wrapped IO error originates from an EROFS
156    /// (read-only file system) syscall failure.
157    fn is_erofs(err: &WrappedIoError) -> bool {
158        matches!(
159            err.inner().raw_os_error().map(Errno::from_raw),
160            Some(Errno::EROFS)
161        )
162    }
163
164    /// Writes a list of controllers to the `{path}/cgroup.subtree_control` file
165    fn write_controllers(path: &Path, controllers: &[String]) -> Result<(), WrappedIoError> {
166        for controller in controllers {
167            common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?;
168        }
169
170        Ok(())
171    }
172
    /// Consumes this manager and wraps it in the `V2` variant of [`AnyCgroupManager`].
    pub fn any(self) -> AnyCgroupManager {
        AnyCgroupManager::V2(self)
    }
176}
177
178impl CgroupManager for Manager {
179    type Error = V2ManagerError;
180
181    fn add_task(&self, pid: Pid) -> Result<(), Self::Error> {
182        if self.full_path.exists() {
183            common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
184            return Ok(());
185        }
186        self.create_unified_cgroup(pid)?;
187        Ok(())
188    }
189
190    fn apply(&self, controller_opt: &ControllerOpt) -> Result<(), Self::Error> {
191        for controller in CONTROLLER_TYPES {
192            match controller {
193                ControllerType::Cpu => Cpu::apply(controller_opt, &self.full_path)?,
194                ControllerType::CpuSet => CpuSet::apply(controller_opt, &self.full_path)?,
195                ControllerType::HugeTlb => HugeTlb::apply(controller_opt, &self.full_path)?,
196                ControllerType::Io => Io::apply(controller_opt, &self.full_path)?,
197                ControllerType::Memory => Memory::apply(controller_opt, &self.full_path)?,
198                ControllerType::Pids => Pids::apply(controller_opt, &self.full_path)?,
199            }
200        }
201
202        #[cfg(feature = "cgroupsv2_devices")]
203        Devices::apply(controller_opt, &self.full_path)?;
204
205        for pseudoctlr in PSEUDO_CONTROLLER_TYPES {
206            if let PseudoControllerType::Unified = pseudoctlr {
207                Unified::apply(
208                    controller_opt,
209                    &self.full_path,
210                    util::get_available_controllers(&self.root_path)?,
211                )?;
212            }
213        }
214
215        Ok(())
216    }
217
218    fn remove(&self) -> Result<(), Self::Error> {
219        if self.full_path.exists() {
220            tracing::debug!("remove cgroup {:?}", self.full_path);
221            let kill_file = self.full_path.join(CGROUP_KILL);
222            if kill_file.exists() {
223                fs::write(&kill_file, "1").wrap_write(&kill_file, "1")?;
224            } else {
225                let procs_path = self.full_path.join(CGROUP_PROCS);
226                let procs = fs::read_to_string(&procs_path).wrap_read(&procs_path)?;
227
228                for line in procs.lines() {
229                    let pid: i32 = line
230                        .parse()
231                        .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))
232                        .wrap_other(&procs_path)?;
233                    let _ = nix::sys::signal::kill(Pid::from_raw(pid), nix::sys::signal::SIGKILL);
234                }
235            }
236
237            common::delete_with_retry(&self.full_path, 4, Duration::from_millis(100))?;
238        }
239
240        Ok(())
241    }
242
243    fn freeze(&self, state: FreezerState) -> Result<(), Self::Error> {
244        let controller_opt = ControllerOpt {
245            resources: &Default::default(),
246            freezer_state: Some(state),
247            oom_score_adj: None,
248            disable_oom_killer: false,
249        };
250        Ok(Freezer::apply(&controller_opt, &self.full_path)?)
251    }
252
253    fn stats(&self) -> Result<Stats, Self::Error> {
254        let mut stats = Stats::default();
255
256        for subsystem in CONTROLLER_TYPES {
257            match subsystem {
258                ControllerType::Cpu => stats.cpu = Cpu::stats(&self.full_path)?,
259                ControllerType::HugeTlb => stats.hugetlb = HugeTlb::stats(&self.full_path)?,
260                ControllerType::Pids => {
261                    stats.pids = Pids::stats(&self.full_path).map_err(V2ManagerError::PidsStats)?
262                }
263                ControllerType::Memory => stats.memory = Memory::stats(&self.full_path)?,
264                ControllerType::Io => stats.blkio = Io::stats(&self.full_path)?,
265                _ => continue,
266            }
267        }
268
269        Ok(stats)
270    }
271
272    fn get_all_pids(&self) -> Result<Vec<Pid>, Self::Error> {
273        Ok(common::get_all_pids(&self.full_path)?)
274    }
275}
276
277#[cfg(test)]
278mod tests {
279    use std::fs;
280
281    use super::*;
282    use crate::test::set_fixture;
283    use crate::v2::util::CGROUP_CONTROLLERS;
284
285    /// `is_erofs` correctly identifies EROFS-wrapped IO errors and rejects
286    /// every other errno.
287    #[test]
288    fn is_erofs_recognises_erofs() {
289        let erofs = std::io::Error::from_raw_os_error(Errno::EROFS as i32);
290        let wrapped = WrappedIoError::Write {
291            err: erofs,
292            path: PathBuf::from("/some/cgroup/cgroup.subtree_control"),
293            data: "+cpu".into(),
294        };
295        assert!(Manager::is_erofs(&wrapped));
296
297        let eacces = std::io::Error::from_raw_os_error(Errno::EACCES as i32);
298        let wrapped = WrappedIoError::Write {
299            err: eacces,
300            path: PathBuf::from("/some/cgroup/cgroup.subtree_control"),
301            data: "+cpu".into(),
302        };
303        assert!(!Manager::is_erofs(&wrapped));
304
305        let ebusy = WrappedIoError::Open {
306            err: std::io::Error::from_raw_os_error(Errno::EBUSY as i32),
307            path: PathBuf::from("/some/cgroup/cgroup.subtree_control"),
308        };
309        assert!(!Manager::is_erofs(&ebusy));
310    }
311
312    /// End-to-end happy-path exercise of `create_unified_cgroup` against a
313    /// fully-writable fake cgroupfs in a tempdir. This guards against the
314    /// regression where removing the unconditional `write_controllers` on the
315    /// root path would have broken nested setups: the old code would have
316    /// required `root_path/cgroup.subtree_control` to exist and be writable,
317    /// while the new code intentionally skips that write.
318    ///
319    /// The fake layout is:
320    ///     <root>/cgroup.controllers             -> "cpu memory pids"
321    ///     <root>/parent/cgroup.subtree_control  -> "" (pre-existing)
322    ///     <root>/parent/leaf/                   -> created by create_unified_cgroup
323    ///     <root>/parent/leaf/cgroup.procs       -> pre-created so write succeeds
324    ///
325    /// Note we *do not* create `<root>/cgroup.subtree_control`; the old code
326    /// would have aborted with ENOENT trying to write it.
327    #[test]
328    fn create_unified_cgroup_skips_root_subtree_control_write() {
329        let tmp = tempfile::tempdir().expect("create temp dir");
330        let root = tmp.path();
331
332        // `get_available_controllers` reads this file.
333        set_fixture(root, CGROUP_CONTROLLERS, "cpu memory pids").expect("write cgroup.controllers");
334
335        // Pre-existing parent ancestor with a writable subtree_control file.
336        let parent = root.join("parent");
337        fs::create_dir(&parent).expect("create parent dir");
338        set_fixture(&parent, CGROUP_SUBTREE_CONTROL, "").expect("write parent subtree_control");
339
340        // We do *not* pre-create the leaf directory; create_unified_cgroup
341        // must mkdir it. However its `cgroup.procs` needs to exist for the
342        // final `write_cgroup_file` call to open it (create=false).
343        //
344        // Pre-creating the file before mkdir is impossible, so instead we
345        // wedge open by pre-creating the leaf dir + procs file (which
346        // means we exercise the `current_path.exists()` true branch for
347        // the leaf — that's fine because the leaf has no subtree_control
348        // write gate).
349        let leaf = parent.join("leaf");
350        fs::create_dir(&leaf).expect("create leaf dir");
351        set_fixture(&leaf, CGROUP_PROCS, "").expect("write leaf cgroup.procs");
352
353        let manager = Manager::new(root.to_path_buf(), PathBuf::from("/parent/leaf"))
354            .expect("construct manager");
355
356        // Pid 0 is fine for the test; we just need write_cgroup_file to
357        // round-trip the bytes into the file.
358        manager
359            .create_unified_cgroup(Pid::from_raw(0))
360            .expect("create_unified_cgroup succeeds when root subtree_control is absent");
361
362        // Sanity: the pid we wrote should be in the procs file.
363        let procs = fs::read_to_string(leaf.join(CGROUP_PROCS)).expect("read cgroup.procs");
364        assert_eq!(procs.trim(), "0");
365    }
366}