1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
use std::fs::{self};
use std::os::unix::fs::PermissionsExt;
use std::path::Component::RootDir;
use std::path::{Path, PathBuf};
use std::time::Duration;
use nix::errno::Errno;
use nix::unistd::Pid;
use super::controller::Controller;
use super::controller_type::{
CONTROLLER_TYPES, ControllerType, PSEUDO_CONTROLLER_TYPES, PseudoControllerType,
};
use super::cpu::{Cpu, V2CpuControllerError, V2CpuStatsError};
use super::cpuset::CpuSet;
#[cfg(feature = "cgroupsv2_devices")]
use super::devices::Devices;
use super::freezer::{Freezer, V2FreezerError};
use super::hugetlb::{HugeTlb, V2HugeTlbControllerError, V2HugeTlbStatsError};
use super::io::{Io, V2IoControllerError, V2IoStatsError};
use super::memory::{Memory, V2MemoryControllerError, V2MemoryStatsError};
use super::pids::Pids;
use super::unified::{Unified, V2UnifiedError};
use super::util::{self, CGROUP_SUBTREE_CONTROL, V2UtilError};
use crate::common::{
self, AnyCgroupManager, CGROUP_PROCS, CgroupManager, ControllerOpt, FreezerState,
JoinSafelyError, PathBufExt, WrapIoResult, WrappedIoError,
};
use crate::stats::{PidStatsError, Stats, StatsProvider};
/// File name of the cgroup v2 kill interface (`cgroup.kill`), relative to a
/// cgroup directory. `Manager::remove` writes `"1"` to this file when it
/// exists instead of signalling each process individually.
pub const CGROUP_KILL: &str = "cgroup.kill";
/// Errors returned by the cgroup v2 [`Manager`].
///
/// Variants marked `#[from]` are produced automatically by `?`. Variants
/// without `#[from]` (`CpuSetController`, `PidsController`, `PidsStats`) must
/// be constructed explicitly, e.g. via `map_err`.
#[derive(thiserror::Error, Debug)]
pub enum V2ManagerError {
    /// A wrapped I/O failure; this is the variant `?` picks for a bare
    /// `WrappedIoError`.
    #[error("io error: {0}")]
    WrappedIo(#[from] WrappedIoError),
    /// Joining the root path and the cgroup path was rejected.
    #[error("while joining paths: {0}")]
    JoinSafely(#[from] JoinSafelyError),
    /// Failure reported by the v2 utility helpers.
    #[error(transparent)]
    Util(#[from] V2UtilError),
    /// Failure reported by the cpu controller.
    #[error(transparent)]
    CpuController(#[from] V2CpuControllerError),
    /// I/O failure attributed to the cpuset controller (no `#[from]`;
    /// construct explicitly).
    #[error(transparent)]
    CpuSetController(WrappedIoError),
    /// Failure reported by the hugetlb controller.
    #[error(transparent)]
    HugeTlbController(#[from] V2HugeTlbControllerError),
    /// Failure reported by the io controller.
    #[error(transparent)]
    IoController(#[from] V2IoControllerError),
    /// Failure reported by the memory controller.
    #[error(transparent)]
    MemoryController(#[from] V2MemoryControllerError),
    /// I/O failure attributed to the pids controller (no `#[from]`;
    /// construct explicitly).
    #[error(transparent)]
    PidsController(WrappedIoError),
    /// Failure reported by the unified pseudo-controller.
    #[error(transparent)]
    UnifiedController(#[from] V2UnifiedError),
    /// Failure reported by the freezer.
    #[error(transparent)]
    FreezerController(#[from] V2FreezerError),
    /// Failure reported by the devices controller; only compiled with the
    /// `cgroupsv2_devices` feature.
    #[cfg(feature = "cgroupsv2_devices")]
    #[error(transparent)]
    DevicesController(#[from] super::devices::controller::DevicesControllerError),
    /// Failure while collecting cpu statistics.
    #[error(transparent)]
    CpuStats(#[from] V2CpuStatsError),
    /// Failure while collecting hugetlb statistics.
    #[error(transparent)]
    HugeTlbStats(#[from] V2HugeTlbStatsError),
    /// Failure while collecting pids statistics (no `#[from]`; construct
    /// explicitly).
    #[error(transparent)]
    PidsStats(PidStatsError),
    /// Failure while collecting memory statistics.
    #[error(transparent)]
    MemoryStats(#[from] V2MemoryStatsError),
    /// Failure while collecting io statistics.
    #[error(transparent)]
    IoStats(#[from] V2IoStatsError),
}
/// Represents a management interface for a cgroup located at `{root_path}/{cgroup_path}`
///
/// This struct does not have ownership of the cgroup
pub struct Manager {
    /// Mount point of the cgroup v2 filesystem.
    root_path: PathBuf,
    /// Path of the managed cgroup, relative to `root_path`.
    cgroup_path: PathBuf,
    /// `root_path` safely joined with `cgroup_path`; computed once in `new`.
    full_path: PathBuf,
}
impl Manager {
    /// Constructs a new cgroup manager with root path being the mount point
    /// of a cgroup v2 fs and cgroup path being a relative path from the root
    ///
    /// # Errors
    ///
    /// Fails if `join_safely` rejects combining `root_path` with
    /// `cgroup_path`.
    pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result<Self, V2ManagerError> {
        let full_path = root_path.join_safely(&cgroup_path)?;
        Ok(Self {
            root_path,
            cgroup_path,
            full_path,
        })
    }

    /// Creates a unified cgroup at `self.full_path` and attaches a process to it
    ///
    /// Walks `cgroup_path` one component at a time below `root_path`. Missing
    /// directories are created with mode `0o755`. Every component except the
    /// last gets all available controllers written to its
    /// `cgroup.subtree_control`. Finally `pid` is written to the leaf's
    /// `cgroup.procs`.
    ///
    /// # Errors
    ///
    /// Returns an error if the available controllers cannot be read, a
    /// directory cannot be created or chmod-ed, a `cgroup.subtree_control`
    /// write fails for a reason other than the tolerated `EROFS`/`EACCES` on
    /// a pre-existing ancestor, or the final `cgroup.procs` write fails.
    fn create_unified_cgroup(&self, pid: Pid) -> Result<(), V2ManagerError> {
        let controllers: Vec<String> = util::get_available_controllers(&self.root_path)?
            .iter()
            .map(|c| format!("+{c}"))
            .collect();

        // Note: we intentionally do NOT write controllers to `self.root_path` here.
        // In nested scenarios (running inside a container where the host's root
        // cgroup is owned by host systemd), writing to the root's
        // `cgroup.subtree_control` fails with EROFS because the file is on a
        // read-only view from our PoV. Any ancestor up to root must already
        // have the relevant controllers enabled — otherwise our process could
        // not be executing inside that cgroup hierarchy in the first place. We
        // only enable controllers on path components we ourselves create.
        let mut current_path = self.root_path.clone();
        let mut components = self
            .cgroup_path
            .components()
            .filter(|c| c.ne(&RootDir))
            .peekable();

        // `while let` rather than `for`: the body peeks at the iterator to
        // detect the last component.
        while let Some(component) = components.next() {
            current_path.push(component);

            let we_created = if current_path.exists() {
                false
            } else {
                fs::create_dir(&current_path).wrap_create_dir(&current_path)?;
                // `Permissions::set_mode` only mutates an in-memory value, so
                // the mode has to be written back with `set_permissions` for
                // the directory to actually become 0o755.
                fs::set_permissions(&current_path, fs::Permissions::from_mode(0o755))
                    .wrap_other(&current_path)?;
                true
            };

            // last component cannot have subtree_control enabled due to internal process constraint
            // if this were set, writing to the cgroup.procs file will fail with Errno 16 (device or resource busy)
            if components.peek().is_some() {
                match Self::write_controllers(&current_path, &controllers) {
                    Ok(()) => {}
                    Err(e) if !we_created && Self::is_inherited_ancestor_unwritable(&e) => {
                        // Pre-existing ancestor owned by a parent cgroup
                        // manager (e.g. host systemd, the outer container's
                        // runtime). The write can fail in two distinct ways
                        // depending on how that ancestor is exposed:
                        //   * EROFS — nested in a container whose cgroupfs
                        //     view is read-only (cgroupns=private, the host's
                        //     root cgroup is owned by host systemd).
                        //   * EACCES — running as a regular user under
                        //     systemd, where the ancestor cgroup directory
                        //     (e.g. /sys/fs/cgroup/user.slice) is root-owned
                        //     and its subtree_control file is mode 0644.
                        // In both cases the ancestor predates this process; if
                        // its subtree_control genuinely needed updating we
                        // could not do it anyway, and our PARENT cgroup would
                        // already have failed if controllers were truly
                        // missing — because we could not be running in this
                        // hierarchy in the first place. Skip silently.
                        tracing::debug!(
                            path = ?current_path,
                            "skipping subtree_control write on pre-existing unwritable ancestor",
                        );
                    }
                    Err(e) => return Err(e.into()),
                }
            }
        }

        common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
        Ok(())
    }

    /// Returns true if the wrapped IO error indicates the target cgroup file
    /// is owned by a parent cgroup manager that this process cannot modify.
    ///
    /// Two distinct errnos express this condition:
    ///   * `EROFS` — the file lives on a read-only view of cgroupfs (typical
    ///     of `cgroupns=private` containers whose root cgroup is owned by the
    ///     host).
    ///   * `EACCES` — the file is writable in principle but its DAC owner is
    ///     someone else (typical of rootless invocations under systemd where
    ///     ancestor slices like `user.slice` are root-owned mode 0644).
    ///
    /// Callers use this to swallow `cgroup.subtree_control` write failures on
    /// ancestor cgroup directories that pre-existed our process — controllers
    /// in those ancestors are already enabled by whatever manager owns them,
    /// otherwise we could not be running inside this hierarchy at all.
    ///
    /// Errors that carry no raw OS error code yield `false`.
    fn is_inherited_ancestor_unwritable(err: &WrappedIoError) -> bool {
        matches!(
            err.inner().raw_os_error().map(Errno::from_raw),
            Some(Errno::EROFS) | Some(Errno::EACCES)
        )
    }

    /// Writes a list of controllers to the `{path}/cgroup.subtree_control` file
    ///
    /// Each entry is written with its own call, in slice order; the first
    /// failing write aborts the loop and is returned.
    fn write_controllers(path: &Path, controllers: &[String]) -> Result<(), WrappedIoError> {
        for controller in controllers {
            common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?;
        }
        Ok(())
    }

    /// Wraps this manager in the [`AnyCgroupManager::V2`] variant.
    pub fn any(self) -> AnyCgroupManager {
        AnyCgroupManager::V2(self)
    }
}
impl CgroupManager for Manager {
type Error = V2ManagerError;
fn add_task(&self, pid: Pid) -> Result<(), Self::Error> {
if self.full_path.exists() {
common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid)?;
return Ok(());
}
self.create_unified_cgroup(pid)?;
Ok(())
}
fn apply(&self, controller_opt: &ControllerOpt) -> Result<(), Self::Error> {
for controller in CONTROLLER_TYPES {
match controller {
ControllerType::Cpu => Cpu::apply(controller_opt, &self.full_path)?,
ControllerType::CpuSet => CpuSet::apply(controller_opt, &self.full_path)?,
ControllerType::HugeTlb => HugeTlb::apply(controller_opt, &self.full_path)?,
ControllerType::Io => Io::apply(controller_opt, &self.full_path)?,
ControllerType::Memory => Memory::apply(controller_opt, &self.full_path)?,
ControllerType::Pids => Pids::apply(controller_opt, &self.full_path)?,
}
}
#[cfg(feature = "cgroupsv2_devices")]
Devices::apply(controller_opt, &self.full_path)?;
for pseudoctlr in PSEUDO_CONTROLLER_TYPES {
if let PseudoControllerType::Unified = pseudoctlr {
Unified::apply(
controller_opt,
&self.full_path,
util::get_available_controllers(&self.root_path)?,
)?;
}
}
Ok(())
}
fn remove(&self) -> Result<(), Self::Error> {
if self.full_path.exists() {
tracing::debug!("remove cgroup {:?}", self.full_path);
let kill_file = self.full_path.join(CGROUP_KILL);
if kill_file.exists() {
fs::write(&kill_file, "1").wrap_write(&kill_file, "1")?;
} else {
let procs_path = self.full_path.join(CGROUP_PROCS);
let procs = fs::read_to_string(&procs_path).wrap_read(&procs_path)?;
for line in procs.lines() {
let pid: i32 = line
.parse()
.map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))
.wrap_other(&procs_path)?;
let _ = nix::sys::signal::kill(Pid::from_raw(pid), nix::sys::signal::SIGKILL);
}
}
common::delete_with_retry(&self.full_path, 4, Duration::from_millis(100))?;
}
Ok(())
}
fn freeze(&self, state: FreezerState) -> Result<(), Self::Error> {
let controller_opt = ControllerOpt {
resources: &Default::default(),
freezer_state: Some(state),
oom_score_adj: None,
disable_oom_killer: false,
};
Ok(Freezer::apply(&controller_opt, &self.full_path)?)
}
fn stats(&self) -> Result<Stats, Self::Error> {
let mut stats = Stats::default();
for subsystem in CONTROLLER_TYPES {
match subsystem {
ControllerType::Cpu => stats.cpu = Cpu::stats(&self.full_path)?,
ControllerType::HugeTlb => stats.hugetlb = HugeTlb::stats(&self.full_path)?,
ControllerType::Pids => {
stats.pids = Pids::stats(&self.full_path).map_err(V2ManagerError::PidsStats)?
}
ControllerType::Memory => stats.memory = Memory::stats(&self.full_path)?,
ControllerType::Io => stats.blkio = Io::stats(&self.full_path)?,
_ => continue,
}
}
Ok(stats)
}
fn get_all_pids(&self) -> Result<Vec<Pid>, Self::Error> {
Ok(common::get_all_pids(&self.full_path)?)
}
}
#[cfg(test)]
mod tests {
    use std::fs;

    use super::*;
    use crate::test::set_fixture;
    use crate::v2::util::CGROUP_CONTROLLERS;

    /// Only `EROFS` (read-only cgroupfs view in a nested container) and
    /// `EACCES` (root-owned ancestor slice in a rootless setup) may be
    /// classified as "inherited ancestor unwritable". Any other errno must be
    /// rejected so genuine write failures are never swallowed.
    #[test]
    fn is_inherited_ancestor_unwritable_matches_both_errnos() {
        // Builds a `Write` error carrying the given errno.
        let write_failure = |errno: Errno| WrappedIoError::Write {
            err: std::io::Error::from_raw_os_error(errno as i32),
            path: PathBuf::from("/some/cgroup/cgroup.subtree_control"),
            data: "+cpu".into(),
        };

        // Tolerated errnos.
        assert!(Manager::is_inherited_ancestor_unwritable(&write_failure(
            Errno::EROFS
        )));
        assert!(Manager::is_inherited_ancestor_unwritable(&write_failure(
            Errno::EACCES
        )));

        // Anything else is a real failure.
        assert!(!Manager::is_inherited_ancestor_unwritable(&write_failure(
            Errno::ENOENT
        )));

        let busy_on_open = WrappedIoError::Open {
            err: std::io::Error::from_raw_os_error(Errno::EBUSY as i32),
            path: PathBuf::from("/some/cgroup/cgroup.subtree_control"),
        };
        assert!(!Manager::is_inherited_ancestor_unwritable(&busy_on_open));
    }

    /// Happy-path run of `create_unified_cgroup` against a fully writable
    /// fake cgroupfs living in a tempdir.
    ///
    /// Regression guard: `<root>/cgroup.subtree_control` is deliberately NOT
    /// created. An implementation that unconditionally wrote controllers to
    /// the root would abort with ENOENT; the current one must not touch it.
    ///
    /// Fixture layout:
    ///   <root>/cgroup.controllers            -> "cpu memory pids"
    ///   <root>/parent/cgroup.subtree_control -> "" (pre-existing)
    ///   <root>/parent/leaf/                  -> pre-created
    ///   <root>/parent/leaf/cgroup.procs      -> pre-created so the write succeeds
    #[test]
    fn create_unified_cgroup_skips_root_subtree_control_write() {
        let fake_cgroupfs = tempfile::tempdir().expect("create temp dir");
        let root = fake_cgroupfs.path();

        // Read by `get_available_controllers`.
        set_fixture(root, CGROUP_CONTROLLERS, "cpu memory pids").expect("write cgroup.controllers");

        // An ancestor that already exists and has a writable subtree_control.
        let parent_dir = root.join("parent");
        fs::create_dir(&parent_dir).expect("create parent dir");
        set_fixture(&parent_dir, CGROUP_SUBTREE_CONTROL, "")
            .expect("write parent subtree_control");

        // The final `write_cgroup_file` call needs `cgroup.procs` to already
        // exist, and a file cannot be placed inside a directory that has not
        // been created yet. So the leaf directory and its procs file are
        // pre-created, which routes the leaf through the "already exists"
        // branch — acceptable, because the leaf never gets a subtree_control
        // write anyway.
        let leaf_dir = parent_dir.join("leaf");
        fs::create_dir(&leaf_dir).expect("create leaf dir");
        set_fixture(&leaf_dir, CGROUP_PROCS, "").expect("write leaf cgroup.procs");

        let manager = Manager::new(root.to_path_buf(), PathBuf::from("/parent/leaf"))
            .expect("construct manager");

        // Pid 0 is enough here: the test only checks that the bytes land in
        // the procs file.
        manager
            .create_unified_cgroup(Pid::from_raw(0))
            .expect("create_unified_cgroup succeeds when root subtree_control is absent");

        // The pid must have been written to the leaf's procs file.
        let written = fs::read_to_string(leaf_dir.join(CGROUP_PROCS)).expect("read cgroup.procs");
        assert_eq!(written.trim(), "0");
    }
}