use crate::topology::TestTopology;
use anyhow::{Context, Result, anyhow, bail};
use std::collections::BTreeSet;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::mpsc;
use std::time::Duration;
/// Cgroup controllers this module knows how to name.
///
/// Variant order matters: the derived `Ord` follows declaration order, which
/// is also the order in which a `BTreeSet<Controller>` is iterated.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Controller {
    Cpuset,
    Cpu,
    Memory,
    Pids,
    Io,
}

impl Controller {
    /// Returns the lowercase controller token (for example `"cpuset"`).
    ///
    /// This is the spelling used when building `+<name>` entries and when
    /// matching against the contents of `cgroup.controllers`.
    pub fn name(self) -> &'static str {
        match self {
            Self::Cpuset => "cpuset",
            Self::Cpu => "cpu",
            Self::Memory => "memory",
            Self::Pids => "pids",
            Self::Io => "io",
        }
    }
}
/// How long a single cgroupfs write may block before `write_with_timeout`
/// gives up (2 seconds). The callers in this file pass it as the `timeout`
/// argument of every `write_with_timeout` call.
const CGROUP_WRITE_TIMEOUT: Duration = Duration::from_secs(2);
fn write_with_timeout(path: &Path, data: &str, timeout: Duration) -> Result<()> {
let display = path.display().to_string();
let path = path.to_owned();
let data = data.to_owned();
let (tx, rx) = mpsc::channel();
std::thread::spawn(move || {
let result = fs::write(&path, &data);
let _ = tx.send(result);
});
match rx.recv_timeout(timeout) {
Ok(Ok(())) => Ok(()),
Ok(Err(e)) => {
let errno_suffix = e
.raw_os_error()
.and_then(crate::errno_name)
.map(|name| format!(" ({name})"))
.unwrap_or_default();
Err(e).with_context(|| format!("write {display}{errno_suffix}"))
}
Err(_) => bail!(
"cgroup write to {display} timed out after {}ms",
timeout.as_millis()
),
}
}
/// Rejects cgroup names that are unsafe to join onto the managed parent.
///
/// A name is refused when it is empty, starts with `/`, contains a NUL byte,
/// or has any `/`-separated component that is empty, `..`, `.`, or begins
/// with a dot. Each refusal carries a message explaining the reason.
fn validate_cgroup_name(name: &str) -> Result<()> {
    if name.is_empty() {
        bail!("cgroup name must not be empty");
    }
    if name.starts_with('/') {
        bail!(
            "cgroup name '{name}' starts with '/' — would escape the \
             managed parent via Path::join (absolute paths replace the \
             join base)"
        );
    }
    if name.contains('\0') {
        bail!("cgroup name '{name}' contains a NUL byte");
    }
    for component in name.split('/') {
        // Arm order mirrors the precedence of the checks: the exact matches
        // come before the generic leading-dot rule.
        match component {
            "" => {
                bail!(
                    "cgroup name '{name}' contains an empty path component \
                     (consecutive '/') — Path::join would emit a malformed path"
                );
            }
            ".." => {
                bail!(
                    "cgroup name '{name}' contains a '..' component — \
                     would escape the managed parent via Path::join"
                );
            }
            "." => {
                bail!(
                    "cgroup name '{name}' contains a '.' component — \
                     ambiguous self-reference, refuse before fs writes"
                );
            }
            hidden if hidden.starts_with('.') => {
                bail!(
                    "cgroup name '{name}' contains a leading-dot component \
                     ('{component}') — produces a hidden cgroupfs entry"
                );
            }
            _ => {}
        }
    }
    Ok(())
}
/// Returns the raw OS errno of the first `std::io::Error` in `err`'s cause
/// chain.
///
/// Yields `None` when the chain holds no `io::Error`, or when the first one
/// found carries no OS error code (later causes are not consulted).
pub(crate) fn anyhow_first_io_errno(err: &anyhow::Error) -> Option<i32> {
    for cause in err.chain() {
        if let Some(io) = cause.downcast_ref::<std::io::Error>() {
            return io.raw_os_error();
        }
    }
    None
}
/// True when the first I/O error in `err`'s chain has errno `ESRCH`.
fn is_esrch(err: &anyhow::Error) -> bool {
    matches!(anyhow_first_io_errno(err), Some(code) if code == libc::ESRCH)
}
/// True when the first I/O error in `err`'s chain has errno `EBUSY`.
fn is_ebusy(err: &anyhow::Error) -> bool {
    matches!(anyhow_first_io_errno(err), Some(code) if code == libc::EBUSY)
}
/// Builds a one-line diagnostic snapshot of cpuset-related cgroupfs state for
/// the child `name` under `parent`.
///
/// Every probe is best-effort: a file that cannot be read is represented by a
/// `<read failed: …>` label and an unreadable directory by
/// `<read_dir failed: …>`, so this function itself never fails.
fn capture_cpuset_state(parent: &Path, name: &str) -> String {
    let child = parent.join(name);
    let parent_controllers = read_or_label(&parent.join("cgroup.controllers"));
    let parent_subtree_control = read_or_label(&parent.join("cgroup.subtree_control"));
    let child_controllers = read_or_label(&child.join("cgroup.controllers"));
    let cpuset_cpus_exists = child.join("cpuset.cpus").exists();
    let child_listing = fs::read_dir(&child)
        .map(|entries| {
            // Entries that fail to read are dropped; the rest are sorted so
            // the listing is stable between runs.
            let mut names = Vec::new();
            for entry in entries.flatten() {
                names.push(entry.file_name().to_string_lossy().into_owned());
            }
            names.sort_unstable();
            format!("[{}]", names.join(", "))
        })
        .unwrap_or_else(|e| format!("<read_dir failed: {e}>"));
    format!(
        "cgroup-state-snapshot: \
         parent={} name={} \
         parent.cgroup.controllers={:?} \
         parent.cgroup.subtree_control={:?} \
         child.cgroup.controllers={:?} \
         child.cpuset.cpus.exists={} \
         child.listing={}",
        parent.display(),
        name,
        parent_controllers,
        parent_subtree_control,
        child_controllers,
        cpuset_cpus_exists,
        child_listing,
    )
}
/// Reads `path` as UTF-8 text and returns it with surrounding whitespace
/// trimmed; on failure returns the placeholder `<read failed: {error}>`.
fn read_or_label(path: &Path) -> String {
    fs::read_to_string(path)
        .map(|contents| contents.trim().to_owned())
        .unwrap_or_else(|e| format!("<read failed: {e}>"))
}
/// Threshold for `CgroupManager`'s failed-removal counter: `remove_cgroup`
/// refuses to run once the counter is strictly greater than this value.
const MAX_OUTSTANDING_REMOVES: usize = 10;
/// Manages child cgroups beneath a single parent cgroupfs directory.
#[derive(Debug)]
pub struct CgroupManager {
// Directory that child cgroup names are joined onto.
parent: PathBuf,
// Bumped by one each time `remove_cgroup` fails and decremented (saturating
// at zero) each time it succeeds; compared against MAX_OUTSTANDING_REMOVES.
outstanding_removes: AtomicUsize,
}
impl CgroupManager {
/// Creates a manager for the cgroup directory `parent`.
///
/// Only the path is recorded and the failed-removal counter starts at zero;
/// nothing is touched on disk here.
pub fn new(parent: &str) -> Self {
    let parent = PathBuf::from(parent);
    let outstanding_removes = AtomicUsize::new(0);
    Self {
        parent,
        outstanding_removes,
    }
}
/// Returns the directory under which this manager places child cgroups.
pub fn parent_path(&self) -> &std::path::Path {
    self.parent.as_path()
}
/// Returns the current value of the failed-removal counter (relaxed load).
pub fn outstanding_removes(&self) -> usize {
    let counter = &self.outstanding_removes;
    counter.load(Ordering::Relaxed)
}
pub fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()> {
self.setup_under_root(controllers, &PathBuf::from("/sys/fs/cgroup"))
}
fn setup_under_root(&self, controllers: &BTreeSet<Controller>, root: &Path) -> Result<()> {
if !self.parent.exists() {
fs::create_dir_all(&self.parent)
.with_context(|| format!("mkdir {}", self.parent.display()))?;
}
if controllers.is_empty() {
return Ok(());
}
if let Ok(rel) = self.parent.strip_prefix(root) {
let available_path = root.join("cgroup.controllers");
if available_path.exists() {
let raw = fs::read_to_string(&available_path).with_context(|| {
format!("read cgroup.controllers: {}", available_path.display())
})?;
let available: BTreeSet<&str> = raw.split_whitespace().collect();
for c in controllers {
if !available.contains(c.name()) {
return Err(anyhow!(
"cgroup controller '{}' not available at {}; \
cgroup.controllers reports {:?}. CONFIG_{}_CONTROLLER \
may be unset, or the controller is masked at this \
level of the hierarchy",
c.name(),
available_path.display(),
available,
c.name().to_uppercase(),
));
}
}
}
let line: String = controllers
.iter()
.map(|c| format!("+{}", c.name()))
.collect::<Vec<_>>()
.join(" ");
let mut cur = root.to_path_buf();
for c in rel.components() {
let sc = cur.join("cgroup.subtree_control");
if sc.exists() {
write_with_timeout(&sc, &line, CGROUP_WRITE_TIMEOUT).with_context(|| {
format!("enable controllers '{line}' at {}", sc.display())
})?;
}
cur = cur.join(c);
}
let sc = self.parent.join("cgroup.subtree_control");
if sc.exists() {
write_with_timeout(&sc, &line, CGROUP_WRITE_TIMEOUT)
.with_context(|| format!("enable controllers '{line}' at {}", sc.display()))?;
}
}
Ok(())
}
/// Creates the child cgroup `name` (including intermediate directories) when
/// it does not exist yet, then makes a best-effort attempt to enable the
/// cpuset controller on the ancestors of a nested name.
///
/// # Errors
///
/// An invalid `name`, or the directory creation failing. Failures while
/// enabling cpuset are only logged by `enable_subtree_cpuset`.
pub fn create_cgroup(&self, name: &str) -> Result<()> {
    validate_cgroup_name(name)?;
    let dir = self.parent.join(name);
    let already_present = dir.exists();
    if !already_present {
        fs::create_dir_all(&dir).with_context(|| format!("mkdir {}", dir.display()))?;
    }
    self.enable_subtree_cpuset(name);
    Ok(())
}
/// Writes `+<controller>` to the parent's `cgroup.subtree_control`.
///
/// Succeeds without doing anything when that file does not exist.
pub fn add_parent_subtree_controller(&self, controller: &str) -> Result<()> {
    let control = self.parent.join("cgroup.subtree_control");
    if control.exists() {
        let request = format!("+{controller}");
        write_with_timeout(&control, &request, CGROUP_WRITE_TIMEOUT)
    } else {
        Ok(())
    }
}
/// Removes the child cgroup `name`, keeping count of removals that failed.
///
/// The call is refused up front once the failed-removal counter is strictly
/// greater than `MAX_OUTSTANDING_REMOVES`. A cgroup whose directory does not
/// exist is treated as already removed and leaves the counter untouched.
/// Otherwise the counter is decremented (never below zero) on success and
/// incremented on failure.
///
/// # Errors
///
/// An invalid `name`, the cap being exceeded, or `remove_cgroup_inner`
/// failing.
pub fn remove_cgroup(&self, name: &str) -> Result<()> {
    validate_cgroup_name(name)?;
    let outstanding = self.outstanding_removes.load(Ordering::Relaxed);
    if outstanding > MAX_OUTSTANDING_REMOVES {
        bail!(
            "remove_cgroup '{name}' refused: {outstanding} cgroups outstanding \
             (cap {MAX_OUTSTANDING_REMOVES}); cgroup.procs draining wedged or \
             churn loop outpacing the kernel's RCU grace period — bailing to \
             avoid unbounded cgroupfs accumulation"
        );
    }
    let p = self.parent.join(name);
    if !p.exists() {
        return Ok(());
    }
    let outcome = self.remove_cgroup_inner(name, &p);
    if outcome.is_ok() {
        // The closure always returns `Some`, so `fetch_update` cannot fail;
        // `saturating_sub` keeps the counter from wrapping below zero.
        let _ = self
            .outstanding_removes
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| {
                Some(n.saturating_sub(1))
            });
    } else {
        self.outstanding_removes.fetch_add(1, Ordering::Relaxed);
    }
    outcome
}
fn remove_cgroup_inner(&self, name: &str, p: &Path) -> Result<()> {
if p.join("cgroup.freeze").exists()
&& let Err(err) = self.set_freeze(name, false)
&& anyhow_first_io_errno(&err) != Some(libc::ENOENT)
{
tracing::warn!(
cgroup = name,
err = %format!("{err:#}"),
"remove_cgroup: pre-drain unfreeze failed; drain may strand frozen tasks at root"
);
}
self.drain_tasks(name)?;
wait_for_cgroup_unpopulated(p, std::time::Duration::from_secs(1));
fs::remove_dir(p).with_context(|| format!("rmdir {}", p.display()))
}
/// Writes `cpus`, formatted by `TestTopology::cpuset_string`, to the
/// `cpuset.cpus` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write. On a failed write the error is
/// wrapped with a state snapshot from `capture_cpuset_state`, which is only
/// collected on that failure path.
pub fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()> {
    validate_cgroup_name(name)?;
    let cpus_file = self.parent.join(name).join("cpuset.cpus");
    let cpus_text = TestTopology::cpuset_string(cpus);
    write_with_timeout(&cpus_file, &cpus_text, CGROUP_WRITE_TIMEOUT)
        .map_err(|e| e.context(capture_cpuset_state(&self.parent, name)))
}
fn enable_subtree_cpuset(&self, name: &str) {
let components: Vec<&str> = name.split('/').collect();
if components.len() < 2 {
return;
}
let mut cur = self.parent.clone();
for c in &components[..components.len() - 1] {
let sc = cur.join("cgroup.subtree_control");
if sc.exists()
&& let Err(e) = write_with_timeout(&sc, "+cpuset", CGROUP_WRITE_TIMEOUT)
{
tracing::warn!(path = %sc.display(), err = %e, "failed to enable cpuset");
}
cur = cur.join(c);
}
let sc = cur.join("cgroup.subtree_control");
if sc.exists()
&& let Err(e) = write_with_timeout(&sc, "+cpuset", CGROUP_WRITE_TIMEOUT)
{
tracing::warn!(path = %sc.display(), err = %e, "failed to enable cpuset");
}
}
/// Writes an empty string to the `cpuset.cpus` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn clear_cpuset(&self, name: &str) -> Result<()> {
    validate_cgroup_name(name)?;
    let cpus_file = self.parent.join(name).join("cpuset.cpus");
    let outcome = write_with_timeout(&cpus_file, "", CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!("cgroup '{name}': clear cpuset.cpus (write empty string for inherit-parent)")
    })
}
/// Writes `nodes`, formatted by `TestTopology::cpuset_string`, to the
/// `cpuset.mems` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()> {
    validate_cgroup_name(name)?;
    let nodes_str = TestTopology::cpuset_string(nodes);
    let mems_file = self.parent.join(name).join("cpuset.mems");
    let outcome = write_with_timeout(&mems_file, &nodes_str, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set cpuset.mems='{nodes_str}' (requires +cpuset in parent cgroup.subtree_control)"
        )
    })
}
/// Writes an empty string to the `cpuset.mems` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn clear_cpuset_mems(&self, name: &str) -> Result<()> {
    validate_cgroup_name(name)?;
    let mems_file = self.parent.join(name).join("cpuset.mems");
    let outcome = write_with_timeout(&mems_file, "", CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!("cgroup '{name}': clear cpuset.mems (write empty string for inherit-parent)")
    })
}
/// Writes `<quota> <period_us>` to the `cpu.max` file of cgroup `name`,
/// where `<quota>` is the number in `quota_us` or the literal `max` when
/// `quota_us` is `None`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()> {
    validate_cgroup_name(name)?;
    let quota = quota_us.map_or_else(|| "max".to_string(), |q| q.to_string());
    let line = format!("{quota} {period_us}");
    let cpu_max_file = self.parent.join(name).join("cpu.max");
    let outcome = write_with_timeout(&cpu_max_file, &line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set cpu.max='{line}' (requires +cpu in parent cgroup.subtree_control)"
        )
    })
}
/// Writes `weight` in decimal to the `cpu.weight` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()> {
    validate_cgroup_name(name)?;
    let weight_file = self.parent.join(name).join("cpu.weight");
    let value = weight.to_string();
    let outcome = write_with_timeout(&weight_file, &value, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set cpu.weight={weight} (requires +cpu in parent cgroup.subtree_control)"
        )
    })
}
/// Writes the byte count, or the literal `max` when `bytes` is `None`, to
/// the `memory.max` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
    validate_cgroup_name(name)?;
    let limit_file = self.parent.join(name).join("memory.max");
    let line = bytes.map_or_else(|| "max".to_string(), |b| b.to_string());
    let outcome = write_with_timeout(&limit_file, &line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set memory.max='{line}' (requires +memory in parent cgroup.subtree_control)"
        )
    })
}
/// Writes the byte count, or the literal `max` when `bytes` is `None`, to
/// the `memory.high` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()> {
    validate_cgroup_name(name)?;
    let limit_file = self.parent.join(name).join("memory.high");
    let line = bytes.map_or_else(|| "max".to_string(), |b| b.to_string());
    let outcome = write_with_timeout(&limit_file, &line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set memory.high='{line}' (requires +memory in parent cgroup.subtree_control)"
        )
    })
}
/// Writes the byte count, or `0` when `bytes` is `None`, to the `memory.low`
/// file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()> {
    validate_cgroup_name(name)?;
    let low_file = self.parent.join(name).join("memory.low");
    // Unlike the `max`/`high` setters, "unset" here is spelled as zero.
    let line = bytes.map_or_else(|| "0".to_string(), |b| b.to_string());
    let outcome = write_with_timeout(&low_file, &line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set memory.low='{line}' (requires +memory in parent cgroup.subtree_control)"
        )
    })
}
/// Writes `weight` in decimal to the `io.weight` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_io_weight(&self, name: &str, weight: u16) -> Result<()> {
    validate_cgroup_name(name)?;
    let weight_file = self.parent.join(name).join("io.weight");
    let value = weight.to_string();
    let outcome = write_with_timeout(&weight_file, &value, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set io.weight={weight} (requires +io in parent cgroup.subtree_control)"
        )
    })
}
/// Writes `1` (when `frozen`) or `0` to the `cgroup.freeze` file of cgroup
/// `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_freeze(&self, name: &str, frozen: bool) -> Result<()> {
    validate_cgroup_name(name)?;
    let freeze_file = self.parent.join(name).join("cgroup.freeze");
    let line = match frozen {
        true => "1",
        false => "0",
    };
    let outcome = write_with_timeout(&freeze_file, line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!("cgroup '{name}': set cgroup.freeze='{line}' (cgroup-core file, no controller required)")
    })
}
/// Writes the limit, or the literal `max` when `max` is `None`, to the
/// `pids.max` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()> {
    validate_cgroup_name(name)?;
    let limit_file = self.parent.join(name).join("pids.max");
    let line = max.map_or_else(|| "max".to_string(), |n| n.to_string());
    let outcome = write_with_timeout(&limit_file, &line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set pids.max='{line}' (requires +pids in parent cgroup.subtree_control)"
        )
    })
}
/// Writes the byte count, or the literal `max` when `bytes` is `None`, to
/// the `memory.swap.max` file of cgroup `name`.
///
/// # Errors
///
/// An invalid `name` or a failed write.
pub fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
    validate_cgroup_name(name)?;
    let limit_file = self.parent.join(name).join("memory.swap.max");
    let line = bytes.map_or_else(|| "max".to_string(), |b| b.to_string());
    let outcome = write_with_timeout(&limit_file, &line, CGROUP_WRITE_TIMEOUT);
    outcome.with_context(|| {
        format!(
            "cgroup '{name}': set memory.swap.max='{line}' (requires +memory in parent cgroup.subtree_control; file absent on CONFIG_SWAP=n kernels)"
        )
    })
}
/// Moves process `pid` into cgroup `name` by writing the pid to that
/// cgroup's `cgroup.procs`.
///
/// The write is attempted only after `check_cpuset_ordering` accepts the
/// destination.
///
/// # Errors
///
/// An invalid `name`, a refused cpuset-ordering check, or a failed write.
pub fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()> {
    validate_cgroup_name(name)?;
    self.check_cpuset_ordering(name)?;
    let procs_file = self.parent.join(name).join("cgroup.procs");
    let pid_text = pid.to_string();
    write_with_timeout(&procs_file, &pid_text, CGROUP_WRITE_TIMEOUT)
}
/// Refuses migration into a cgroup that has `cpuset.cpus` set while
/// `cpuset.mems.effective` reads empty.
///
/// The check passes (returns `Ok`) whenever either file cannot be read, when
/// `cpuset.cpus` is blank, or when `cpuset.mems.effective` is non-blank.
fn check_cpuset_ordering(&self, name: &str) -> Result<()> {
    let dir = self.parent.join(name);
    // Unreadable files are treated as "nothing to check".
    let Ok(cpus) = fs::read_to_string(dir.join("cpuset.cpus")) else {
        return Ok(());
    };
    let cpus = cpus.trim();
    if cpus.is_empty() {
        return Ok(());
    }
    let Ok(mems_effective) = fs::read_to_string(dir.join("cpuset.mems.effective")) else {
        return Ok(());
    };
    if !mems_effective.trim().is_empty() {
        return Ok(());
    }
    bail!(
        "move_task into '{name}' refused: cpuset.cpus is set ({}) \
         but cpuset.mems.effective reads empty — half-configured \
         cgroup. The kernel's behavior here is path-dependent \
         (guarantee_online_mems walks up to find a non-empty \
         ancestor mask; the empty-nodemask OOM path is reachable \
         only in degenerate hierarchies), but the framework \
         surfaces a focused error rather than letting the \
         migration through. Call set_cpuset_mems on this cgroup \
         or widen an ancestor's cpuset.mems before move_task",
        cpus,
    );
}
/// Moves every pid in `pids` into cgroup `name`, in order.
///
/// A pid whose move fails with `ESRCH` is logged and skipped. Any other
/// failure stops the loop and is returned, leaving the remaining pids
/// unmoved.
pub fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()> {
    validate_cgroup_name(name)?;
    for &pid in pids {
        match self.move_task_with_retry(name, pid) {
            Ok(()) => {}
            Err(e) if is_esrch(&e) => {
                tracing::warn!(pid, cgroup = name, "task vanished during migration");
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
/// Calls `move_task`, retrying while it fails with `EBUSY`.
///
/// At most `MAX_RETRIES` (3) attempts are made, with a 100 ms pause between
/// them. Any non-`EBUSY` error, or `EBUSY` on the final attempt, is returned
/// as-is.
fn move_task_with_retry(&self, name: &str, pid: libc::pid_t) -> Result<()> {
    const MAX_RETRIES: u32 = 3;
    const RETRY_DELAY: Duration = Duration::from_millis(100);
    // 1-based number of the attempt currently being made.
    let mut attempt: u32 = 1;
    loop {
        let err = match self.move_task(name, pid) {
            Ok(()) => return Ok(()),
            Err(e) => e,
        };
        if attempt >= MAX_RETRIES || !is_ebusy(&err) {
            return Err(err);
        }
        tracing::debug!(
            pid,
            cgroup = name,
            attempt,
            "EBUSY on cgroup.procs write, retrying"
        );
        std::thread::sleep(RETRY_DELAY);
        attempt += 1;
    }
}
/// Disables every controller currently listed in the
/// `cgroup.subtree_control` file of cgroup `name`.
///
/// Reads the file and writes back the same tokens, each prefixed with `-`.
/// Succeeds without writing when the file is missing or lists nothing.
///
/// # Errors
///
/// An invalid `name`, or the read or the write failing.
pub fn clear_subtree_control(&self, name: &str) -> Result<()> {
    validate_cgroup_name(name)?;
    let control = self.parent.join(name).join("cgroup.subtree_control");
    if !control.exists() {
        return Ok(());
    }
    let enabled =
        fs::read_to_string(&control).with_context(|| format!("read {}", control.display()))?;
    let mut disable_str = String::new();
    for controller in enabled.split_whitespace() {
        if !disable_str.is_empty() {
            disable_str.push(' ');
        }
        disable_str.push('-');
        disable_str.push_str(controller);
    }
    // No tokens at all means nothing is enabled.
    if disable_str.is_empty() {
        return Ok(());
    }
    write_with_timeout(&control, &disable_str, CGROUP_WRITE_TIMEOUT)
        .with_context(|| format!("clear subtree_control on {name}"))
}
/// Moves the processes listed in cgroup `name`'s `cgroup.procs` to the root
/// cgroup via `drain_pids_to_root`.
///
/// Does nothing when the `cgroup.procs` file does not exist. Failures for
/// individual pids are handled inside `drain_pids_to_root` and never
/// surface here.
///
/// # Errors
///
/// Only an invalid `name`.
pub fn drain_tasks(&self, name: &str) -> Result<()> {
    validate_cgroup_name(name)?;
    let procs = self.parent.join(name).join("cgroup.procs");
    if procs.exists() {
        drain_pids_to_root(&procs, name);
    }
    Ok(())
}
/// Best-effort teardown of every child cgroup under the parent.
///
/// Each directory directly beneath the parent is handed to
/// `cleanup_recursive`; the parent itself is left in place. If the parent
/// cannot be listed, that is logged. This function always returns `Ok`.
pub fn cleanup_all(&self) -> Result<()> {
    if self.parent.exists() {
        let walked = for_each_child_dir(&self.parent, "cleanup_all", cleanup_recursive);
        if let Err(err) = walked {
            tracing::warn!(
                parent = %self.parent.display(),
                err = %err,
                "cleanup_all: read_dir failed; child cgroups may remain under parent",
            );
        }
    }
    Ok(())
}
}
/// Operations on child cgroups beneath a managed parent directory.
///
/// The method set mirrors the public inherent methods of `CgroupManager`,
/// whose implementation of this trait forwards each call one-to-one. The
/// per-method notes describe what that implementation does.
/// NOTE(review): presumably this trait exists so callers can substitute
/// another implementation (e.g. a test double) — confirm against callers.
pub trait CgroupOps {
/// Directory under which child cgroups live.
fn parent_path(&self) -> &Path;
/// Prepare the parent and enable `controllers` for its children.
fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()>;
/// Create the child cgroup `name`.
fn create_cgroup(&self, name: &str) -> Result<()>;
/// Drain and remove the child cgroup `name`.
fn remove_cgroup(&self, name: &str) -> Result<()>;
/// Write `cpus` to `cpuset.cpus`.
fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()>;
/// Write an empty string to `cpuset.cpus`.
fn clear_cpuset(&self, name: &str) -> Result<()>;
/// Write `nodes` to `cpuset.mems`.
fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()>;
/// Write an empty string to `cpuset.mems`.
fn clear_cpuset_mems(&self, name: &str) -> Result<()>;
/// Write `cpu.max`; `None` quota is written as `max`.
fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()>;
/// Write `cpu.weight`.
fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()>;
/// Write `memory.max`; `None` is written as `max`.
fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()>;
/// Write `memory.high`; `None` is written as `max`.
fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()>;
/// Write `memory.low`; `None` is written as `0`.
fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()>;
/// Write `io.weight`.
fn set_io_weight(&self, name: &str, weight: u16) -> Result<()>;
/// Write `cgroup.freeze`: `1` when `frozen`, otherwise `0`.
fn set_freeze(&self, name: &str, frozen: bool) -> Result<()>;
/// Write `pids.max`; `None` is written as `max`.
fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()>;
/// Write `memory.swap.max`; `None` is written as `max`.
fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()>;
/// Move one process into the cgroup.
fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()>;
/// Move several processes into the cgroup, skipping ones that have vanished.
fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()>;
/// Disable every controller listed in the cgroup's `cgroup.subtree_control`.
fn clear_subtree_control(&self, name: &str) -> Result<()>;
/// Move the cgroup's processes to the root cgroup.
fn drain_tasks(&self, name: &str) -> Result<()>;
/// Best-effort removal of every child cgroup under the parent.
fn cleanup_all(&self) -> Result<()>;
}
// Forwards every trait method to the inherent `CgroupManager` method of the
// same name, without adding behavior.
//
// The calls are spelled `CgroupManager::method(self, ..)`. In that path form
// an inherent associated function takes precedence over a trait method with
// the same name, so each call reaches the inherent implementation and does
// not recurse back into this impl.
impl CgroupOps for CgroupManager {
fn parent_path(&self) -> &Path {
CgroupManager::parent_path(self)
}
fn setup(&self, controllers: &BTreeSet<Controller>) -> Result<()> {
CgroupManager::setup(self, controllers)
}
fn create_cgroup(&self, name: &str) -> Result<()> {
CgroupManager::create_cgroup(self, name)
}
fn remove_cgroup(&self, name: &str) -> Result<()> {
CgroupManager::remove_cgroup(self, name)
}
fn set_cpuset(&self, name: &str, cpus: &BTreeSet<usize>) -> Result<()> {
CgroupManager::set_cpuset(self, name, cpus)
}
fn clear_cpuset(&self, name: &str) -> Result<()> {
CgroupManager::clear_cpuset(self, name)
}
fn set_cpuset_mems(&self, name: &str, nodes: &BTreeSet<usize>) -> Result<()> {
CgroupManager::set_cpuset_mems(self, name, nodes)
}
fn clear_cpuset_mems(&self, name: &str) -> Result<()> {
CgroupManager::clear_cpuset_mems(self, name)
}
fn set_cpu_max(&self, name: &str, quota_us: Option<u64>, period_us: u64) -> Result<()> {
CgroupManager::set_cpu_max(self, name, quota_us, period_us)
}
fn set_cpu_weight(&self, name: &str, weight: u32) -> Result<()> {
CgroupManager::set_cpu_weight(self, name, weight)
}
fn set_memory_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
CgroupManager::set_memory_max(self, name, bytes)
}
fn set_memory_high(&self, name: &str, bytes: Option<u64>) -> Result<()> {
CgroupManager::set_memory_high(self, name, bytes)
}
fn set_memory_low(&self, name: &str, bytes: Option<u64>) -> Result<()> {
CgroupManager::set_memory_low(self, name, bytes)
}
fn set_io_weight(&self, name: &str, weight: u16) -> Result<()> {
CgroupManager::set_io_weight(self, name, weight)
}
fn set_freeze(&self, name: &str, frozen: bool) -> Result<()> {
CgroupManager::set_freeze(self, name, frozen)
}
fn set_pids_max(&self, name: &str, max: Option<u64>) -> Result<()> {
CgroupManager::set_pids_max(self, name, max)
}
fn set_memory_swap_max(&self, name: &str, bytes: Option<u64>) -> Result<()> {
CgroupManager::set_memory_swap_max(self, name, bytes)
}
fn move_task(&self, name: &str, pid: libc::pid_t) -> Result<()> {
CgroupManager::move_task(self, name, pid)
}
fn move_tasks(&self, name: &str, pids: &[libc::pid_t]) -> Result<()> {
CgroupManager::move_tasks(self, name, pids)
}
fn clear_subtree_control(&self, name: &str) -> Result<()> {
CgroupManager::clear_subtree_control(self, name)
}
fn drain_tasks(&self, name: &str) -> Result<()> {
CgroupManager::drain_tasks(self, name)
}
fn cleanup_all(&self) -> Result<()> {
CgroupManager::cleanup_all(self)
}
}
/// Waits up to `budget` for `cgroup_dir/cgroup.events` to report
/// `populated 0`.
///
/// Returns `true` as soon as the file reports unpopulated, `false` once the
/// budget has elapsed without that happening. An inotify watch on the events
/// file is used to wake up on modifications; when the watch cannot be set up
/// the function falls back to re-reading the file roughly every 10 ms.
fn wait_for_cgroup_unpopulated(cgroup_dir: &Path, budget: std::time::Duration) -> bool {
use nix::poll::{PollFd, PollFlags, PollTimeout, poll};
use nix::sys::inotify::{AddWatchFlags, InitFlags, Inotify};
use std::os::unix::io::AsFd;
let events_path = cgroup_dir.join("cgroup.events");
// Fast path: already unpopulated, so skip the inotify setup entirely.
if cgroup_events_reports_unpopulated(&events_path) {
return true;
}
let deadline = std::time::Instant::now() + budget;
// A setup failure is kept as `Err` and handled inside the loop by sleeping
// instead of polling.
let inotify_result =
Inotify::init(InitFlags::IN_CLOEXEC | InitFlags::IN_NONBLOCK).and_then(|i| {
i.add_watch(&events_path, AddWatchFlags::IN_MODIFY)?;
Ok(i)
});
loop {
// Checked again at the top of every iteration, including the first one
// after the watch was armed, so a change made between the fast-path read
// and the watch setup is still observed.
if cgroup_events_reports_unpopulated(&events_path) {
return true;
}
let now = std::time::Instant::now();
if now >= deadline {
return false;
}
// The poll timeout is taken as u16 milliseconds, so the remaining time is
// clamped to u16::MAX ms; a longer budget simply loops again.
let remaining_ms = deadline
.duration_since(now)
.as_millis()
.min(u16::MAX as u128) as u16;
match inotify_result.as_ref() {
Ok(inotify) => {
let fd = inotify.as_fd();
let mut pollfds = [PollFd::new(fd, PollFlags::POLLIN)];
// Both results are ignored: whatever woke us up (event, timeout or
// error), the next iteration re-reads the file and the clock.
let _ = poll(&mut pollfds, PollTimeout::from(remaining_ms));
// NOTE(review): presumably drains queued events so the next poll
// blocks again — confirm against nix's `read_events` semantics.
let _ = inotify.read_events();
}
Err(_) => {
// No inotify available: sleep 10 ms, but never past the deadline.
std::thread::sleep(
std::time::Duration::from_millis(10).min(deadline.duration_since(now)),
);
}
}
}
}
/// Returns `true` when the file at `events_path` contains a line made of
/// exactly the two whitespace-separated tokens `populated` and `0`.
///
/// An unreadable file counts as "not unpopulated" (`false`).
fn cgroup_events_reports_unpopulated(events_path: &Path) -> bool {
    let Ok(contents) = fs::read_to_string(events_path) else {
        return false;
    };
    contents.lines().any(|line| {
        let mut fields = line.split_whitespace();
        fields.next() == Some("populated")
            && fields.next() == Some("0")
            && fields.next().is_none()
    })
}
/// Best-effort: moves every process listed in `procs_path` (a `cgroup.procs`
/// file) to the root cgroup by writing each pid to
/// `/sys/fs/cgroup/cgroup.procs`.
///
/// `context` is only used to label log records. Nothing is returned: an
/// unreadable source file, malformed lines and failed moves are logged and
/// skipped, and a move failing with `ESRCH` is ignored silently.
///
/// Entries reading `0` are skipped. The kernel lists a task as `0` when it is
/// not visible in the reader's pid namespace, and writing `0` to a
/// `cgroup.procs` file migrates the *writing* process — so forwarding such an
/// entry would move this process to the root cgroup instead of the task that
/// was meant.
fn drain_pids_to_root(procs_path: &Path, context: &str) {
    let dst = Path::new("/sys/fs/cgroup/cgroup.procs");
    let content = match fs::read_to_string(procs_path) {
        Ok(c) => c,
        Err(e) => {
            tracing::warn!(
                path = %procs_path.display(),
                cgroup = context,
                err = %e,
                "drain_pids_to_root: read_to_string failed; tasks may remain in cgroup",
            );
            return;
        }
    };
    for line in content.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let pid: u32 = match trimmed.parse() {
            Ok(p) => p,
            Err(e) => {
                tracing::warn!(
                    path = %procs_path.display(),
                    cgroup = context,
                    line = trimmed,
                    err = %e,
                    "drain_pids_to_root: malformed pid line; skipping",
                );
                continue;
            }
        };
        if pid == 0 {
            // See the function docs: "0" would target the writer itself.
            tracing::warn!(
                path = %procs_path.display(),
                cgroup = context,
                "drain_pids_to_root: pid 0 entry (task not visible in this pid namespace); skipping",
            );
            continue;
        }
        if let Err(e) = write_with_timeout(dst, &pid.to_string(), CGROUP_WRITE_TIMEOUT)
            && !is_esrch(&e)
        {
            tracing::warn!(pid, cgroup = context, err = %e, "failed to drain task");
        }
    }
}
/// Calls `f` with the path of every directory directly inside `path`.
///
/// Entries that cannot be read, or whose file type cannot be determined, are
/// logged (prefixed with `context`) and skipped. Entries that are not
/// directories are skipped silently.
///
/// # Errors
///
/// Only when `path` itself cannot be opened for listing.
fn for_each_child_dir(path: &Path, context: &str, mut f: impl FnMut(&Path)) -> std::io::Result<()> {
    for entry in fs::read_dir(path)? {
        let entry = match entry {
            Ok(e) => e,
            Err(err) => {
                tracing::warn!(
                    path = %path.display(),
                    err = %err,
                    "{context}: dir entry read failed; skipping",
                );
                continue;
            }
        };
        let file_type = match entry.file_type() {
            Ok(t) => t,
            Err(err) => {
                tracing::warn!(
                    path = %entry.path().display(),
                    err = %err,
                    "{context}: file_type read failed; skipping entry",
                );
                continue;
            }
        };
        if file_type.is_dir() {
            f(&entry.path());
        }
    }
    Ok(())
}
fn cleanup_recursive(path: &std::path::Path) {
if let Err(err) = for_each_child_dir(path, "cleanup_recursive", cleanup_recursive) {
tracing::warn!(
path = %path.display(),
err = %err,
"cleanup_recursive: read_dir failed; child cgroups may remain",
);
}
let freeze_path = path.join("cgroup.freeze");
if freeze_path.exists()
&& let Err(err) = write_with_timeout(&freeze_path, "0", CGROUP_WRITE_TIMEOUT)
{
tracing::warn!(
path = %path.display(),
err = %format!("{err:#}"),
"cleanup_recursive: pre-drain unfreeze failed; source-cgroup state-hygiene step skipped",
);
}
drain_pids_to_root(&path.join("cgroup.procs"), &path.display().to_string());
wait_for_cgroup_unpopulated(path, std::time::Duration::from_secs(1));
if let Err(err) = fs::remove_dir(path) {
tracing::warn!(
path = %path.display(),
err = %err,
"cleanup_recursive: remove_dir failed; cgroup directory may remain",
);
}
}
// Unit tests: compiled only under `cfg(test)` and loaded from the file named
// by the `#[path]` attribute rather than from `tests.rs`.
#[cfg(test)]
#[path = "cgroup_tests.rs"]
mod tests;