Skip to main content

hyperi_rustlib/memory/
cgroup.rs

1// Project:   hyperi-rustlib
2// File:      src/memory/cgroup.rs
3// Purpose:   Cgroup-aware memory limit + pressure detection
4// Language:  Rust
5//
6// License:   BUSL-1.1
7// Copyright: (c) 2026 HYPERI PTY LIMITED
8
9//! Cgroup-aware memory limit and pressure detection.
10//!
11//! Three signals, all container-first (what the kernel/OOM-killer act on, not
12//! host-wide `used/total`):
13//!
14//! - **limit** (`memory.max`): the hard ceiling -- crossing it is an OOM-kill.
15//! - **high** (`memory.high`): the soft throttle -- the kernel reclaims hard
16//!   and throttles allocations here, BEFORE the OOM-kill. Shedding before it
17//!   avoids a latency cliff.
18//! - **PSI** (`memory.pressure` `some avg10`): the earliest signal -- the
19//!   fraction of the last 10s in which a task stalled waiting on memory. It
20//!   rises on reclaim/thrash before byte ratios cross a threshold.
21
22use std::fs;
23use std::path::Path;
24
/// Mount point of the cgroup v2 hierarchy.
///
/// The public detection functions always read from this real path. The
/// private `*_at` helpers accept a root directory instead, so tests can point
/// them at a fixture directory.
const CGROUP_V2_ROOT: &str = "/sys/fs/cgroup";
28
29/// Detect the memory limit for this process.
30///
31/// Priority:
32/// 1. Cgroup v2: `/sys/fs/cgroup/memory.max`
33/// 2. Cgroup v1: `/sys/fs/cgroup/memory/memory.limit_in_bytes`
34/// 3. System available memory (via sysinfo)
35///
36/// Returns the limit in bytes.
37pub fn detect_memory_limit() -> u64 {
38    // Try cgroup v2
39    if let Some(limit) = read_cgroup_v2_limit_at(Path::new(CGROUP_V2_ROOT)) {
40        tracing::info!(
41            limit_bytes = limit,
42            source = "cgroup-v2",
43            "detected memory limit"
44        );
45        return limit;
46    }
47
48    // Try cgroup v1
49    if let Some(limit) = read_cgroup_v1_limit() {
50        tracing::info!(
51            limit_bytes = limit,
52            source = "cgroup-v1",
53            "detected memory limit"
54        );
55        return limit;
56    }
57
58    // Fallback to system memory
59    let mut sys = sysinfo::System::new();
60    sys.refresh_memory();
61    let total = sys.total_memory();
62    tracing::info!(
63        limit_bytes = total,
64        source = "system-memory",
65        "detected memory limit (no cgroup)"
66    );
67    total
68}
69
70/// Soft throttle ceiling: cgroup v2 `memory.high`.
71///
72/// The kernel begins aggressive reclaim and allocation throttling here, before
73/// the hard `memory.max` OOM-kill, so an app that sheds before `memory.high`
74/// avoids the throughput cliff. `None` when unset (`max`) or not cgroup v2.
75#[must_use]
76pub fn detect_memory_high() -> Option<u64> {
77    read_cgroup_v2_high_at(Path::new(CGROUP_V2_ROOT))
78}
79
80/// Detect THIS container's own memory pressure as a 0.0-1.0+ usage fraction.
81///
82/// Returns the WORST of `current/max` and `current/high` (v2), so the signal
83/// rises as the container approaches EITHER the hard OOM ceiling or the soft
84/// throttle, whichever is nearer. Falls back to v1 `usage/limit`. `None` when
85/// no cgroup memory limit is in force (bare metal, or `memory.max == max`), in
86/// which case callers fall back to a process/host signal.
87///
88/// This is the signal a container scheduler (K8s/cgroup OOM killer) actually
89/// acts on -- unlike host-wide `used/total` memory, which on a large shared
90/// host is unrelated to this container's limit.
91#[must_use]
92pub fn detect_memory_pressure() -> Option<f64> {
93    detect_memory_pressure_at(Path::new(CGROUP_V2_ROOT))
94}
95
96/// Memory PSI stall fraction: cgroup v2 `memory.pressure` `some avg10`, as a
97/// 0.0-1.0 fraction (`avg10` is the percentage of the last 10s in which at
98/// least one task stalled waiting on memory).
99///
100/// This is the earliest memory-pressure signal -- it rises on reclaim/thrash
101/// before byte ratios cross a threshold. Exposed for observability (emit it as
102/// a gauge and alert on it); it is deliberately NOT folded into the scale/shed
103/// decision, because the stall-percent at which to act is workload-specific and
104/// wants per-service calibration, not a guessed constant.
105///
106/// `None` when PSI is unavailable (kernel < 4.20, PSI disabled, or cgroup v1).
107#[must_use]
108pub fn detect_memory_stall() -> Option<f64> {
109    read_memory_psi_some_avg10_at(Path::new(CGROUP_V2_ROOT))
110}
111
112fn detect_memory_pressure_at(root: &Path) -> Option<f64> {
113    // v2: worst of current/max (hard) and current/high (soft throttle).
114    if let Some(current) = read_cgroup_v2_current_at(root) {
115        let mut worst: Option<f64> = None;
116        for limit in [read_cgroup_v2_limit_at(root), read_cgroup_v2_high_at(root)]
117            .into_iter()
118            .flatten()
119            .filter(|l| *l > 0)
120        {
121            let ratio = current as f64 / limit as f64;
122            worst = Some(worst.map_or(ratio, |w| w.max(ratio)));
123        }
124        return worst;
125    }
126
127    // v1 fallback: usage/limit only (no memory.high equivalent).
128    let limit = read_cgroup_v1_limit()?;
129    if limit == 0 {
130        return None;
131    }
132    let current = read_cgroup_v1_current()?;
133    Some(current as f64 / limit as f64)
134}
135
/// Read the hard memory ceiling from `memory.max` under `root`.
///
/// Returns `None` when the file cannot be read, holds the literal `max`
/// (no limit set), or does not parse as a `u64`.
fn read_cgroup_v2_limit_at(root: &Path) -> Option<u64> {
    let raw = fs::read_to_string(root.join("memory.max")).ok()?;
    match raw.trim() {
        // The literal `max` means no limit is set.
        "max" => None,
        digits => digits.parse::<u64>().ok(),
    }
}
144
/// Read the soft throttle ceiling from `memory.high` under `root`.
///
/// Returns `None` when the file cannot be read, holds the literal `max`
/// (no soft throttle set), or does not parse as a `u64`.
fn read_cgroup_v2_high_at(root: &Path) -> Option<u64> {
    let raw = fs::read_to_string(root.join("memory.high")).ok()?;
    Some(raw.trim())
        // The literal `max` means no soft throttle is set.
        .filter(|text| *text != "max")
        .and_then(|digits| digits.parse::<u64>().ok())
}
153
/// Read current usage in bytes from `memory.current` under `root`.
///
/// Returns `None` when the file cannot be read or its trimmed contents do not
/// parse as a `u64`.
fn read_cgroup_v2_current_at(root: &Path) -> Option<u64> {
    match fs::read_to_string(root.join("memory.current")) {
        Ok(raw) => raw.trim().parse::<u64>().ok(),
        Err(_) => None,
    }
}
161
/// Extract `some avg10=N` from the cgroup v2 `memory.pressure` file under
/// `root` and return it as a fraction clamped to 0-1 (`N / 100`).
///
/// The file looks like:
///
/// ```text
/// some avg10=0.42 avg60=0.10 avg300=0.03 total=12345
/// full avg10=0.10 avg60=0.02 avg300=0.00 total=4567
/// ```
///
/// Only the first line starting with `some ` is consulted. Returns `None`
/// when the file is unreadable, that line or its `avg10=` field is missing,
/// or the value is not a finite number.
fn read_memory_psi_some_avg10_at(root: &Path) -> Option<f64> {
    let report = fs::read_to_string(root.join("memory.pressure")).ok()?;
    let percent = report
        .lines()
        .find(|line| line.starts_with("some "))
        .and_then(|line| {
            line.split_whitespace()
                .find_map(|token| token.strip_prefix("avg10="))
        })
        .and_then(|digits| digits.parse::<f64>().ok())?;

    // `parse` accepts spellings such as `inf` and `NaN`; reject those.
    if percent.is_finite() {
        Some((percent / 100.0).clamp(0.0, 1.0))
    } else {
        None
    }
}
182
/// Read current cgroup v1 memory usage in bytes from `memory.usage_in_bytes`.
///
/// Returns `None` when the file cannot be read or its trimmed contents do not
/// parse as a `u64`.
fn read_cgroup_v1_current() -> Option<u64> {
    let raw = fs::read_to_string("/sys/fs/cgroup/memory/memory.usage_in_bytes").ok()?;
    raw.trim().parse::<u64>().ok()
}
190
/// Read the cgroup v1 hard limit in bytes from `memory.limit_in_bytes`.
///
/// Returns `None` when the file cannot be read or parsed, or when the value
/// exceeds `1 << 62`: cgroup v1 uses a very large number for "no limit".
fn read_cgroup_v1_limit() -> Option<u64> {
    // Anything strictly above this is treated as the "no limit" sentinel.
    const UNLIMITED_ABOVE: u64 = 1 << 62;

    let raw = fs::read_to_string("/sys/fs/cgroup/memory/memory.limit_in_bytes").ok()?;
    raw.trim()
        .parse::<u64>()
        .ok()
        .filter(|&bytes| bytes <= UNLIMITED_ABOVE)
}
200
#[cfg(test)]
mod tests {
    //! Unit tests for the cgroup readers.
    //!
    //! The `*_at` helpers are exercised against real files written into a
    //! temporary directory; the two host-facing tests only assert invariants
    //! that hold on any machine.

    use super::*;

    /// Write `name`->`contents` files into a fresh temp dir and return it.
    /// Real files on disk -- the readers do real `fs::read_to_string`, no mocks.
    fn cgroup_fixture(files: &[(&str, &str)]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for (name, contents) in files {
            std::fs::write(dir.path().join(name), contents).expect("write fixture");
        }
        dir
    }

    #[test]
    fn test_detect_memory_limit_returns_nonzero() {
        let limit = detect_memory_limit();
        assert!(limit > 0, "memory limit should be positive");
    }

    #[test]
    fn test_detect_memory_pressure_is_none_or_valid_fraction() {
        // Either no cgroup limit is in force (None) or a finite, non-negative
        // ratio. We can't assert an exact value -- it depends on the host.
        match detect_memory_pressure() {
            None => {}
            Some(r) => assert!(
                r.is_finite() && r >= 0.0,
                "cgroup pressure must be finite and non-negative, got {r}"
            ),
        }
    }

    #[test]
    fn v2_limit_reads_max_and_treats_literal_max_as_unlimited() {
        let dir = cgroup_fixture(&[("memory.max", "536870912\n")]);
        assert_eq!(read_cgroup_v2_limit_at(dir.path()), Some(536_870_912));

        let dir = cgroup_fixture(&[("memory.max", "max\n")]);
        assert_eq!(
            read_cgroup_v2_limit_at(dir.path()),
            None,
            "'max' = no limit"
        );
    }

    #[test]
    fn v2_high_reads_soft_throttle_and_handles_unset() {
        let dir = cgroup_fixture(&[("memory.high", "402653184\n")]);
        assert_eq!(read_cgroup_v2_high_at(dir.path()), Some(402_653_184));

        let dir = cgroup_fixture(&[("memory.high", "max\n")]);
        assert_eq!(
            read_cgroup_v2_high_at(dir.path()),
            None,
            "'max' = no throttle"
        );

        // File absent entirely (kernel/cgroup without memory.high).
        let dir = cgroup_fixture(&[]);
        assert_eq!(read_cgroup_v2_high_at(dir.path()), None);
    }

    #[test]
    fn pressure_takes_worst_of_max_and_high() {
        // current=300M, max=512M (0.586), high=400M (0.75). Worst = 0.75:
        // the soft-throttle ratio is nearer, so we shed on it.
        let dir = cgroup_fixture(&[
            ("memory.current", "314572800\n"),
            ("memory.max", "536870912\n"),
            ("memory.high", "419430400\n"),
        ]);
        let p = detect_memory_pressure_at(dir.path()).expect("v2 pressure");
        assert!(
            (p - 0.75).abs() < 0.01,
            "worst-of should pick current/high, got {p}"
        );
    }

    #[test]
    fn pressure_uses_max_when_high_unset() {
        let dir = cgroup_fixture(&[
            ("memory.current", "268435456\n"), // 256M
            ("memory.max", "536870912\n"),     // 512M -> 0.5
            ("memory.high", "max\n"),          // unset
        ]);
        let p = detect_memory_pressure_at(dir.path()).expect("v2 pressure");
        assert!((p - 0.5).abs() < 0.01, "falls back to current/max, got {p}");
    }

    #[test]
    fn pressure_is_none_when_no_limit_in_force() {
        let dir = cgroup_fixture(&[
            ("memory.current", "268435456\n"),
            ("memory.max", "max\n"),
            ("memory.high", "max\n"),
        ]);
        assert_eq!(detect_memory_pressure_at(dir.path()), None);
    }

    #[test]
    fn pressure_ignores_zero_limits() {
        // A zero ceiling cannot be divided by; it must be skipped rather than
        // producing an infinite or NaN ratio.
        let dir = cgroup_fixture(&[
            ("memory.current", "268435456\n"),
            ("memory.max", "0\n"),
            ("memory.high", "max\n"),
        ]);
        assert_eq!(detect_memory_pressure_at(dir.path()), None);
    }

    #[test]
    fn psi_parses_some_avg10_as_fraction() {
        let dir = cgroup_fixture(&[(
            "memory.pressure",
            "some avg10=42.00 avg60=10.00 avg300=3.00 total=12345\n\
             full avg10=10.00 avg60=2.00 avg300=0.00 total=4567\n",
        )]);
        let stall = read_memory_psi_some_avg10_at(dir.path()).expect("psi");
        assert!(
            (stall - 0.42).abs() < 0.001,
            "avg10=42 -> 0.42, got {stall}"
        );
    }

    #[test]
    fn psi_is_none_when_file_absent() {
        let dir = cgroup_fixture(&[]);
        assert_eq!(read_memory_psi_some_avg10_at(dir.path()), None);
    }

    #[test]
    fn psi_clamps_and_handles_zero_stall() {
        let dir = cgroup_fixture(&[(
            "memory.pressure",
            "some avg10=0.00 avg60=0.00 avg300=0.00 total=0\n",
        )]);
        assert_eq!(read_memory_psi_some_avg10_at(dir.path()), Some(0.0));

        // Out-of-range percentage: must clamp to the top of the 0-1 range.
        let dir = cgroup_fixture(&[(
            "memory.pressure",
            "some avg10=250.00 avg60=0.00 avg300=0.00 total=0\n",
        )]);
        assert_eq!(read_memory_psi_some_avg10_at(dir.path()), Some(1.0));
    }

    #[test]
    fn psi_is_none_when_avg10_missing_or_malformed() {
        // No `some` line at all.
        let dir = cgroup_fixture(&[(
            "memory.pressure",
            "full avg10=10.00 avg60=2.00 avg300=0.00 total=4567\n",
        )]);
        assert_eq!(read_memory_psi_some_avg10_at(dir.path()), None);

        // `some` line without an `avg10=` field.
        let dir = cgroup_fixture(&[("memory.pressure", "some avg60=0.00 total=0\n")]);
        assert_eq!(read_memory_psi_some_avg10_at(dir.path()), None);

        // `avg10=` present but not a number.
        let dir = cgroup_fixture(&[("memory.pressure", "some avg10=oops avg60=0.00\n")]);
        assert_eq!(read_memory_psi_some_avg10_at(dir.path()), None);
    }
}