Skip to main content

sqlite_graphrag/
system_load.rs

1//! G28-D: system load average observation before spawning LLM subprocesses.
2//!
3//! The 2026-06-03 incident saturated a 10-CPU host with load 276 because
4//! parallel `enrich` workers kept spawning `claude -p` / `codex exec`
5//! children even when the system was already at saturation. This module
6//! exposes a single helper that returns `true` when the 1-minute load
7//! average is above `2 × ncpus` (the conservative threshold the G28-D
8//! original discussion recommended).
9//!
10//! Uses `sysinfo::System::load_average()` which is already a transitive
11//! dependency of the project. The read is cheap (single syscall on
12//! Linux) and throttled to once per second via a Mutex-cached timestamp.
13
14use std::sync::Mutex;
15use std::time::{Duration, Instant};
16
17static LAST_REFRESH: Mutex<Option<Instant>> = Mutex::new(None);
18
19/// Returns the 1-minute load average as reported by the OS.
20///
21/// On platforms where `sysinfo` cannot read load average (very old Linux
22/// without /proc/loadavg), returns `0.0` so callers default to "no
23/// saturation detected".
24pub fn load_average_one() -> f64 {
25    let _ = ensure_fresh();
26    sysinfo::System::load_average().one
27}
28
/// Returns the amount of parallelism available to this process.
///
/// The figure comes from `std::thread::available_parallelism`; when that
/// query fails the function falls back to `4`. It is combined with
/// [`load_average_one`] to apply the saturation check.
pub fn ncpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        // The query can fail; fall back to a fixed default.
        Err(_) => 4,
    }
}
37
38/// G28-D: returns `true` when the 1-minute load average exceeds
39/// `2 × ncpus` (the conservative threshold originally proposed in the
40/// G28 audit). The default threshold can be overridden by the
41/// `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` env var.
42pub fn is_system_saturated() -> bool {
43    let load = load_average_one();
44    let n = ncpus() as f64;
45    let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
46        .ok()
47        .and_then(|v| v.parse().ok())
48        .unwrap_or(2.0);
49    load > n * multiplier
50}
51
52/// Throttles the cached refresh timestamp so we read /proc/loadavg at
53/// most once per second across all callers. The function returns the
54/// previous timestamp (or None on first call) so the caller can decide
55/// whether to actually invoke the syscall.
56fn ensure_fresh() -> Option<Instant> {
57    let mut guard = LAST_REFRESH
58        .lock()
59        .unwrap_or_else(|poisoned| poisoned.into_inner());
60    let now = Instant::now();
61    let should_refresh = guard
62        .as_ref()
63        .is_none_or(|last| now.duration_since(*last) > Duration::from_secs(1));
64    let prev = guard.as_ref().copied();
65    if should_refresh {
66        *guard = Some(now);
67    }
68    prev
69}
70
#[cfg(test)]
mod tests {
    use super::*;

    /// Env var read by `is_system_saturated` to override the multiplier.
    const OVERRIDE_VAR: &str = "SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU";

    #[test]
    fn ncpus_is_at_least_one() {
        assert!(ncpus() >= 1);
    }

    #[test]
    fn load_average_is_non_negative() {
        assert!(load_average_one() >= 0.0);
    }

    #[test]
    fn saturation_default_threshold_is_two() {
        // Operators may export an override (for example on contended CI
        // runners); the default cannot be observed in that environment,
        // so there is nothing to check.
        if std::env::var_os(OVERRIDE_VAR).is_some() {
            return;
        }

        // Bracket the call with two samples. When both samples fall on the
        // same side of the 2 × ncpus threshold, the verdict must agree
        // with that side; when they straddle it the load changed under us
        // and either verdict is acceptable.
        let threshold = 2.0 * ncpus() as f64;
        let before = load_average_one();
        let saturated = is_system_saturated();
        let after = load_average_one();

        if before > threshold && after > threshold {
            assert!(saturated);
        }
        if before <= threshold && after <= threshold {
            assert!(!saturated);
        }
    }

    #[test]
    fn saturation_check_does_not_panic() {
        // The function must always return a definitive answer.
        let _ = is_system_saturated();
    }

    #[test]
    fn ensure_fresh_returns_previous_then_sets_new() {
        let first = ensure_fresh();
        let second = ensure_fresh();

        // Whoever called first in this process, a timestamp is stored by
        // the time the first call above returns, so the second call must
        // always see one.
        let recorded = second.expect("a timestamp must be stored after the first call");

        // Stored timestamps are taken from a monotonic clock while the
        // lock is held, so they never move backwards.
        if let Some(earlier) = first {
            assert!(recorded >= earlier);
        }
    }
}