Skip to main content

sqlite_graphrag/
system_load.rs

1//! G28-D: system load average observation before spawning LLM subprocesses.
2//!
3//! The 2026-06-03 incident saturated a 10-CPU host with load 276 because
4//! parallel `enrich` workers kept spawning `claude -p` / `codex exec`
5//! children even when the system was already at saturation. This module
6//! exposes a single helper that returns `true` when the 1-minute load
7//! average is above `2 × ncpus` (the conservative threshold the G28-D
8//! original discussion recommended).
9//!
10//! Uses `sysinfo::System::load_average()` which is already a transitive
11//! dependency of the project. The read is cheap (single syscall on
12//! Linux) and throttled to once per second via a Mutex-cached timestamp.
13
14use std::sync::Mutex;
15use std::time::{Duration, Instant};
16
17static LAST_REFRESH: Mutex<Option<Instant>> = Mutex::new(None);
18
19/// Returns the 1-minute load average as reported by the OS.
20///
21/// On platforms where `sysinfo` cannot read load average (very old Linux
22/// without /proc/loadavg), returns `0.0` so callers default to "no
23/// saturation detected".
24pub fn load_average_one() -> f64 {
25    let _ = ensure_fresh();
26    sysinfo::System::load_average().one
27}
28
/// Returns how many logical CPUs the runtime reports as available.
///
/// Falls back to `4` when `std::thread::available_parallelism` returns
/// an error. Paired with [`load_average_one`] for the saturation check.
pub fn ncpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        // Detection failed: assume a small multi-core host.
        Err(_) => 4,
    }
}
37
38/// G28-D: returns `true` when the 1-minute load average exceeds
39/// `2 × ncpus` (the conservative threshold originally proposed in the
40/// G28 audit). The default threshold can be overridden by the
41/// `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` env var.
42pub fn is_system_saturated() -> bool {
43    let load = load_average_one();
44    let n = ncpus() as f64;
45    let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
46        .ok()
47        .and_then(|v| v.parse().ok())
48        .unwrap_or(2.0);
49    load > n * multiplier
50}
51
52/// Throttles the cached refresh timestamp so we read /proc/loadavg at
53/// most once per second across all callers. The function returns the
54/// previous timestamp (or None on first call) so the caller can decide
55/// whether to actually invoke the syscall.
56fn ensure_fresh() -> Option<Instant> {
57    let mut guard = LAST_REFRESH.lock().expect("loadavg mutex poisoned");
58    let now = Instant::now();
59    let should_refresh = guard
60        .as_ref()
61        .is_none_or(|last| now.duration_since(*last) > Duration::from_secs(1));
62    let prev = guard.as_ref().copied();
63    if should_refresh {
64        *guard = Some(now);
65    }
66    prev
67}
68
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ncpus_is_at_least_one() {
        assert!(ncpus() >= 1);
    }

    #[test]
    fn load_average_is_non_negative() {
        assert!(load_average_one() >= 0.0);
    }

    #[test]
    fn saturation_default_threshold_is_two() {
        // G28-D default: 2 × ncpus. Operators can lower it via env var
        // when running on contended CI runners, so any positive override
        // (including values below 1.0) must not fail this test.
        let env_default: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(2.0);
        assert!(env_default > 0.0);
    }

    #[test]
    fn saturation_check_does_not_panic() {
        // The function must always return a definitive answer.
        let _ = is_system_saturated();
    }

    #[test]
    fn ensure_fresh_returns_previous_then_sets_new() {
        // Tests run in parallel and share the timestamp, so the first call
        // here may or may not observe `None`. Either way a timestamp is
        // stored once any call has completed, so the next call must see one.
        let _ = ensure_fresh();
        assert!(ensure_fresh().is_some());
    }
}