sqlite_graphrag/system_load.rs
1//! G28-D: system load average observation before spawning LLM subprocesses.
2//!
3//! The 2026-06-03 incident saturated a 10-CPU host with load 276 because
4//! parallel `enrich` workers kept spawning `claude -p` / `codex exec`
5//! children even when the system was already at saturation. This module
6//! exposes a single helper that returns `true` when the 1-minute load
7//! average is above `2 × ncpus` (the conservative threshold recommended
8//! in the original G28-D discussion).
9//!
10//! Uses `sysinfo::System::load_average()` which is already a transitive
11//! dependency of the project. The read is cheap (single syscall on
12//! Linux) and throttled to once per second via a Mutex-cached timestamp.
13
14use std::sync::Mutex;
15use std::time::{Duration, Instant};
16
/// Instant most recently recorded by `ensure_fresh`, shared by every thread
/// through the mutex. Starts as `None` and stays `None` until the first call.
17static LAST_REFRESH: Mutex<Option<Instant>> = Mutex::new(None);
18
19/// Returns the 1-minute load average as reported by the OS.
20///
21/// On platforms where `sysinfo` cannot read load average (very old Linux
22/// without /proc/loadavg), returns `0.0` so callers default to "no
23/// saturation detected".
24pub fn load_average_one() -> f64 {
25 let _ = ensure_fresh();
26 sysinfo::System::load_average().one
27}
28
/// Reports how many logical CPUs the runtime can detect.
///
/// The figure comes from `std::thread::available_parallelism`; if that
/// query fails, `4` is assumed. Combined with [`load_average_one`] it
/// yields the saturation check.
pub fn ncpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        // The runtime could not determine the CPU count.
        Err(_) => 4,
    }
}
37
38/// G28-D: returns `true` when the 1-minute load average exceeds
39/// `2 × ncpus` (the conservative threshold originally proposed in the
40/// G28 audit). The default threshold can be overridden by the
41/// `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` env var.
42pub fn is_system_saturated() -> bool {
43 let load = load_average_one();
44 let n = ncpus() as f64;
45 let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
46 .ok()
47 .and_then(|v| v.parse().ok())
48 .unwrap_or(2.0);
49 load > n * multiplier
50}
51
52/// Throttles the cached refresh timestamp so we read /proc/loadavg at
53/// most once per second across all callers. The function returns the
54/// previous timestamp (or None on first call) so the caller can decide
55/// whether to actually invoke the syscall.
56fn ensure_fresh() -> Option<Instant> {
57 let mut guard = LAST_REFRESH.lock().expect("loadavg mutex poisoned");
58 let now = Instant::now();
59 let should_refresh = guard
60 .as_ref()
61 .is_none_or(|last| now.duration_since(*last) > Duration::from_secs(1));
62 let prev = guard.as_ref().copied();
63 if should_refresh {
64 *guard = Some(now);
65 }
66 prev
67}
68
#[cfg(test)]
mod tests {
    use super::*;

    /// `ncpus` substitutes 4 when detection fails, so zero is never reported.
    #[test]
    fn ncpus_is_at_least_one() {
        assert!(ncpus() >= 1);
    }

    /// The sample must be a real, non-negative number.
    #[test]
    fn load_average_is_non_negative() {
        // `>=` is false for NaN, so a NaN sample fails here as well.
        assert!(load_average_one() >= 0.0);
    }

    /// The effective multiplier (override or the 2.0 default) must be usable.
    #[test]
    fn saturation_default_threshold_is_two() {
        // G28-D default: 2 × ncpus. Operators can lower it via env var when
        // running on contended CI runners, so an override below 1.0 is
        // legitimate and must not fail the suite; only require a positive
        // value, which is what makes the threshold meaningful.
        let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(2.0);
        assert!(
            multiplier > 0.0,
            "SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU must be a positive number"
        );
    }

    /// The function must always return a definitive answer.
    #[test]
    fn saturation_check_does_not_panic() {
        let _ = is_system_saturated();
    }

    /// Once any call has happened, the next one sees a stored timestamp.
    #[test]
    fn ensure_fresh_returns_previous_then_sets_new() {
        // Other tests share the static and may already have populated it, so
        // the result of this first call is not asserted on. What must hold
        // regardless of test ordering is that a call made after it returns
        // the previously stored timestamp.
        let _ = ensure_fresh();
        assert!(ensure_fresh().is_some());
    }
}
110}