sqlite_graphrag/system_load.rs
1//! G28-D: system load average observation before spawning LLM subprocesses.
2//!
3//! The 2026-06-03 incident saturated a 10-CPU host with load 276 because
4//! parallel `enrich` workers kept spawning `claude -p` / `codex exec`
5//! children even when the system was already at saturation. This module
//! exposes [`is_system_saturated`], which returns `true` when the 1-minute
//! load average is above `2 × ncpus` by default (the conservative threshold
//! recommended in the original G28-D discussion).
9//!
10//! Uses `sysinfo::System::load_average()` which is already a transitive
11//! dependency of the project. The read is cheap (single syscall on
12//! Linux) and throttled to once per second via a Mutex-cached timestamp.
13
14use std::sync::Mutex;
15use std::time::{Duration, Instant};
16
/// Instant at which `ensure_fresh` last recorded a refresh; `None` until
/// its first call. The value is process-wide and guarded by the mutex.
static LAST_REFRESH: Mutex<Option<Instant>> = Mutex::new(None);
18
19/// Returns the 1-minute load average as reported by the OS.
20///
21/// On platforms where `sysinfo` cannot read load average (very old Linux
22/// without /proc/loadavg), returns `0.0` so callers default to "no
23/// saturation detected".
24pub fn load_average_one() -> f64 {
25 let _ = ensure_fresh();
26 sysinfo::System::load_average().one
27}
28
/// Reports how many logical CPUs the runtime can detect.
///
/// Falls back to `4` when the standard library cannot determine the
/// available parallelism. Paired with [`load_average_one`] to scale the
/// saturation threshold.
pub fn ncpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(detected) => detected.get(),
        // Detection failed: assume a small multi-core host.
        Err(_) => 4,
    }
}
37
38/// G28-D: returns `true` when the 1-minute load average exceeds
39/// `2 × ncpus` (the conservative threshold originally proposed in the
40/// G28 audit). The default threshold can be overridden by the
41/// `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` env var.
42pub fn is_system_saturated() -> bool {
43 let load = load_average_one();
44 let n = ncpus() as f64;
45 let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
46 .ok()
47 .and_then(|v| v.parse().ok())
48 .unwrap_or(2.0);
49 load > n * multiplier
50}
51
52/// Throttles the cached refresh timestamp so we read /proc/loadavg at
53/// most once per second across all callers. The function returns the
54/// previous timestamp (or None on first call) so the caller can decide
55/// whether to actually invoke the syscall.
56fn ensure_fresh() -> Option<Instant> {
57 let mut guard = LAST_REFRESH
58 .lock()
59 .unwrap_or_else(|poisoned| poisoned.into_inner());
60 let now = Instant::now();
61 let should_refresh = guard
62 .as_ref()
63 .is_none_or(|last| now.duration_since(*last) > Duration::from_secs(1));
64 let prev = guard.as_ref().copied();
65 if should_refresh {
66 *guard = Some(now);
67 }
68 prev
69}
70
#[cfg(test)]
mod tests {
    use super::*;

    /// The CPU count must never be zero, including on the fallback path.
    #[test]
    fn ncpus_is_at_least_one() {
        assert!(ncpus() >= 1);
    }

    /// A load average is never negative.
    #[test]
    fn load_average_is_non_negative() {
        assert!(load_average_one() >= 0.0);
    }

    /// The effective multiplier (env override or the G28-D default of 2)
    /// must be a positive number.
    #[test]
    fn saturation_default_threshold_is_two() {
        // G28-D default: 2 × ncpus. Operators can lower it via env var
        // when running on contended CI runners, including below 1.0, so
        // only positivity is required here.
        let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(2.0);
        assert!(
            multiplier > 0.0,
            "load multiplier must be positive, got {multiplier}"
        );
    }

    /// The saturation check must always return a definitive answer.
    #[test]
    fn saturation_check_does_not_panic() {
        let _ = is_system_saturated();
    }

    /// After any call has run, the next call must report a previous
    /// timestamp.
    #[test]
    fn ensure_fresh_returns_previous_then_sets_new() {
        // Tests share the process-wide timestamp and run in parallel, so
        // this call may or may not be the first one; either way it leaves
        // a timestamp behind.
        let _ = ensure_fresh();
        assert!(ensure_fresh().is_some());
    }
}