Skip to main content

metrics_lib/
system_health.rs

1//! # System Health Monitoring
2//!
3//! Ultra-fast system resource monitoring with process introspection.
4//!
5//! ## Architecture (v0.9.4)
6//!
7//! `SystemHealth` separates state from sampling:
8//!
9//! - All atomic state lives in `HealthInner` behind an `Arc`.
10//! - Reader methods (`cpu_used` / `mem_used_mb` / `health_score` / …) do a
11//!   single `Relaxed` atomic load and return — they never block, never call
12//!   into the OS, never acquire a lock.
13//! - A **background sampler thread**, owned by the `SystemHealth` instance,
14//!   wakes on the configured interval and refreshes the atomics. The thread
15//!   is the only writer; readers see a fresh snapshot every
16//!   `update_interval_ms` (default: 1000 ms).
17//! - `SystemHealth::manual()` constructs an instance with no sampler thread
18//!   for callers who want full control via [`SystemHealth::update`].
19//!
20//! Before 0.9.4, readers called `maybe_update()` which contended on the
21//! sysinfo mutex on non-Linux platforms and stalled async runtimes during
22//! refresh. The new architecture moves that work off the read path entirely.
23//!
24//! ## Features
25//!
26//! - **Process CPU/Memory tracking** — automatic per-process sampling.
27//! - **System-wide monitoring** — CPU, memory, load average.
28//! - **Background refresh** — non-blocking reads regardless of platform.
29//! - **Cross-platform** — `/proc` on Linux, `sysinfo` elsewhere.
30//! - **Zero allocations** on the hot path.
31//! - **Health scoring** — composite 0–100 health score.
32
33use std::io;
34use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
35use std::sync::Arc;
36use std::thread::{self, JoinHandle};
37use std::time::{Duration, Instant};
38
39#[cfg(not(target_os = "linux"))]
40use sysinfo::{get_current_pid, CpuExt, ProcessExt, System, SystemExt};
41
/// Sampling period used when no interval is supplied, in milliseconds.
const DEFAULT_INTERVAL_MS: u64 = 1_000;
/// Smallest sampling period the sampler will honor, in milliseconds. Shorter
/// non-zero requests are raised to this floor so the sampler thread cannot
/// degenerate into a busy loop.
const MIN_INTERVAL_MS: u64 = 50;
/// Longest single park of the sampler thread, in milliseconds. Bounding each
/// park means the stop flag is re-checked at least this often, which keeps
/// `Drop` latency bounded even for very long configured intervals.
const MAX_SLEEP_CHUNK_MS: u64 = 1_000;
50
/// One threshold/penalty pair used by [`HealthConfig`].
///
/// A step matches when the sampled metric is **strictly greater than**
/// `threshold`; a value exactly equal to the threshold does not match. On a
/// match, `penalty` is subtracted from the running health score. Steps inside
/// a `&[Step]` slice must be supplied in **descending threshold order**,
/// because evaluation stops at the first match.
#[derive(Debug, Clone, Copy, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct Step {
    /// Exclusive lower bound: the penalty applies only when the metric value
    /// is greater than this number.
    pub threshold: f64,
    /// Amount subtracted from the running health score (0..=100) on match.
    pub penalty: f64,
}

impl Step {
    /// Construct a step from its `threshold` and `penalty`.
    ///
    /// Usable in `const` contexts.
    #[inline]
    #[must_use]
    pub const fn new(threshold: f64, penalty: f64) -> Self {
        Self { threshold, penalty }
    }
}
72
73/// Configurable thresholds for [`SystemHealth::quick_check`] /
74/// [`SystemHealth::health_score`].
75///
76/// Each metric (`system_cpu` / `load_avg` / `process_cpu` / `memory_gb` /
77/// `threads` / `fds`) accepts a list of [`Step`]s. When the metric exceeds a
78/// step's threshold, the step's `penalty` is subtracted from the running
79/// score (starting at 100). The first matching step wins, so list steps in
80/// **descending threshold order**.
81///
82/// The load-average steps are interpreted as multipliers of `num_cpus::get()`
83/// (e.g. `Step::new(2.0, 25.0)` applies when load exceeds `2 × CPU count`).
84///
85/// The defaults in [`HealthConfig::default`] match the v0.9.x behavior
86/// exactly so existing dashboards do not shift unexpectedly.
87///
88/// # Example
89///
90/// ```
91/// use metrics_lib::{HealthConfig, Step, SystemHealth};
92/// use std::time::Duration;
93///
94/// // Stricter CPU thresholds for a CPU-bound workload.
95/// let cfg = HealthConfig {
96///     system_cpu: vec![
97///         Step::new(70.0, 30.0),
98///         Step::new(50.0, 15.0),
99///         Step::new(30.0, 5.0),
100///     ],
101///     ..HealthConfig::default()
102/// };
103/// let health = SystemHealth::with_config(Duration::from_millis(500), cfg);
104/// let _ = health.health_score();
105/// ```
106#[derive(Debug, Clone, PartialEq)]
107#[cfg_attr(feature = "serde", derive(serde::Serialize))]
108pub struct HealthConfig {
109    /// Penalty steps applied to the system-wide CPU percentage (0..=100).
110    pub system_cpu: Vec<Step>,
111    /// Penalty steps applied to the 1-minute load average, expressed as a
112    /// **multiplier of `num_cpus::get()`** (e.g. threshold `2.0` ⇒ trips
113    /// when load > 2× cores).
114    pub load_avg: Vec<Step>,
115    /// Penalty steps applied to the process CPU percentage (0..=100).
116    pub process_cpu: Vec<Step>,
117    /// Penalty steps applied to system memory used, in **gigabytes**.
118    pub memory_gb: Vec<Step>,
119    /// Penalty steps applied to the process thread count.
120    pub threads: Vec<Step>,
121    /// Penalty steps applied to the process file-descriptor count.
122    pub fds: Vec<Step>,
123}
124
125impl Default for HealthConfig {
126    /// Defaults match the v0.9.x scoring exactly. Each `Vec<Step>` is
127    /// ordered descending by threshold so the first match wins.
128    fn default() -> Self {
129        Self {
130            system_cpu: vec![
131                Step::new(80.0, 30.0),
132                Step::new(60.0, 15.0),
133                Step::new(40.0, 5.0),
134            ],
135            load_avg: vec![
136                Step::new(2.0, 25.0),
137                Step::new(1.5, 10.0),
138                Step::new(1.0, 5.0),
139            ],
140            process_cpu: vec![Step::new(50.0, 15.0), Step::new(25.0, 8.0)],
141            memory_gb: vec![Step::new(16.0, 10.0), Step::new(8.0, 5.0)],
142            threads: vec![
143                Step::new(1000.0, 20.0),
144                Step::new(500.0, 10.0),
145                Step::new(200.0, 5.0),
146            ],
147            fds: vec![
148                Step::new(10_000.0, 15.0),
149                Step::new(5_000.0, 8.0),
150                Step::new(1_000.0, 3.0),
151            ],
152        }
153    }
154}
155
156fn apply_steps(value: f64, steps: &[Step]) -> f64 {
157    for step in steps {
158        if value > step.threshold {
159            return step.penalty;
160        }
161    }
162    0.0
163}
164
165/// Mutable state of a [`SystemHealth`] instance.
166///
167/// Shared between the sampler thread (sole writer) and any number of reader
168/// threads via `Arc`. All public fields are atomic so reads never block.
169#[repr(align(64))]
170struct HealthInner {
171    /// Last system CPU usage (percentage * 100).
172    system_cpu: AtomicU32,
173    /// Last process CPU usage (percentage * 100).
174    process_cpu: AtomicU32,
175    /// System memory usage in MB.
176    system_memory_mb: AtomicU64,
177    /// Process memory usage in MB.
178    process_memory_mb: AtomicU64,
179    /// System load average (1 min * 100).
180    load_average: AtomicU32,
181    /// Process thread count.
182    thread_count: AtomicU32,
183    /// Process file descriptor count.
184    fd_count: AtomicU32,
185    /// Overall health score (0-10000, where 10000 = 100%).
186    health_score: AtomicU32,
187    /// Milliseconds since `created_at` at the last metrics refresh.
188    last_update_ms: AtomicU64,
189    /// Creation timestamp (process start, effectively).
190    created_at: Instant,
191    /// Linux-only delta-sample state for process CPU.
192    #[cfg(target_os = "linux")]
193    proc_cpu_prev: AtomicU64,
194    /// Linux-only delta-sample state for process CPU. `u64::MAX` sentinel =
195    /// "no prior sample yet".
196    #[cfg(target_os = "linux")]
197    proc_cpu_prev_ms: AtomicU64,
198    /// Non-Linux: shared `sysinfo::System` used by the sampler thread.
199    /// Readers never touch this mutex — only the sampler does.
200    #[cfg(not(target_os = "linux"))]
201    sys: parking_lot::Mutex<System>,
202    #[cfg(not(target_os = "linux"))]
203    pid: Option<sysinfo::Pid>,
204    /// Tunable health-score thresholds (v0.9.5).
205    config: HealthConfig,
206}
207
208impl HealthInner {
209    fn new(config: HealthConfig) -> Self {
210        Self {
211            system_cpu: AtomicU32::new(0),
212            process_cpu: AtomicU32::new(0),
213            system_memory_mb: AtomicU64::new(0),
214            process_memory_mb: AtomicU64::new(0),
215            load_average: AtomicU32::new(0),
216            thread_count: AtomicU32::new(0),
217            fd_count: AtomicU32::new(0),
218            health_score: AtomicU32::new(10000),
219            last_update_ms: AtomicU64::new(0),
220            created_at: Instant::now(),
221            #[cfg(target_os = "linux")]
222            proc_cpu_prev: AtomicU64::new(0),
223            #[cfg(target_os = "linux")]
224            proc_cpu_prev_ms: AtomicU64::new(u64::MAX),
225            #[cfg(not(target_os = "linux"))]
226            sys: parking_lot::Mutex::new(System::new()),
227            #[cfg(not(target_os = "linux"))]
228            pid: get_current_pid().ok(),
229            config,
230        }
231    }
232
233    fn update_metrics(&self) {
234        let now_ms = self.created_at.elapsed().as_millis() as u64;
235
236        if let Ok(cpu) = self.get_system_cpu() {
237            self.system_cpu
238                .store((cpu * 100.0) as u32, Ordering::Relaxed);
239        }
240        if let Ok(memory_mb) = self.get_system_memory_mb() {
241            self.system_memory_mb.store(memory_mb, Ordering::Relaxed);
242        }
243        if let Ok(load) = self.get_load_average() {
244            self.load_average
245                .store((load * 100.0) as u32, Ordering::Relaxed);
246        }
247        if let Ok(cpu) = self.get_process_cpu() {
248            self.process_cpu
249                .store((cpu * 100.0) as u32, Ordering::Relaxed);
250        }
251        if let Ok(memory_mb) = self.get_process_memory_mb() {
252            self.process_memory_mb.store(memory_mb, Ordering::Relaxed);
253        }
254        if let Ok(threads) = self.get_thread_count() {
255            self.thread_count.store(threads, Ordering::Relaxed);
256        }
257        if let Ok(fds) = self.get_fd_count() {
258            self.fd_count.store(fds, Ordering::Relaxed);
259        }
260
261        let health = self.calculate_health_score();
262        self.health_score
263            .store((health * 100.0) as u32, Ordering::Relaxed);
264
265        self.last_update_ms.store(now_ms, Ordering::Relaxed);
266    }
267
268    fn calculate_health_score(&self) -> f64 {
269        let cfg = &self.config;
270        let cpu_count = num_cpus::get() as f64;
271
272        let system_cpu = self.system_cpu.load(Ordering::Relaxed) as f64 / 100.0;
273        let load_norm =
274            (self.load_average.load(Ordering::Relaxed) as f64 / 100.0) / cpu_count.max(1.0);
275        let process_cpu = self.process_cpu.load(Ordering::Relaxed) as f64 / 100.0;
276        let memory_gb = self.system_memory_mb.load(Ordering::Relaxed) as f64 / 1024.0;
277        let threads = self.thread_count.load(Ordering::Relaxed) as f64;
278        let fds = self.fd_count.load(Ordering::Relaxed) as f64;
279
280        let score = 100.0
281            - apply_steps(system_cpu, &cfg.system_cpu)
282            - apply_steps(load_norm, &cfg.load_avg)
283            - apply_steps(process_cpu, &cfg.process_cpu)
284            - apply_steps(memory_gb, &cfg.memory_gb)
285            - apply_steps(threads, &cfg.threads)
286            - apply_steps(fds, &cfg.fds);
287
288        score.max(0.0)
289    }
290
291    // ----- platform-specific samplers -----
292
293    #[cfg(target_os = "linux")]
294    fn get_system_cpu(&self) -> io::Result<f64> {
295        let contents = std::fs::read_to_string("/proc/stat")?;
296        if let Some(line) = contents.lines().next() {
297            let parts: Vec<&str> = line.split_whitespace().collect();
298            if parts.len() >= 5 && parts[0] == "cpu" {
299                let user: u64 = parts[1].parse().unwrap_or(0);
300                let nice: u64 = parts[2].parse().unwrap_or(0);
301                let system: u64 = parts[3].parse().unwrap_or(0);
302                let idle: u64 = parts[4].parse().unwrap_or(0);
303                let total = user + nice + system + idle;
304                let used = user + nice + system;
305                if total > 0 {
306                    return Ok(used as f64 / total as f64 * 100.0);
307                }
308            }
309        }
310        Ok(0.0)
311    }
312
313    #[cfg(not(target_os = "linux"))]
314    fn get_system_cpu(&self) -> io::Result<f64> {
315        let mut guard = self.sys.lock();
316        guard.refresh_cpu();
317        Ok(guard.global_cpu_info().cpu_usage() as f64)
318    }
319
320    #[cfg(target_os = "linux")]
321    fn get_system_memory_mb(&self) -> io::Result<u64> {
322        let contents = std::fs::read_to_string("/proc/meminfo")?;
323        let mut total_kb = 0u64;
324        let mut free_kb = 0u64;
325        let mut available_kb = 0u64;
326        for line in contents.lines() {
327            if let Some(rest) = line.strip_prefix("MemTotal:") {
328                total_kb = rest
329                    .split_whitespace()
330                    .next()
331                    .and_then(|s| s.parse().ok())
332                    .unwrap_or(0);
333            } else if let Some(rest) = line.strip_prefix("MemFree:") {
334                free_kb = rest
335                    .split_whitespace()
336                    .next()
337                    .and_then(|s| s.parse().ok())
338                    .unwrap_or(0);
339            } else if let Some(rest) = line.strip_prefix("MemAvailable:") {
340                available_kb = rest
341                    .split_whitespace()
342                    .next()
343                    .and_then(|s| s.parse().ok())
344                    .unwrap_or(0);
345            }
346        }
347        let used_kb = if available_kb > 0 {
348            total_kb.saturating_sub(available_kb)
349        } else {
350            total_kb.saturating_sub(free_kb)
351        };
352        Ok(used_kb / 1024)
353    }
354
355    #[cfg(not(target_os = "linux"))]
356    fn get_system_memory_mb(&self) -> io::Result<u64> {
357        let mut guard = self.sys.lock();
358        guard.refresh_memory();
359        let used_kib = guard.used_memory();
360        Ok(used_kib / 1024)
361    }
362
363    #[cfg(target_os = "linux")]
364    fn get_load_average(&self) -> io::Result<f64> {
365        let contents = std::fs::read_to_string("/proc/loadavg")?;
366        if let Some(first) = contents.split_whitespace().next() {
367            return first
368                .parse()
369                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "Invalid load average"));
370        }
371        Ok(0.0)
372    }
373
374    #[cfg(not(target_os = "linux"))]
375    fn get_load_average(&self) -> io::Result<f64> {
376        let guard = self.sys.lock();
377        Ok(guard.load_average().one)
378    }
379
380    #[cfg(target_os = "linux")]
381    fn get_process_cpu(&self) -> io::Result<f64> {
382        // Delta sample: ((utime + stime) - prev) / (CLK_TCK * elapsed_s * cores) * 100.
383        let contents = std::fs::read_to_string("/proc/self/stat")?;
384        let parts: Vec<&str> = contents.split_whitespace().collect();
385        if parts.len() < 15 {
386            return Ok(0.0);
387        }
388        let utime: u64 = parts[13].parse().unwrap_or(0);
389        let stime: u64 = parts[14].parse().unwrap_or(0);
390        let total_ticks = utime.saturating_add(stime);
391        let now_ms = self.created_at.elapsed().as_millis() as u64;
392
393        let prev_ticks = self.proc_cpu_prev.load(Ordering::Relaxed);
394        let prev_ms = self.proc_cpu_prev_ms.load(Ordering::Relaxed);
395        self.proc_cpu_prev.store(total_ticks, Ordering::Relaxed);
396        self.proc_cpu_prev_ms.store(now_ms, Ordering::Relaxed);
397
398        if prev_ms == u64::MAX {
399            return Ok(0.0);
400        }
401        let elapsed_ms = now_ms.saturating_sub(prev_ms);
402        if elapsed_ms == 0 {
403            return Ok(0.0);
404        }
405        let delta_ticks = total_ticks.saturating_sub(prev_ticks) as f64;
406        let clk_tck: f64 = 100.0;
407        let elapsed_s = elapsed_ms as f64 / 1000.0;
408        let cores = num_cpus::get().max(1) as f64;
409        let pct = (delta_ticks / (clk_tck * elapsed_s * cores)) * 100.0;
410        Ok(pct.clamp(0.0, 100.0))
411    }
412
413    #[cfg(not(target_os = "linux"))]
414    fn get_process_cpu(&self) -> io::Result<f64> {
415        let mut guard = self.sys.lock();
416        if let Some(pid) = self.pid {
417            guard.refresh_process(pid);
418            if let Some(proc_) = guard.process(pid) {
419                let raw = proc_.cpu_usage() as f64;
420                let cores = num_cpus::get() as f64;
421                let norm = if cores > 0.0 { raw / cores } else { raw };
422                return Ok(norm.clamp(0.0, 100.0));
423            }
424        }
425        Ok(0.0)
426    }
427
428    #[cfg(target_os = "linux")]
429    fn get_process_memory_mb(&self) -> io::Result<u64> {
430        let contents = std::fs::read_to_string("/proc/self/status")?;
431        for line in contents.lines() {
432            if let Some(rest) = line.strip_prefix("VmRSS:") {
433                if let Some(kb) = rest
434                    .split_whitespace()
435                    .next()
436                    .and_then(|s| s.parse::<u64>().ok())
437                {
438                    return Ok(kb / 1024);
439                }
440            }
441        }
442        Ok(0)
443    }
444
445    #[cfg(not(target_os = "linux"))]
446    fn get_process_memory_mb(&self) -> io::Result<u64> {
447        let mut guard = self.sys.lock();
448        if let Some(pid) = self.pid {
449            guard.refresh_process(pid);
450            if let Some(proc_) = guard.process(pid) {
451                return Ok(proc_.memory() / 1024);
452            }
453        }
454        Ok(0)
455    }
456
457    #[cfg(target_os = "linux")]
458    fn get_thread_count(&self) -> io::Result<u32> {
459        let contents = std::fs::read_to_string("/proc/self/status")?;
460        for line in contents.lines() {
461            if let Some(rest) = line.strip_prefix("Threads:") {
462                if let Some(c) = rest.split_whitespace().next().and_then(|s| s.parse().ok()) {
463                    return Ok(c);
464                }
465            }
466        }
467        Ok(1)
468    }
469
470    #[cfg(not(target_os = "linux"))]
471    fn get_thread_count(&self) -> io::Result<u32> {
472        Ok(1)
473    }
474
475    #[cfg(target_os = "linux")]
476    fn get_fd_count(&self) -> io::Result<u32> {
477        match std::fs::read_dir("/proc/self/fd") {
478            Ok(entries) => Ok(entries.count() as u32),
479            Err(_) => Ok(0),
480        }
481    }
482
483    #[cfg(not(target_os = "linux"))]
484    fn get_fd_count(&self) -> io::Result<u32> {
485        Ok(0)
486    }
487}
488
489/// System health monitor with process introspection.
490///
491/// Owns a background sampler thread (unless constructed via
492/// [`SystemHealth::manual`]) that refreshes the cached values every
493/// `update_interval_ms`. All accessor methods are lock-free atomic loads.
494#[repr(align(64))]
495pub struct SystemHealth {
496    inner: Arc<HealthInner>,
497    /// Lives only for its `Drop` side-effect (stops + joins the sampler
498    /// thread). Prefixed `_` so the `dead_code` lint doesn't flag the
499    /// drop-only field.
500    _sampler: Option<SamplerHandle>,
501    /// Configured interval in milliseconds (0 = manual mode, no sampler).
502    update_interval_ms: u64,
503}
504
/// Owner of one sampler thread. Dropping the handle signals the thread to
/// stop and waits for it to exit.
struct SamplerHandle {
    /// Stop flag shared with the sampler loop.
    stop: Arc<AtomicBool>,
    /// Join handle of the sampler thread; taken (left `None`) during `Drop`.
    thread: Option<JoinHandle<()>>,
}

impl Drop for SamplerHandle {
    fn drop(&mut self) {
        // Raise the flag first so the thread sees it as soon as it wakes.
        self.stop.store(true, Ordering::Relaxed);
        let Some(handle) = self.thread.take() else {
            return;
        };
        // Unpark so the join below does not wait out the current sleep.
        handle.thread().unpark();
        // A panic inside the sampler is deliberately ignored here.
        let _ = handle.join();
    }
}
521
/// Point-in-time copy of every cached metric, as returned by
/// [`SystemHealth::snapshot`].
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct SystemSnapshot {
    /// System-wide CPU usage in percent (0.0-100.0).
    pub system_cpu_percent: f64,
    /// CPU usage of this process in percent (0.0-100.0).
    pub process_cpu_percent: f64,
    /// System memory in use, in MB.
    pub system_memory_mb: u64,
    /// Memory used by this process, in MB.
    pub process_memory_mb: u64,
    /// One-minute system load average.
    pub load_average: f64,
    /// Thread count of this process.
    pub thread_count: u32,
    /// Open file descriptors of this process.
    pub fd_count: u32,
    /// Composite health score (0.0-100.0).
    pub health_score: f64,
    /// Age of the cached values: time elapsed since the last refresh.
    pub last_update: Duration,
}
545
/// Process-specific resource usage, as returned by
/// [`SystemHealth::process`].
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct ProcessStats {
    /// CPU usage percentage
    pub cpu_percent: f64,
    /// Memory usage in megabytes
    pub memory_mb: f64,
    /// Number of threads
    pub threads: u32,
    /// Number of file handles
    pub file_handles: u32,
    /// Time elapsed since the owning [`SystemHealth`] was created. This
    /// equals the process uptime only when the monitor is created at process
    /// start; it is not read from the operating system.
    pub uptime: Duration,
}
561
562impl SystemHealth {
563    /// Create a new system health monitor with the default refresh interval
564    /// of 1 second. A background sampler thread is spawned and joined on
565    /// [`Drop`].
566    #[inline]
567    pub fn new() -> Self {
568        Self::with_interval(Duration::from_millis(DEFAULT_INTERVAL_MS))
569    }
570
571    /// Create with a custom refresh interval. Uses [`HealthConfig::default`]
572    /// for the health-score thresholds.
573    ///
574    /// - `interval == Duration::ZERO` ⇒ no sampler thread is spawned;
575    ///   callers must use [`Self::update`] to refresh the cached values
576    ///   (equivalent to [`Self::manual`]).
577    /// - Intervals below `50 ms` are clamped to `50 ms` to prevent the
578    ///   sampler from becoming a CPU spin loop.
579    #[inline]
580    pub fn with_interval(interval: Duration) -> Self {
581        Self::with_config(interval, HealthConfig::default())
582    }
583
584    /// Create with both a custom refresh interval and a custom
585    /// [`HealthConfig`] (v0.9.5+). See [`Self::with_interval`] for interval
586    /// semantics.
587    pub fn with_config(interval: Duration, config: HealthConfig) -> Self {
588        let inner = Arc::new(HealthInner::new(config));
589        // Always seed the initial snapshot so the first read returns
590        // meaningful values even before the sampler ticks.
591        inner.update_metrics();
592
593        if interval.is_zero() {
594            return Self {
595                inner,
596                _sampler: None,
597                update_interval_ms: 0,
598            };
599        }
600        let interval_ms = (interval.as_millis() as u64).max(MIN_INTERVAL_MS);
601        let sampler = spawn_sampler(inner.clone(), interval_ms);
602        Self {
603            inner,
604            _sampler: Some(sampler),
605            update_interval_ms: interval_ms,
606        }
607    }
608
609    /// Construct a manual-mode instance with no sampler thread. Callers
610    /// must invoke [`Self::update`] to refresh the cached values.
611    #[inline]
612    pub fn manual() -> Self {
613        Self::with_interval(Duration::ZERO)
614    }
615
616    /// Configured refresh interval in milliseconds. `0` indicates manual
617    /// mode (no sampler thread).
618    #[must_use]
619    #[inline]
620    pub fn update_interval_ms(&self) -> u64 {
621        self.update_interval_ms
622    }
623
624    /// Get system CPU usage percentage. Lock-free atomic load.
625    #[inline(always)]
626    pub fn cpu_used(&self) -> f64 {
627        self.inner.system_cpu.load(Ordering::Relaxed) as f64 / 100.0
628    }
629
630    /// Get system CPU free percentage.
631    #[inline]
632    pub fn cpu_free(&self) -> f64 {
633        100.0 - self.cpu_used()
634    }
635
636    /// Get system memory usage in MB. Lock-free atomic load.
637    #[inline(always)]
638    pub fn mem_used_mb(&self) -> f64 {
639        self.inner.system_memory_mb.load(Ordering::Relaxed) as f64
640    }
641
642    /// Get system memory usage in GB.
643    #[inline]
644    pub fn mem_used_gb(&self) -> f64 {
645        self.mem_used_mb() / 1024.0
646    }
647
648    /// Get process CPU usage percentage. Lock-free atomic load.
649    #[inline(always)]
650    pub fn process_cpu_used(&self) -> f64 {
651        self.inner.process_cpu.load(Ordering::Relaxed) as f64 / 100.0
652    }
653
654    /// Get process memory usage in MB. Lock-free atomic load.
655    #[inline(always)]
656    pub fn process_mem_used_mb(&self) -> f64 {
657        self.inner.process_memory_mb.load(Ordering::Relaxed) as f64
658    }
659
660    /// Get system load average. Lock-free atomic load.
661    #[inline(always)]
662    pub fn load_avg(&self) -> f64 {
663        self.inner.load_average.load(Ordering::Relaxed) as f64 / 100.0
664    }
665
666    /// Get process thread count. Lock-free atomic load.
667    #[inline(always)]
668    pub fn thread_count(&self) -> u32 {
669        self.inner.thread_count.load(Ordering::Relaxed)
670    }
671
672    /// Get process file descriptor count. Lock-free atomic load.
673    #[inline(always)]
674    pub fn fd_count(&self) -> u32 {
675        self.inner.fd_count.load(Ordering::Relaxed)
676    }
677
678    /// Get overall system health score (0.0-100.0). Lock-free atomic load.
679    #[inline(always)]
680    pub fn health_score(&self) -> f64 {
681        self.inner.health_score.load(Ordering::Relaxed) as f64 / 100.0
682    }
683
684    /// Quick health check. Lock-free atomic load.
685    #[inline(always)]
686    pub fn quick_check(&self) -> HealthStatus {
687        let score = self.health_score();
688        if score >= 80.0 {
689            HealthStatus::Healthy
690        } else if score >= 60.0 {
691            HealthStatus::Warning
692        } else if score >= 40.0 {
693            HealthStatus::Degraded
694        } else {
695            HealthStatus::Critical
696        }
697    }
698
699    /// Force immediate (synchronous) refresh of every cached metric.
700    ///
701    /// Bypasses the sampler interval — useful for tests, on-demand
702    /// snapshots, or manual-mode operation. Safe to call from any thread.
703    #[inline]
704    pub fn update(&self) {
705        self.inner.update_metrics();
706    }
707
708    /// Get a detailed system snapshot. Lock-free atomic loads.
709    pub fn snapshot(&self) -> SystemSnapshot {
710        let inner = &self.inner;
711        let now_ms = inner.created_at.elapsed().as_millis() as u64;
712        let last_ms = inner.last_update_ms.load(Ordering::Relaxed);
713        let last_update = Duration::from_millis(now_ms.saturating_sub(last_ms));
714
715        SystemSnapshot {
716            system_cpu_percent: inner.system_cpu.load(Ordering::Relaxed) as f64 / 100.0,
717            process_cpu_percent: inner.process_cpu.load(Ordering::Relaxed) as f64 / 100.0,
718            system_memory_mb: inner.system_memory_mb.load(Ordering::Relaxed),
719            process_memory_mb: inner.process_memory_mb.load(Ordering::Relaxed),
720            load_average: inner.load_average.load(Ordering::Relaxed) as f64 / 100.0,
721            thread_count: inner.thread_count.load(Ordering::Relaxed),
722            fd_count: inner.fd_count.load(Ordering::Relaxed),
723            health_score: inner.health_score.load(Ordering::Relaxed) as f64 / 100.0,
724            last_update,
725        }
726    }
727
728    /// Get process-specific statistics. Lock-free atomic loads.
729    pub fn process(&self) -> ProcessStats {
730        let inner = &self.inner;
731        ProcessStats {
732            cpu_percent: inner.process_cpu.load(Ordering::Relaxed) as f64 / 100.0,
733            memory_mb: inner.process_memory_mb.load(Ordering::Relaxed) as f64,
734            threads: inner.thread_count.load(Ordering::Relaxed),
735            file_handles: inner.fd_count.load(Ordering::Relaxed),
736            uptime: inner.created_at.elapsed(),
737        }
738    }
739}
740
741/// Spawn the background sampler thread and return its handle.
742fn spawn_sampler(inner: Arc<HealthInner>, interval_ms: u64) -> SamplerHandle {
743    let stop = Arc::new(AtomicBool::new(false));
744    let stop2 = stop.clone();
745    let thread = thread::Builder::new()
746        .name("metrics-lib-health-sampler".into())
747        .spawn(move || run_sampler(inner, stop2, interval_ms))
748        .expect("spawn metrics-lib sampler thread");
749    SamplerHandle {
750        stop,
751        thread: Some(thread),
752    }
753}
754
755fn run_sampler(inner: Arc<HealthInner>, stop: Arc<AtomicBool>, interval_ms: u64) {
756    while !stop.load(Ordering::Relaxed) {
757        // Park in `MAX_SLEEP_CHUNK_MS` chunks so `Drop` can wake us
758        // promptly via `thread.unpark()` without waiting for the full
759        // configured interval to elapse. Manual ceiling-divide keeps MSRV
760        // 1.70 (`u64::div_ceil` is 1.73+).
761        let chunks = interval_ms.saturating_add(MAX_SLEEP_CHUNK_MS - 1) / MAX_SLEEP_CHUNK_MS;
762        let chunk_ms = interval_ms.min(MAX_SLEEP_CHUNK_MS);
763        for _ in 0..chunks.max(1) {
764            if stop.load(Ordering::Relaxed) {
765                return;
766            }
767            thread::park_timeout(Duration::from_millis(chunk_ms));
768        }
769        if stop.load(Ordering::Relaxed) {
770            return;
771        }
772        inner.update_metrics();
773    }
774}
775
/// Coarse health bucket derived from the composite health score.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum HealthStatus {
    /// Score of 80% or more.
    Healthy,
    /// Score in the 60-80% band.
    Warning,
    /// Score in the 40-60% band.
    Degraded,
    /// Score below 40%.
    Critical,
}

impl HealthStatus {
    /// `true` for [`Self::Degraded`] and [`Self::Critical`].
    #[inline]
    pub fn is_degraded(&self) -> bool {
        match self {
            Self::Degraded | Self::Critical => true,
            Self::Healthy | Self::Warning => false,
        }
    }

    /// `true` only for [`Self::Healthy`].
    #[inline]
    pub fn is_healthy(&self) -> bool {
        *self == Self::Healthy
    }

    /// `true` for every status other than [`Self::Healthy`].
    #[inline]
    pub fn has_issues(&self) -> bool {
        !self.is_healthy()
    }
}
809
810impl Default for SystemHealth {
811    fn default() -> Self {
812        Self::new()
813    }
814}
815
816impl std::fmt::Display for SystemHealth {
817    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
818        let snapshot = self.snapshot();
819        write!(
820            f,
821            "SystemHealth(CPU: {:.1}%, Mem: {} MB, Health: {:.1}%)",
822            snapshot.system_cpu_percent, snapshot.system_memory_mb, snapshot.health_score
823        )
824    }
825}
826
827impl std::fmt::Debug for SystemHealth {
828    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
829        let snapshot = self.snapshot();
830        f.debug_struct("SystemHealth")
831            .field("system_cpu", &snapshot.system_cpu_percent)
832            .field("process_cpu", &snapshot.process_cpu_percent)
833            .field("system_memory_mb", &snapshot.system_memory_mb)
834            .field("process_memory_mb", &snapshot.process_memory_mb)
835            .field("load_average", &snapshot.load_average)
836            .field("threads", &snapshot.thread_count)
837            .field("fds", &snapshot.fd_count)
838            .field("health_score", &snapshot.health_score)
839            .field("update_interval_ms", &self.update_interval_ms)
840            .finish()
841    }
842}
843
844impl std::fmt::Display for HealthStatus {
845    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
846        match self {
847            Self::Healthy => write!(f, "Healthy"),
848            Self::Warning => write!(f, "Warning"),
849            Self::Degraded => write!(f, "Degraded"),
850            Self::Critical => write!(f, "Critical"),
851        }
852    }
853}
854
// Thread safety:
// `SystemHealth` is composed of `Arc<HealthInner>` (Send + Sync) and an
// optional `SamplerHandle` (Send + Sync). `Send` and `Sync` are auto traits,
// so the compiler implements both for `SystemHealth` automatically; no
// manual `unsafe impl` is needed.
859
860#[cfg(test)]
861mod tests {
862    use super::*;
863    use std::thread;
864
865    #[test]
866    fn test_basic_functionality() {
867        let health = SystemHealth::new();
868        let _cpu = health.cpu_used();
869        let _mem = health.mem_used_mb();
870        let _process_cpu = health.process_cpu_used();
871        let _process_mem = health.process_mem_used_mb();
872        let _load = health.load_avg();
873        let _threads = health.thread_count();
874        let _fds = health.fd_count();
875        let _score = health.health_score();
876
877        let status = health.quick_check();
878        assert!(matches!(
879            status,
880            HealthStatus::Healthy
881                | HealthStatus::Warning
882                | HealthStatus::Degraded
883                | HealthStatus::Critical
884        ));
885    }
886
887    #[test]
888    fn test_cpu_free() {
889        let health = SystemHealth::new();
890        let used = health.cpu_used();
891        let free = health.cpu_free();
892        assert!((used + free - 100.0).abs() < 0.1);
893    }
894
895    #[test]
896    fn test_memory_units() {
897        let health = SystemHealth::new();
898        let mb = health.mem_used_mb();
899        let gb = health.mem_used_gb();
900        if mb > 0.0 {
901            assert!((gb * 1024.0 - mb).abs() < 1.0);
902        }
903    }
904
905    #[test]
906    fn test_snapshot() {
907        let health = SystemHealth::new();
908        let snapshot = health.snapshot();
909        assert!(snapshot.system_cpu_percent >= 0.0);
910        assert!(snapshot.system_cpu_percent <= 100.0);
911        assert!(snapshot.health_score >= 0.0);
912        assert!(snapshot.health_score <= 100.0);
913        assert!(snapshot.thread_count > 0);
914    }
915
916    #[test]
917    fn test_process_stats() {
918        let health = SystemHealth::new();
919        let stats = health.process();
920        assert!(stats.threads > 0);
921        assert!(stats.uptime > Duration::ZERO);
922        assert!(stats.cpu_percent >= 0.0);
923        assert!(stats.memory_mb >= 0.0);
924    }
925
926    #[test]
927    fn test_health_status() {
928        for hs in [
929            HealthStatus::Healthy,
930            HealthStatus::Warning,
931            HealthStatus::Degraded,
932            HealthStatus::Critical,
933        ] {
934            let _ = format!("{hs}");
935        }
936        assert!(HealthStatus::Healthy.is_healthy());
937        assert!(!HealthStatus::Healthy.has_issues());
938        assert!(HealthStatus::Warning.has_issues());
939        assert!(HealthStatus::Degraded.is_degraded());
940        assert!(HealthStatus::Critical.is_degraded());
941    }
942
943    #[test]
944    fn test_custom_interval_floors_to_50ms() {
945        let health = SystemHealth::with_interval(Duration::from_millis(5));
946        assert!(health.update_interval_ms() >= MIN_INTERVAL_MS);
947    }
948
949    #[test]
950    fn test_background_sampler_refreshes_snapshot_after_interval() {
951        // v0.9.4: with a background sampler the snapshot's `last_update`
952        // should bound itself within `interval + slack` even when readers
953        // don't actively trigger refreshes.
954        let health = SystemHealth::with_interval(Duration::from_millis(50));
955        let snap_before = health.snapshot();
956        assert!(snap_before.system_cpu_percent.is_finite());
957
958        thread::sleep(Duration::from_millis(250));
959
960        let snap_after = health.snapshot();
961        assert!(
962            snap_after.last_update <= Duration::from_millis(500),
963            "snapshot.last_update should be 'time since last sampler refresh' \
964             (≤ interval + slack); got {:?}",
965            snap_after.last_update,
966        );
967        assert!(snap_after.system_cpu_percent.is_finite());
968    }
969
970    #[test]
971    fn test_manual_mode_does_not_spawn_sampler() {
972        let health = SystemHealth::manual();
973        assert_eq!(health.update_interval_ms(), 0);
974        // The values should be seeded by the initial in-constructor refresh.
975        let snap = health.snapshot();
976        assert!(snap.system_cpu_percent >= 0.0);
977        // Explicit update still works.
978        health.update();
979    }
980
981    #[test]
982    fn test_force_update() {
983        let health = SystemHealth::new();
984        let score_before = health.health_score();
985        health.update();
986        let score_after = health.health_score();
987        assert!(score_before >= 0.0);
988        assert!(score_after >= 0.0);
989    }
990
991    #[test]
992    fn test_concurrent_access() {
993        let health = std::sync::Arc::new(SystemHealth::new());
994        let mut handles = vec![];
995        for _ in 0..10 {
996            let health_clone = health.clone();
997            let handle = thread::spawn(move || {
998                for _ in 0..100 {
999                    let _cpu = health_clone.cpu_used();
1000                    let _mem = health_clone.mem_used_mb();
1001                    let _status = health_clone.quick_check();
1002                }
1003            });
1004            handles.push(handle);
1005        }
1006        for handle in handles {
1007            handle.join().unwrap();
1008        }
1009        let final_score = health.health_score();
1010        assert!((0.0..=100.0).contains(&final_score));
1011    }
1012
1013    #[test]
1014    fn test_display_formatting() {
1015        let health = SystemHealth::new();
1016        let display_str = format!("{health}");
1017        assert!(display_str.contains("SystemHealth"));
1018        assert!(display_str.contains("CPU"));
1019        assert!(display_str.contains("Mem"));
1020
1021        let debug_str = format!("{health:?}");
1022        assert!(debug_str.contains("SystemHealth"));
1023
1024        let status = health.quick_check();
1025        let status_str = format!("{status}");
1026        assert!(!status_str.is_empty());
1027    }
1028
1029    #[test]
1030    fn test_drop_joins_sampler_thread() {
1031        // Best-effort: dropping a SystemHealth should not leak the sampler.
1032        // We can't directly assert the join, but we exercise the path.
1033        let health = SystemHealth::with_interval(Duration::from_millis(50));
1034        thread::sleep(Duration::from_millis(75));
1035        drop(health);
1036        // If `Drop` deadlocked we'd hang here; reaching the next line passes.
1037    }
1038}
1039
1040#[cfg(all(test, feature = "bench-tests", not(tarpaulin), not(coverage)))]
1041#[allow(unused_imports)]
1042mod benchmarks {
1043    use super::*;
1044    use std::time::Instant;
1045
1046    #[cfg_attr(not(feature = "bench-tests"), ignore)]
1047    #[test]
1048    fn bench_quick_check() {
1049        let health = SystemHealth::new();
1050        let iterations = 1_000_000;
1051        let start = Instant::now();
1052        for _ in 0..iterations {
1053            let _ = health.quick_check();
1054        }
1055        let elapsed = start.elapsed();
1056        println!(
1057            "SystemHealth quick_check: {:.2} ns/op",
1058            elapsed.as_nanos() as f64 / iterations as f64
1059        );
1060        // Throughput-only smoke check; Criterion is the regression detector.
1061    }
1062
1063    #[cfg_attr(not(feature = "bench-tests"), ignore)]
1064    #[test]
1065    fn bench_cached_metrics() {
1066        let health = SystemHealth::new();
1067        let iterations = 1_000_000;
1068        let start = Instant::now();
1069        for _ in 0..iterations {
1070            let _ = health.cpu_used();
1071            let _ = health.mem_used_mb();
1072            let _ = health.health_score();
1073        }
1074        let elapsed = start.elapsed();
1075        println!(
1076            "SystemHealth cached metrics: {:.2} ns/op",
1077            elapsed.as_nanos() as f64 / iterations as f64 / 3.0
1078        );
1079        // Throughput-only smoke check; Criterion is the regression detector.
1080    }
1081
1082    #[cfg_attr(not(feature = "bench-tests"), ignore)]
1083    #[test]
1084    fn bench_force_update() {
1085        let health = SystemHealth::manual();
1086        let iterations = 1000;
1087        let start = Instant::now();
1088        for _ in 0..iterations {
1089            health.update();
1090        }
1091        let elapsed = start.elapsed();
1092        println!(
1093            "SystemHealth force update: {:.2} μs/op",
1094            elapsed.as_micros() as f64 / iterations as f64
1095        );
1096        // Throughput-only smoke check; Criterion is the regression detector.
1097    }
1098}