Skip to main content

metrics_lib/
system_health.rs

1//! # System Health Monitoring
2//!
3//! Ultra-fast system resource monitoring with process introspection.
4//!
5//! ## Architecture (v0.9.4)
6//!
7//! `SystemHealth` separates state from sampling:
8//!
9//! - All atomic state lives in `HealthInner` behind an `Arc`.
10//! - Reader methods (`cpu_used` / `mem_used_mb` / `health_score` / …) do a
11//!   single `Relaxed` atomic load and return — they never block, never call
12//!   into the OS, never acquire a lock.
13//! - A **background sampler thread**, owned by the `SystemHealth` instance,
14//!   wakes on the configured interval and refreshes the atomics. The thread
15//!   is the only writer; readers see a fresh snapshot every
16//!   `update_interval_ms` (default: 1000 ms).
17//! - `SystemHealth::manual()` constructs an instance with no sampler thread
18//!   for callers who want full control via [`SystemHealth::update`].
19//!
20//! Before 0.9.4, readers called `maybe_update()` which contended on the
21//! sysinfo mutex on non-Linux platforms and stalled async runtimes during
22//! refresh. The new architecture moves that work off the read path entirely.
23//!
24//! ## Features
25//!
26//! - **Process CPU/Memory tracking** — automatic per-process sampling.
27//! - **System-wide monitoring** — CPU, memory, load average.
28//! - **Background refresh** — non-blocking reads regardless of platform.
29//! - **Cross-platform** — `/proc` on Linux, `sysinfo` elsewhere.
30//! - **Zero allocations** on the hot path.
31//! - **Health scoring** — composite 0–100 health score.
32
33use std::io;
34use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
35use std::sync::Arc;
36use std::thread::{self, JoinHandle};
37use std::time::{Duration, Instant};
38
39#[cfg(not(target_os = "linux"))]
40use sysinfo::{get_current_pid, CpuExt, ProcessExt, System, SystemExt};
41
/// Refresh period, in milliseconds, used by [`SystemHealth::new`].
const DEFAULT_INTERVAL_MS: u64 = 1_000;
/// Lower bound, in milliseconds, applied to any non-zero requested interval
/// so that a tiny value cannot turn the sampler into a busy loop.
const MIN_INTERVAL_MS: u64 = 50;
/// Longest single park, in milliseconds, before the sampler looks at its
/// stop flag again; this bounds how long `Drop` can wait on a long interval.
const MAX_SLEEP_CHUNK_MS: u64 = 1_000;
50
/// Cached metric state behind a [`SystemHealth`] handle.
///
/// The value is placed in an `Arc` and handed to the sampler thread, which
/// refreshes it, while accessor methods read it. Every cached metric is an
/// atomic, so reading one never takes a lock.
#[repr(align(64))]
struct HealthInner {
    /// System CPU usage, stored as hundredths of a percent.
    system_cpu: AtomicU32,
    /// Process CPU usage, stored as hundredths of a percent.
    process_cpu: AtomicU32,
    /// Used system memory in MB.
    system_memory_mb: AtomicU64,
    /// Memory used by this process in MB.
    process_memory_mb: AtomicU64,
    /// One-minute load average, stored multiplied by 100.
    load_average: AtomicU32,
    /// Number of threads in this process.
    thread_count: AtomicU32,
    /// Number of open file descriptors in this process.
    fd_count: AtomicU32,
    /// Composite health score in the range 0..=10000 (10000 means 100 %).
    health_score: AtomicU32,
    /// Offset from `created_at`, in milliseconds, of the latest refresh.
    last_update_ms: AtomicU64,
    /// Instant at which this state was constructed; the time base for
    /// `last_update_ms` and for the reported uptime.
    created_at: Instant,
    /// Linux only: `utime + stime` tick total seen by the previous
    /// process-CPU sample.
    #[cfg(target_os = "linux")]
    proc_cpu_prev: AtomicU64,
    /// Linux only: time of the previous process-CPU sample in milliseconds
    /// since `created_at`. `u64::MAX` means no sample has been taken yet.
    #[cfg(target_os = "linux")]
    proc_cpu_prev_ms: AtomicU64,
    /// Non-Linux only: the `sysinfo::System` the sampling functions refresh.
    /// The accessor methods never lock it.
    #[cfg(not(target_os = "linux"))]
    sys: parking_lot::Mutex<System>,
    /// Non-Linux only: this process's pid, or `None` if it could not be
    /// determined at construction time.
    #[cfg(not(target_os = "linux"))]
    pid: Option<sysinfo::Pid>,
}
91
92impl HealthInner {
93    fn new() -> Self {
94        Self {
95            system_cpu: AtomicU32::new(0),
96            process_cpu: AtomicU32::new(0),
97            system_memory_mb: AtomicU64::new(0),
98            process_memory_mb: AtomicU64::new(0),
99            load_average: AtomicU32::new(0),
100            thread_count: AtomicU32::new(0),
101            fd_count: AtomicU32::new(0),
102            health_score: AtomicU32::new(10000),
103            last_update_ms: AtomicU64::new(0),
104            created_at: Instant::now(),
105            #[cfg(target_os = "linux")]
106            proc_cpu_prev: AtomicU64::new(0),
107            #[cfg(target_os = "linux")]
108            proc_cpu_prev_ms: AtomicU64::new(u64::MAX),
109            #[cfg(not(target_os = "linux"))]
110            sys: parking_lot::Mutex::new(System::new()),
111            #[cfg(not(target_os = "linux"))]
112            pid: get_current_pid().ok(),
113        }
114    }
115
116    fn update_metrics(&self) {
117        let now_ms = self.created_at.elapsed().as_millis() as u64;
118
119        if let Ok(cpu) = self.get_system_cpu() {
120            self.system_cpu
121                .store((cpu * 100.0) as u32, Ordering::Relaxed);
122        }
123        if let Ok(memory_mb) = self.get_system_memory_mb() {
124            self.system_memory_mb.store(memory_mb, Ordering::Relaxed);
125        }
126        if let Ok(load) = self.get_load_average() {
127            self.load_average
128                .store((load * 100.0) as u32, Ordering::Relaxed);
129        }
130        if let Ok(cpu) = self.get_process_cpu() {
131            self.process_cpu
132                .store((cpu * 100.0) as u32, Ordering::Relaxed);
133        }
134        if let Ok(memory_mb) = self.get_process_memory_mb() {
135            self.process_memory_mb.store(memory_mb, Ordering::Relaxed);
136        }
137        if let Ok(threads) = self.get_thread_count() {
138            self.thread_count.store(threads, Ordering::Relaxed);
139        }
140        if let Ok(fds) = self.get_fd_count() {
141            self.fd_count.store(fds, Ordering::Relaxed);
142        }
143
144        let health = self.calculate_health_score();
145        self.health_score
146            .store((health * 100.0) as u32, Ordering::Relaxed);
147
148        self.last_update_ms.store(now_ms, Ordering::Relaxed);
149    }
150
151    fn calculate_health_score(&self) -> f64 {
152        let mut score: f64 = 100.0;
153
154        // CPU penalty (system)
155        let system_cpu = self.system_cpu.load(Ordering::Relaxed) as f64 / 100.0;
156        if system_cpu > 80.0 {
157            score -= 30.0;
158        } else if system_cpu > 60.0 {
159            score -= 15.0;
160        } else if system_cpu > 40.0 {
161            score -= 5.0;
162        }
163
164        // Load average penalty
165        let load = self.load_average.load(Ordering::Relaxed) as f64 / 100.0;
166        let cpu_count = num_cpus::get() as f64;
167        if load > cpu_count * 2.0 {
168            score -= 25.0;
169        } else if load > cpu_count * 1.5 {
170            score -= 10.0;
171        } else if load > cpu_count {
172            score -= 5.0;
173        }
174
175        // Process CPU penalty
176        let process_cpu = self.process_cpu.load(Ordering::Relaxed) as f64 / 100.0;
177        if process_cpu > 50.0 {
178            score -= 15.0;
179        } else if process_cpu > 25.0 {
180            score -= 8.0;
181        }
182
183        // Memory pressure (simplified — would need actual available memory)
184        let memory_gb = self.system_memory_mb.load(Ordering::Relaxed) as f64 / 1024.0;
185        if memory_gb > 16.0 {
186            score -= 10.0;
187        } else if memory_gb > 8.0 {
188            score -= 5.0;
189        }
190
191        // Thread count penalty
192        let threads = self.thread_count.load(Ordering::Relaxed);
193        if threads > 1000 {
194            score -= 20.0;
195        } else if threads > 500 {
196            score -= 10.0;
197        } else if threads > 200 {
198            score -= 5.0;
199        }
200
201        // FD count penalty
202        let fds = self.fd_count.load(Ordering::Relaxed);
203        if fds > 10000 {
204            score -= 15.0;
205        } else if fds > 5000 {
206            score -= 8.0;
207        } else if fds > 1000 {
208            score -= 3.0;
209        }
210
211        score.max(0.0)
212    }
213
214    // ----- platform-specific samplers -----
215
216    #[cfg(target_os = "linux")]
217    fn get_system_cpu(&self) -> io::Result<f64> {
218        let contents = std::fs::read_to_string("/proc/stat")?;
219        if let Some(line) = contents.lines().next() {
220            let parts: Vec<&str> = line.split_whitespace().collect();
221            if parts.len() >= 5 && parts[0] == "cpu" {
222                let user: u64 = parts[1].parse().unwrap_or(0);
223                let nice: u64 = parts[2].parse().unwrap_or(0);
224                let system: u64 = parts[3].parse().unwrap_or(0);
225                let idle: u64 = parts[4].parse().unwrap_or(0);
226                let total = user + nice + system + idle;
227                let used = user + nice + system;
228                if total > 0 {
229                    return Ok(used as f64 / total as f64 * 100.0);
230                }
231            }
232        }
233        Ok(0.0)
234    }
235
236    #[cfg(not(target_os = "linux"))]
237    fn get_system_cpu(&self) -> io::Result<f64> {
238        let mut guard = self.sys.lock();
239        guard.refresh_cpu();
240        Ok(guard.global_cpu_info().cpu_usage() as f64)
241    }
242
243    #[cfg(target_os = "linux")]
244    fn get_system_memory_mb(&self) -> io::Result<u64> {
245        let contents = std::fs::read_to_string("/proc/meminfo")?;
246        let mut total_kb = 0u64;
247        let mut free_kb = 0u64;
248        let mut available_kb = 0u64;
249        for line in contents.lines() {
250            if let Some(rest) = line.strip_prefix("MemTotal:") {
251                total_kb = rest
252                    .split_whitespace()
253                    .next()
254                    .and_then(|s| s.parse().ok())
255                    .unwrap_or(0);
256            } else if let Some(rest) = line.strip_prefix("MemFree:") {
257                free_kb = rest
258                    .split_whitespace()
259                    .next()
260                    .and_then(|s| s.parse().ok())
261                    .unwrap_or(0);
262            } else if let Some(rest) = line.strip_prefix("MemAvailable:") {
263                available_kb = rest
264                    .split_whitespace()
265                    .next()
266                    .and_then(|s| s.parse().ok())
267                    .unwrap_or(0);
268            }
269        }
270        let used_kb = if available_kb > 0 {
271            total_kb.saturating_sub(available_kb)
272        } else {
273            total_kb.saturating_sub(free_kb)
274        };
275        Ok(used_kb / 1024)
276    }
277
278    #[cfg(not(target_os = "linux"))]
279    fn get_system_memory_mb(&self) -> io::Result<u64> {
280        let mut guard = self.sys.lock();
281        guard.refresh_memory();
282        let used_kib = guard.used_memory();
283        Ok(used_kib / 1024)
284    }
285
286    #[cfg(target_os = "linux")]
287    fn get_load_average(&self) -> io::Result<f64> {
288        let contents = std::fs::read_to_string("/proc/loadavg")?;
289        if let Some(first) = contents.split_whitespace().next() {
290            return first
291                .parse()
292                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "Invalid load average"));
293        }
294        Ok(0.0)
295    }
296
297    #[cfg(not(target_os = "linux"))]
298    fn get_load_average(&self) -> io::Result<f64> {
299        let guard = self.sys.lock();
300        Ok(guard.load_average().one)
301    }
302
303    #[cfg(target_os = "linux")]
304    fn get_process_cpu(&self) -> io::Result<f64> {
305        // Delta sample: ((utime + stime) - prev) / (CLK_TCK * elapsed_s * cores) * 100.
306        let contents = std::fs::read_to_string("/proc/self/stat")?;
307        let parts: Vec<&str> = contents.split_whitespace().collect();
308        if parts.len() < 15 {
309            return Ok(0.0);
310        }
311        let utime: u64 = parts[13].parse().unwrap_or(0);
312        let stime: u64 = parts[14].parse().unwrap_or(0);
313        let total_ticks = utime.saturating_add(stime);
314        let now_ms = self.created_at.elapsed().as_millis() as u64;
315
316        let prev_ticks = self.proc_cpu_prev.load(Ordering::Relaxed);
317        let prev_ms = self.proc_cpu_prev_ms.load(Ordering::Relaxed);
318        self.proc_cpu_prev.store(total_ticks, Ordering::Relaxed);
319        self.proc_cpu_prev_ms.store(now_ms, Ordering::Relaxed);
320
321        if prev_ms == u64::MAX {
322            return Ok(0.0);
323        }
324        let elapsed_ms = now_ms.saturating_sub(prev_ms);
325        if elapsed_ms == 0 {
326            return Ok(0.0);
327        }
328        let delta_ticks = total_ticks.saturating_sub(prev_ticks) as f64;
329        let clk_tck: f64 = 100.0;
330        let elapsed_s = elapsed_ms as f64 / 1000.0;
331        let cores = num_cpus::get().max(1) as f64;
332        let pct = (delta_ticks / (clk_tck * elapsed_s * cores)) * 100.0;
333        Ok(pct.clamp(0.0, 100.0))
334    }
335
336    #[cfg(not(target_os = "linux"))]
337    fn get_process_cpu(&self) -> io::Result<f64> {
338        let mut guard = self.sys.lock();
339        if let Some(pid) = self.pid {
340            guard.refresh_process(pid);
341            if let Some(proc_) = guard.process(pid) {
342                let raw = proc_.cpu_usage() as f64;
343                let cores = num_cpus::get() as f64;
344                let norm = if cores > 0.0 { raw / cores } else { raw };
345                return Ok(norm.clamp(0.0, 100.0));
346            }
347        }
348        Ok(0.0)
349    }
350
351    #[cfg(target_os = "linux")]
352    fn get_process_memory_mb(&self) -> io::Result<u64> {
353        let contents = std::fs::read_to_string("/proc/self/status")?;
354        for line in contents.lines() {
355            if let Some(rest) = line.strip_prefix("VmRSS:") {
356                if let Some(kb) = rest
357                    .split_whitespace()
358                    .next()
359                    .and_then(|s| s.parse::<u64>().ok())
360                {
361                    return Ok(kb / 1024);
362                }
363            }
364        }
365        Ok(0)
366    }
367
368    #[cfg(not(target_os = "linux"))]
369    fn get_process_memory_mb(&self) -> io::Result<u64> {
370        let mut guard = self.sys.lock();
371        if let Some(pid) = self.pid {
372            guard.refresh_process(pid);
373            if let Some(proc_) = guard.process(pid) {
374                return Ok(proc_.memory() / 1024);
375            }
376        }
377        Ok(0)
378    }
379
380    #[cfg(target_os = "linux")]
381    fn get_thread_count(&self) -> io::Result<u32> {
382        let contents = std::fs::read_to_string("/proc/self/status")?;
383        for line in contents.lines() {
384            if let Some(rest) = line.strip_prefix("Threads:") {
385                if let Some(c) = rest.split_whitespace().next().and_then(|s| s.parse().ok()) {
386                    return Ok(c);
387                }
388            }
389        }
390        Ok(1)
391    }
392
393    #[cfg(not(target_os = "linux"))]
394    fn get_thread_count(&self) -> io::Result<u32> {
395        Ok(1)
396    }
397
398    #[cfg(target_os = "linux")]
399    fn get_fd_count(&self) -> io::Result<u32> {
400        match std::fs::read_dir("/proc/self/fd") {
401            Ok(entries) => Ok(entries.count() as u32),
402            Err(_) => Ok(0),
403        }
404    }
405
406    #[cfg(not(target_os = "linux"))]
407    fn get_fd_count(&self) -> io::Result<u32> {
408        Ok(0)
409    }
410}
411
412/// System health monitor with process introspection.
413///
414/// Owns a background sampler thread (unless constructed via
415/// [`SystemHealth::manual`]) that refreshes the cached values every
416/// `update_interval_ms`. All accessor methods are lock-free atomic loads.
417#[repr(align(64))]
418pub struct SystemHealth {
419    inner: Arc<HealthInner>,
420    /// Lives only for its `Drop` side-effect (stops + joins the sampler
421    /// thread). Prefixed `_` so the `dead_code` lint doesn't flag the
422    /// drop-only field.
423    _sampler: Option<SamplerHandle>,
424    /// Configured interval in milliseconds (0 = manual mode, no sampler).
425    update_interval_ms: u64,
426}
427
/// Owner of one sampler thread; dropping it stops and joins that thread.
struct SamplerHandle {
    /// Flag the sampler loop polls; set to `true` to request shutdown.
    stop: Arc<AtomicBool>,
    /// Join handle of the sampler thread; taken (left `None`) once joined.
    thread: Option<JoinHandle<()>>,
}
433
434impl Drop for SamplerHandle {
435    fn drop(&mut self) {
436        self.stop.store(true, Ordering::Relaxed);
437        if let Some(t) = self.thread.take() {
438            // Wake the sleeper so `Drop` doesn't block until the next tick.
439            t.thread().unpark();
440            let _ = t.join();
441        }
442    }
443}
444
/// Point-in-time copy of every cached metric.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct SystemSnapshot {
    /// System-wide CPU usage in percent (0.0-100.0).
    pub system_cpu_percent: f64,
    /// CPU usage of this process in percent (0.0-100.0).
    pub process_cpu_percent: f64,
    /// Used system memory in MB.
    pub system_memory_mb: u64,
    /// Memory used by this process in MB.
    pub process_memory_mb: u64,
    /// One-minute system load average.
    pub load_average: f64,
    /// Number of threads in this process.
    pub thread_count: u32,
    /// Number of open file descriptors in this process.
    pub fd_count: u32,
    /// Composite health score (0.0-100.0).
    pub health_score: f64,
    /// How long ago the cached values were last refreshed.
    pub last_update: Duration,
}
468
/// Resource usage figures for the current process only.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct ProcessStats {
    /// CPU usage in percent.
    pub cpu_percent: f64,
    /// Memory usage in megabytes.
    pub memory_mb: f64,
    /// Thread count.
    pub threads: u32,
    /// Open file handle count.
    pub file_handles: u32,
    /// Time elapsed since the monitor was created.
    pub uptime: Duration,
}
484
485impl SystemHealth {
486    /// Create a new system health monitor with the default refresh interval
487    /// of 1 second. A background sampler thread is spawned and joined on
488    /// [`Drop`].
489    #[inline]
490    pub fn new() -> Self {
491        Self::with_interval(Duration::from_millis(DEFAULT_INTERVAL_MS))
492    }
493
494    /// Create with a custom refresh interval.
495    ///
496    /// - `interval == Duration::ZERO` ⇒ no sampler thread is spawned;
497    ///   callers must use [`Self::update`] to refresh the cached values
498    ///   (equivalent to [`Self::manual`]).
499    /// - Intervals below `50 ms` are clamped to `50 ms` to prevent the
500    ///   sampler from becoming a CPU spin loop.
501    #[inline]
502    pub fn with_interval(interval: Duration) -> Self {
503        let inner = Arc::new(HealthInner::new());
504        // Always seed the initial snapshot so the first read returns
505        // meaningful values even before the sampler ticks.
506        inner.update_metrics();
507
508        if interval.is_zero() {
509            return Self {
510                inner,
511                _sampler: None,
512                update_interval_ms: 0,
513            };
514        }
515        let interval_ms = (interval.as_millis() as u64).max(MIN_INTERVAL_MS);
516        let sampler = spawn_sampler(inner.clone(), interval_ms);
517        Self {
518            inner,
519            _sampler: Some(sampler),
520            update_interval_ms: interval_ms,
521        }
522    }
523
524    /// Construct a manual-mode instance with no sampler thread. Callers
525    /// must invoke [`Self::update`] to refresh the cached values.
526    #[inline]
527    pub fn manual() -> Self {
528        Self::with_interval(Duration::ZERO)
529    }
530
531    /// Configured refresh interval in milliseconds. `0` indicates manual
532    /// mode (no sampler thread).
533    #[must_use]
534    #[inline]
535    pub fn update_interval_ms(&self) -> u64 {
536        self.update_interval_ms
537    }
538
539    /// Get system CPU usage percentage. Lock-free atomic load.
540    #[inline(always)]
541    pub fn cpu_used(&self) -> f64 {
542        self.inner.system_cpu.load(Ordering::Relaxed) as f64 / 100.0
543    }
544
545    /// Get system CPU free percentage.
546    #[inline]
547    pub fn cpu_free(&self) -> f64 {
548        100.0 - self.cpu_used()
549    }
550
551    /// Get system memory usage in MB. Lock-free atomic load.
552    #[inline(always)]
553    pub fn mem_used_mb(&self) -> f64 {
554        self.inner.system_memory_mb.load(Ordering::Relaxed) as f64
555    }
556
557    /// Get system memory usage in GB.
558    #[inline]
559    pub fn mem_used_gb(&self) -> f64 {
560        self.mem_used_mb() / 1024.0
561    }
562
563    /// Get process CPU usage percentage. Lock-free atomic load.
564    #[inline(always)]
565    pub fn process_cpu_used(&self) -> f64 {
566        self.inner.process_cpu.load(Ordering::Relaxed) as f64 / 100.0
567    }
568
569    /// Get process memory usage in MB. Lock-free atomic load.
570    #[inline(always)]
571    pub fn process_mem_used_mb(&self) -> f64 {
572        self.inner.process_memory_mb.load(Ordering::Relaxed) as f64
573    }
574
575    /// Get system load average. Lock-free atomic load.
576    #[inline(always)]
577    pub fn load_avg(&self) -> f64 {
578        self.inner.load_average.load(Ordering::Relaxed) as f64 / 100.0
579    }
580
581    /// Get process thread count. Lock-free atomic load.
582    #[inline(always)]
583    pub fn thread_count(&self) -> u32 {
584        self.inner.thread_count.load(Ordering::Relaxed)
585    }
586
587    /// Get process file descriptor count. Lock-free atomic load.
588    #[inline(always)]
589    pub fn fd_count(&self) -> u32 {
590        self.inner.fd_count.load(Ordering::Relaxed)
591    }
592
593    /// Get overall system health score (0.0-100.0). Lock-free atomic load.
594    #[inline(always)]
595    pub fn health_score(&self) -> f64 {
596        self.inner.health_score.load(Ordering::Relaxed) as f64 / 100.0
597    }
598
599    /// Quick health check. Lock-free atomic load.
600    #[inline(always)]
601    pub fn quick_check(&self) -> HealthStatus {
602        let score = self.health_score();
603        if score >= 80.0 {
604            HealthStatus::Healthy
605        } else if score >= 60.0 {
606            HealthStatus::Warning
607        } else if score >= 40.0 {
608            HealthStatus::Degraded
609        } else {
610            HealthStatus::Critical
611        }
612    }
613
614    /// Force immediate (synchronous) refresh of every cached metric.
615    ///
616    /// Bypasses the sampler interval — useful for tests, on-demand
617    /// snapshots, or manual-mode operation. Safe to call from any thread.
618    #[inline]
619    pub fn update(&self) {
620        self.inner.update_metrics();
621    }
622
623    /// Get a detailed system snapshot. Lock-free atomic loads.
624    pub fn snapshot(&self) -> SystemSnapshot {
625        let inner = &self.inner;
626        let now_ms = inner.created_at.elapsed().as_millis() as u64;
627        let last_ms = inner.last_update_ms.load(Ordering::Relaxed);
628        let last_update = Duration::from_millis(now_ms.saturating_sub(last_ms));
629
630        SystemSnapshot {
631            system_cpu_percent: inner.system_cpu.load(Ordering::Relaxed) as f64 / 100.0,
632            process_cpu_percent: inner.process_cpu.load(Ordering::Relaxed) as f64 / 100.0,
633            system_memory_mb: inner.system_memory_mb.load(Ordering::Relaxed),
634            process_memory_mb: inner.process_memory_mb.load(Ordering::Relaxed),
635            load_average: inner.load_average.load(Ordering::Relaxed) as f64 / 100.0,
636            thread_count: inner.thread_count.load(Ordering::Relaxed),
637            fd_count: inner.fd_count.load(Ordering::Relaxed),
638            health_score: inner.health_score.load(Ordering::Relaxed) as f64 / 100.0,
639            last_update,
640        }
641    }
642
643    /// Get process-specific statistics. Lock-free atomic loads.
644    pub fn process(&self) -> ProcessStats {
645        let inner = &self.inner;
646        ProcessStats {
647            cpu_percent: inner.process_cpu.load(Ordering::Relaxed) as f64 / 100.0,
648            memory_mb: inner.process_memory_mb.load(Ordering::Relaxed) as f64,
649            threads: inner.thread_count.load(Ordering::Relaxed),
650            file_handles: inner.fd_count.load(Ordering::Relaxed),
651            uptime: inner.created_at.elapsed(),
652        }
653    }
654}
655
656/// Spawn the background sampler thread and return its handle.
657fn spawn_sampler(inner: Arc<HealthInner>, interval_ms: u64) -> SamplerHandle {
658    let stop = Arc::new(AtomicBool::new(false));
659    let stop2 = stop.clone();
660    let thread = thread::Builder::new()
661        .name("metrics-lib-health-sampler".into())
662        .spawn(move || run_sampler(inner, stop2, interval_ms))
663        .expect("spawn metrics-lib sampler thread");
664    SamplerHandle {
665        stop,
666        thread: Some(thread),
667    }
668}
669
670fn run_sampler(inner: Arc<HealthInner>, stop: Arc<AtomicBool>, interval_ms: u64) {
671    while !stop.load(Ordering::Relaxed) {
672        // Park in `MAX_SLEEP_CHUNK_MS` chunks so `Drop` can wake us
673        // promptly via `thread.unpark()` without waiting for the full
674        // configured interval to elapse. Manual ceiling-divide keeps MSRV
675        // 1.70 (`u64::div_ceil` is 1.73+).
676        let chunks = interval_ms.saturating_add(MAX_SLEEP_CHUNK_MS - 1) / MAX_SLEEP_CHUNK_MS;
677        let chunk_ms = interval_ms.min(MAX_SLEEP_CHUNK_MS);
678        for _ in 0..chunks.max(1) {
679            if stop.load(Ordering::Relaxed) {
680                return;
681            }
682            thread::park_timeout(Duration::from_millis(chunk_ms));
683        }
684        if stop.load(Ordering::Relaxed) {
685            return;
686        }
687        inner.update_metrics();
688    }
689}
690
/// Coarse classification of the composite health score.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum HealthStatus {
    /// Score of 80 % or more.
    Healthy,
    /// Score from 60 % up to (not including) 80 %.
    Warning,
    /// Score from 40 % up to (not including) 60 %.
    Degraded,
    /// Score below 40 %.
    Critical,
}
704
705impl HealthStatus {
706    /// `true` if the status indicates degraded or worse.
707    #[inline]
708    pub fn is_degraded(&self) -> bool {
709        matches!(self, Self::Degraded | Self::Critical)
710    }
711
712    /// `true` if the status is healthy.
713    #[inline]
714    pub fn is_healthy(&self) -> bool {
715        matches!(self, Self::Healthy)
716    }
717
718    /// `true` if the status has warnings or worse.
719    #[inline]
720    pub fn has_issues(&self) -> bool {
721        !matches!(self, Self::Healthy)
722    }
723}
724
725impl Default for SystemHealth {
726    fn default() -> Self {
727        Self::new()
728    }
729}
730
731impl std::fmt::Display for SystemHealth {
732    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
733        let snapshot = self.snapshot();
734        write!(
735            f,
736            "SystemHealth(CPU: {:.1}%, Mem: {} MB, Health: {:.1}%)",
737            snapshot.system_cpu_percent, snapshot.system_memory_mb, snapshot.health_score
738        )
739    }
740}
741
742impl std::fmt::Debug for SystemHealth {
743    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
744        let snapshot = self.snapshot();
745        f.debug_struct("SystemHealth")
746            .field("system_cpu", &snapshot.system_cpu_percent)
747            .field("process_cpu", &snapshot.process_cpu_percent)
748            .field("system_memory_mb", &snapshot.system_memory_mb)
749            .field("process_memory_mb", &snapshot.process_memory_mb)
750            .field("load_average", &snapshot.load_average)
751            .field("threads", &snapshot.thread_count)
752            .field("fds", &snapshot.fd_count)
753            .field("health_score", &snapshot.health_score)
754            .field("update_interval_ms", &self.update_interval_ms)
755            .finish()
756    }
757}
758
759impl std::fmt::Display for HealthStatus {
760    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
761        match self {
762            Self::Healthy => write!(f, "Healthy"),
763            Self::Warning => write!(f, "Warning"),
764            Self::Degraded => write!(f, "Degraded"),
765            Self::Critical => write!(f, "Critical"),
766        }
767    }
768}
769
770// Thread safety:
771// `SystemHealth` is composed of `Arc<HealthInner>` (Send + Sync) and an
772// optional `SamplerHandle` (Send + Sync). The compiler derives Send + Sync
773// automatically.
774
775#[cfg(test)]
776mod tests {
777    use super::*;
778    use std::thread;
779
780    #[test]
781    fn test_basic_functionality() {
782        let health = SystemHealth::new();
783        let _cpu = health.cpu_used();
784        let _mem = health.mem_used_mb();
785        let _process_cpu = health.process_cpu_used();
786        let _process_mem = health.process_mem_used_mb();
787        let _load = health.load_avg();
788        let _threads = health.thread_count();
789        let _fds = health.fd_count();
790        let _score = health.health_score();
791
792        let status = health.quick_check();
793        assert!(matches!(
794            status,
795            HealthStatus::Healthy
796                | HealthStatus::Warning
797                | HealthStatus::Degraded
798                | HealthStatus::Critical
799        ));
800    }
801
802    #[test]
803    fn test_cpu_free() {
804        let health = SystemHealth::new();
805        let used = health.cpu_used();
806        let free = health.cpu_free();
807        assert!((used + free - 100.0).abs() < 0.1);
808    }
809
810    #[test]
811    fn test_memory_units() {
812        let health = SystemHealth::new();
813        let mb = health.mem_used_mb();
814        let gb = health.mem_used_gb();
815        if mb > 0.0 {
816            assert!((gb * 1024.0 - mb).abs() < 1.0);
817        }
818    }
819
820    #[test]
821    fn test_snapshot() {
822        let health = SystemHealth::new();
823        let snapshot = health.snapshot();
824        assert!(snapshot.system_cpu_percent >= 0.0);
825        assert!(snapshot.system_cpu_percent <= 100.0);
826        assert!(snapshot.health_score >= 0.0);
827        assert!(snapshot.health_score <= 100.0);
828        assert!(snapshot.thread_count > 0);
829    }
830
831    #[test]
832    fn test_process_stats() {
833        let health = SystemHealth::new();
834        let stats = health.process();
835        assert!(stats.threads > 0);
836        assert!(stats.uptime > Duration::ZERO);
837        assert!(stats.cpu_percent >= 0.0);
838        assert!(stats.memory_mb >= 0.0);
839    }
840
841    #[test]
842    fn test_health_status() {
843        for hs in [
844            HealthStatus::Healthy,
845            HealthStatus::Warning,
846            HealthStatus::Degraded,
847            HealthStatus::Critical,
848        ] {
849            let _ = format!("{hs}");
850        }
851        assert!(HealthStatus::Healthy.is_healthy());
852        assert!(!HealthStatus::Healthy.has_issues());
853        assert!(HealthStatus::Warning.has_issues());
854        assert!(HealthStatus::Degraded.is_degraded());
855        assert!(HealthStatus::Critical.is_degraded());
856    }
857
858    #[test]
859    fn test_custom_interval_floors_to_50ms() {
860        let health = SystemHealth::with_interval(Duration::from_millis(5));
861        assert!(health.update_interval_ms() >= MIN_INTERVAL_MS);
862    }
863
864    #[test]
865    fn test_background_sampler_refreshes_snapshot_after_interval() {
866        // v0.9.4: with a background sampler the snapshot's `last_update`
867        // should bound itself within `interval + slack` even when readers
868        // don't actively trigger refreshes.
869        let health = SystemHealth::with_interval(Duration::from_millis(50));
870        let snap_before = health.snapshot();
871        assert!(snap_before.system_cpu_percent.is_finite());
872
873        thread::sleep(Duration::from_millis(250));
874
875        let snap_after = health.snapshot();
876        assert!(
877            snap_after.last_update <= Duration::from_millis(500),
878            "snapshot.last_update should be 'time since last sampler refresh' \
879             (≤ interval + slack); got {:?}",
880            snap_after.last_update,
881        );
882        assert!(snap_after.system_cpu_percent.is_finite());
883    }
884
885    #[test]
886    fn test_manual_mode_does_not_spawn_sampler() {
887        let health = SystemHealth::manual();
888        assert_eq!(health.update_interval_ms(), 0);
889        // The values should be seeded by the initial in-constructor refresh.
890        let snap = health.snapshot();
891        assert!(snap.system_cpu_percent >= 0.0);
892        // Explicit update still works.
893        health.update();
894    }
895
896    #[test]
897    fn test_force_update() {
898        let health = SystemHealth::new();
899        let score_before = health.health_score();
900        health.update();
901        let score_after = health.health_score();
902        assert!(score_before >= 0.0);
903        assert!(score_after >= 0.0);
904    }
905
906    #[test]
907    fn test_concurrent_access() {
908        let health = std::sync::Arc::new(SystemHealth::new());
909        let mut handles = vec![];
910        for _ in 0..10 {
911            let health_clone = health.clone();
912            let handle = thread::spawn(move || {
913                for _ in 0..100 {
914                    let _cpu = health_clone.cpu_used();
915                    let _mem = health_clone.mem_used_mb();
916                    let _status = health_clone.quick_check();
917                }
918            });
919            handles.push(handle);
920        }
921        for handle in handles {
922            handle.join().unwrap();
923        }
924        let final_score = health.health_score();
925        assert!((0.0..=100.0).contains(&final_score));
926    }
927
928    #[test]
929    fn test_display_formatting() {
930        let health = SystemHealth::new();
931        let display_str = format!("{health}");
932        assert!(display_str.contains("SystemHealth"));
933        assert!(display_str.contains("CPU"));
934        assert!(display_str.contains("Mem"));
935
936        let debug_str = format!("{health:?}");
937        assert!(debug_str.contains("SystemHealth"));
938
939        let status = health.quick_check();
940        let status_str = format!("{status}");
941        assert!(!status_str.is_empty());
942    }
943
944    #[test]
945    fn test_drop_joins_sampler_thread() {
946        // Best-effort: dropping a SystemHealth should not leak the sampler.
947        // We can't directly assert the join, but we exercise the path.
948        let health = SystemHealth::with_interval(Duration::from_millis(50));
949        thread::sleep(Duration::from_millis(75));
950        drop(health);
951        // If `Drop` deadlocked we'd hang here; reaching the next line passes.
952    }
953}
954
/// Throughput smoke tests, compiled only with the `bench-tests` feature and
/// never under coverage tooling (`tarpaulin` / `coverage` cfgs).
///
/// These print a per-operation cost and assert nothing; Criterion is the
/// regression detector. Because the whole module is already gated on
/// `feature = "bench-tests"`, no per-test `cfg_attr(..., ignore)` is needed:
/// inside this module that condition could never be true.
#[cfg(all(test, feature = "bench-tests", not(tarpaulin), not(coverage)))]
#[allow(unused_imports)]
mod benchmarks {
    use super::*;
    use std::hint::black_box;
    use std::time::Instant;

    /// Measures the cost of `quick_check` on an instance with a live sampler.
    #[test]
    fn bench_quick_check() {
        let health = SystemHealth::new();
        let iterations = 1_000_000;
        let start = Instant::now();
        for _ in 0..iterations {
            // `black_box` keeps the optimizer from hoisting or discarding the
            // call whose cost is being measured.
            black_box(black_box(&health).quick_check());
        }
        let elapsed = start.elapsed();
        println!(
            "SystemHealth quick_check: {:.2} ns/op",
            elapsed.as_nanos() as f64 / iterations as f64
        );
        // Throughput-only smoke check; Criterion is the regression detector.
    }

    /// Measures the average cost of the three cached reader methods.
    #[test]
    fn bench_cached_metrics() {
        let health = SystemHealth::new();
        let iterations = 1_000_000;
        let start = Instant::now();
        for _ in 0..iterations {
            // Each result is passed through `black_box` so none of the three
            // reads can be elided as unused.
            let health = black_box(&health);
            black_box(health.cpu_used());
            black_box(health.mem_used_mb());
            black_box(health.health_score());
        }
        let elapsed = start.elapsed();
        // Three reads per iteration, hence the extra division by 3.
        println!(
            "SystemHealth cached metrics: {:.2} ns/op",
            elapsed.as_nanos() as f64 / iterations as f64 / 3.0
        );
        // Throughput-only smoke check; Criterion is the regression detector.
    }

    /// Measures the cost of an explicit refresh on a manual instance, so no
    /// background sampler competes with the measured `update()` calls.
    #[test]
    fn bench_force_update() {
        let health = SystemHealth::manual();
        let iterations = 1000;
        let start = Instant::now();
        for _ in 0..iterations {
            health.update();
        }
        let elapsed = start.elapsed();
        println!(
            "SystemHealth force update: {:.2} μs/op",
            elapsed.as_micros() as f64 / iterations as f64
        );
        // Throughput-only smoke check; Criterion is the regression detector.
    }
}