Skip to main content

metrics_lib/
system_health.rs

1//! # System Health Monitoring
2//!
3//! Ultra-fast system resource monitoring with process introspection.
4//!
5//! ## Features
6//!
7//! - **Process CPU/Memory tracking** - Automatic detection of current app usage
8//! - **System-wide monitoring** - CPU, memory, load average
9//! - **Sub-millisecond updates** - Fast health checks
10//! - **Cross-platform** - Works on Linux, macOS, Windows
//! - **Allocation-free cached reads** - Between refreshes, getters only perform atomic loads
12//! - **Health scoring** - Intelligent system health assessment
13
14use std::io;
15use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
16use std::time::{Duration, Instant};
17
18#[cfg(not(target_os = "linux"))]
19use sysinfo::{get_current_pid, CpuExt, ProcessExt, System, SystemExt};
20
/// Cached view of system-wide and per-process resource usage.
///
/// Every metric is held in an atomic field: getters read it with a relaxed
/// load and `update_metrics` rewrites it. The struct is aligned to 64 bytes
/// through `repr(align(64))`.
#[repr(align(64))]
pub struct SystemHealth {
    /// Most recent system CPU usage, stored as percent × 100.
    system_cpu: AtomicU32,
    /// Most recent process CPU usage, stored as percent × 100.
    process_cpu: AtomicU32,
    /// System memory in use, in MB.
    system_memory_mb: AtomicU64,
    /// Memory used by this process, in MB.
    process_memory_mb: AtomicU64,
    /// One-minute system load average, stored as value × 100.
    load_average: AtomicU32,
    /// Number of threads in this process.
    thread_count: AtomicU32,
    /// Number of file descriptors held by this process.
    fd_count: AtomicU32,
    /// Overall health score in the range 0–10000 (10000 = 100%).
    health_score: AtomicU32,
    /// Milliseconds elapsed since `created_at` when metrics were last refreshed.
    ///
    /// Kept in milliseconds so the throttle in [`Self::maybe_update`] compares
    /// it directly with `update_interval_ms`. An earlier revision stored
    /// nanoseconds here, which made that comparison never fire and froze
    /// refreshes.
    last_update_ms: AtomicU64,
    /// Minimum time between automatic refreshes, in milliseconds.
    update_interval_ms: u64,
    /// Moment this monitor was created; the origin for all `*_ms` values.
    created_at: Instant,
    /// Linux only: cumulative `utime + stime` clock ticks recorded by the
    /// previous process-CPU sample.
    #[cfg(target_os = "linux")]
    proc_cpu_prev: AtomicU64,
    /// Linux only: milliseconds since `created_at` at the previous process-CPU
    /// sample. `u64::MAX` is the sentinel for "no prior sample yet".
    #[cfg(target_os = "linux")]
    proc_cpu_prev_ms: AtomicU64,
    /// Non-Linux only: `sysinfo` handle, guarded by a mutex.
    #[cfg(not(target_os = "linux"))]
    sys: parking_lot::Mutex<System>,
    /// Non-Linux only: id of the current process, `None` if it could not be
    /// determined at construction time.
    #[cfg(not(target_os = "linux"))]
    pid: Option<sysinfo::Pid>,
}
65
/// Point-in-time copy of every metric tracked by [`SystemHealth`].
#[derive(Clone, Debug)]
pub struct SystemSnapshot {
    /// System-wide CPU usage as a percentage (0.0-100.0).
    pub system_cpu_percent: f64,
    /// CPU usage of this process as a percentage (0.0-100.0).
    pub process_cpu_percent: f64,
    /// System memory in use, in MB.
    pub system_memory_mb: u64,
    /// Memory used by this process, in MB.
    pub process_memory_mb: u64,
    /// One-minute system load average.
    pub load_average: f64,
    /// Thread count of this process.
    pub thread_count: u32,
    /// File descriptor count of this process.
    pub fd_count: u32,
    /// Overall health score as a percentage (0.0-100.0).
    pub health_score: f64,
    /// How long ago the metrics were last refreshed.
    pub last_update: Duration,
}
88
/// Resource usage of the current process, as reported by [`SystemHealth::process`].
#[derive(Clone, Debug)]
pub struct ProcessStats {
    /// CPU usage as a percentage.
    pub cpu_percent: f64,
    /// Memory usage in megabytes.
    pub memory_mb: f64,
    /// Thread count.
    pub threads: u32,
    /// Open file handle count.
    pub file_handles: u32,
    /// Time elapsed since the monitor was created.
    pub uptime: Duration,
}
103
104impl SystemHealth {
105    /// Create new system health monitor
106    #[inline]
107    pub fn new() -> Self {
108        let instance = Self {
109            system_cpu: AtomicU32::new(0),
110            process_cpu: AtomicU32::new(0),
111            system_memory_mb: AtomicU64::new(0),
112            process_memory_mb: AtomicU64::new(0),
113            load_average: AtomicU32::new(0),
114            thread_count: AtomicU32::new(0),
115            fd_count: AtomicU32::new(0),
116            health_score: AtomicU32::new(10000), // Start with perfect health
117            last_update_ms: AtomicU64::new(0),
118            update_interval_ms: 1000, // 1 second default
119            created_at: Instant::now(),
120            #[cfg(target_os = "linux")]
121            proc_cpu_prev: std::sync::atomic::AtomicU64::new(0),
122            #[cfg(target_os = "linux")]
123            proc_cpu_prev_ms: std::sync::atomic::AtomicU64::new(u64::MAX),
124            #[cfg(not(target_os = "linux"))]
125            sys: parking_lot::Mutex::new(System::new()),
126            #[cfg(not(target_os = "linux"))]
127            pid: get_current_pid().ok(),
128        };
129
130        // Do initial update
131        instance.update_metrics();
132        instance
133    }
134
135    /// Create with custom update interval
136    #[inline]
137    pub fn with_interval(interval: Duration) -> Self {
138        let mut instance = Self::new();
139        instance.update_interval_ms = interval.as_millis() as u64;
140        instance
141    }
142
143    /// Get system CPU usage percentage - SIMPLE AF API
144    #[inline]
145    pub fn cpu_used(&self) -> f64 {
146        self.maybe_update();
147        self.system_cpu.load(Ordering::Relaxed) as f64 / 100.0
148    }
149
150    /// Get system CPU free percentage
151    #[inline]
152    pub fn cpu_free(&self) -> f64 {
153        100.0 - self.cpu_used()
154    }
155
156    /// Get system memory usage in MB
157    #[inline]
158    pub fn mem_used_mb(&self) -> f64 {
159        self.maybe_update();
160        self.system_memory_mb.load(Ordering::Relaxed) as f64
161    }
162
163    /// Get system memory usage in GB
164    #[inline]
165    pub fn mem_used_gb(&self) -> f64 {
166        self.mem_used_mb() / 1024.0
167    }
168
169    /// Get process CPU usage percentage
170    #[inline]
171    pub fn process_cpu_used(&self) -> f64 {
172        self.maybe_update();
173        self.process_cpu.load(Ordering::Relaxed) as f64 / 100.0
174    }
175
176    /// Get process memory usage in MB
177    #[inline]
178    pub fn process_mem_used_mb(&self) -> f64 {
179        self.maybe_update();
180        self.process_memory_mb.load(Ordering::Relaxed) as f64
181    }
182
183    /// Get system load average
184    #[inline]
185    pub fn load_avg(&self) -> f64 {
186        self.maybe_update();
187        self.load_average.load(Ordering::Relaxed) as f64 / 100.0
188    }
189
190    /// Get process thread count
191    #[inline]
192    pub fn thread_count(&self) -> u32 {
193        self.maybe_update();
194        self.thread_count.load(Ordering::Relaxed)
195    }
196
197    /// Get process file descriptor count
198    #[inline]
199    pub fn fd_count(&self) -> u32 {
200        self.maybe_update();
201        self.fd_count.load(Ordering::Relaxed)
202    }
203
204    /// Get overall system health score (0.0-100.0)
205    #[inline]
206    pub fn health_score(&self) -> f64 {
207        self.maybe_update();
208        self.health_score.load(Ordering::Relaxed) as f64 / 100.0
209    }
210
211    /// Quick health check - sub-microsecond if cached
212    #[inline(always)]
213    pub fn quick_check(&self) -> HealthStatus {
214        let score = self.health_score();
215
216        if score >= 80.0 {
217            HealthStatus::Healthy
218        } else if score >= 60.0 {
219            HealthStatus::Warning
220        } else if score >= 40.0 {
221            HealthStatus::Degraded
222        } else {
223            HealthStatus::Critical
224        }
225    }
226
227    /// Force immediate update of all metrics
228    #[inline]
229    pub fn update(&self) {
230        self.update_metrics();
231    }
232
233    /// Get detailed system snapshot
234    pub fn snapshot(&self) -> SystemSnapshot {
235        self.maybe_update();
236
237        // Report **time since last update** (not "monotonic ms at last update").
238        let now_ms = self.created_at.elapsed().as_millis() as u64;
239        let last_ms = self.last_update_ms.load(Ordering::Relaxed);
240        let last_update = Duration::from_millis(now_ms.saturating_sub(last_ms));
241
242        SystemSnapshot {
243            system_cpu_percent: self.system_cpu.load(Ordering::Relaxed) as f64 / 100.0,
244            process_cpu_percent: self.process_cpu.load(Ordering::Relaxed) as f64 / 100.0,
245            system_memory_mb: self.system_memory_mb.load(Ordering::Relaxed),
246            process_memory_mb: self.process_memory_mb.load(Ordering::Relaxed),
247            load_average: self.load_average.load(Ordering::Relaxed) as f64 / 100.0,
248            thread_count: self.thread_count.load(Ordering::Relaxed),
249            fd_count: self.fd_count.load(Ordering::Relaxed),
250            health_score: self.health_score.load(Ordering::Relaxed) as f64 / 100.0,
251            last_update,
252        }
253    }
254
255    /// Get process-specific statistics
256    pub fn process(&self) -> ProcessStats {
257        self.maybe_update();
258
259        ProcessStats {
260            cpu_percent: self.process_cpu.load(Ordering::Relaxed) as f64 / 100.0,
261            memory_mb: self.process_memory_mb.load(Ordering::Relaxed) as f64,
262            threads: self.thread_count.load(Ordering::Relaxed),
263            file_handles: self.fd_count.load(Ordering::Relaxed),
264            uptime: self.created_at.elapsed(),
265        }
266    }
267
268    // Internal implementation
269
270    #[inline]
271    fn maybe_update(&self) {
272        let now_ms = self.created_at.elapsed().as_millis() as u64;
273        let last_ms = self.last_update_ms.load(Ordering::Relaxed);
274
275        if now_ms.saturating_sub(last_ms) > self.update_interval_ms {
276            self.update_metrics();
277        }
278    }
279
280    fn update_metrics(&self) {
281        let now_ms = self.created_at.elapsed().as_millis() as u64;
282
283        // Update system metrics
284        if let Ok(cpu) = self.get_system_cpu() {
285            self.system_cpu
286                .store((cpu * 100.0) as u32, Ordering::Relaxed);
287        }
288
289        if let Ok(memory_mb) = self.get_system_memory_mb() {
290            self.system_memory_mb.store(memory_mb, Ordering::Relaxed);
291        }
292
293        if let Ok(load) = self.get_load_average() {
294            self.load_average
295                .store((load * 100.0) as u32, Ordering::Relaxed);
296        }
297
298        // Update process metrics
299        if let Ok(cpu) = self.get_process_cpu() {
300            self.process_cpu
301                .store((cpu * 100.0) as u32, Ordering::Relaxed);
302        }
303
304        if let Ok(memory_mb) = self.get_process_memory_mb() {
305            self.process_memory_mb.store(memory_mb, Ordering::Relaxed);
306        }
307
308        if let Ok(threads) = self.get_thread_count() {
309            self.thread_count.store(threads, Ordering::Relaxed);
310        }
311
312        if let Ok(fds) = self.get_fd_count() {
313            self.fd_count.store(fds, Ordering::Relaxed);
314        }
315
316        // Calculate health score
317        let health = self.calculate_health_score();
318        self.health_score
319            .store((health * 100.0) as u32, Ordering::Relaxed);
320
321        self.last_update_ms.store(now_ms, Ordering::Relaxed);
322    }
323
324    fn calculate_health_score(&self) -> f64 {
325        let mut score: f64 = 100.0;
326
327        // CPU penalty (system)
328        let system_cpu = self.system_cpu.load(Ordering::Relaxed) as f64 / 100.0;
329        if system_cpu > 80.0 {
330            score -= 30.0; // Heavy penalty for high CPU
331        } else if system_cpu > 60.0 {
332            score -= 15.0;
333        } else if system_cpu > 40.0 {
334            score -= 5.0;
335        }
336
337        // Load average penalty
338        let load = self.load_average.load(Ordering::Relaxed) as f64 / 100.0;
339        let cpu_count = num_cpus::get() as f64;
340        if load > cpu_count * 2.0 {
341            score -= 25.0;
342        } else if load > cpu_count * 1.5 {
343            score -= 10.0;
344        } else if load > cpu_count {
345            score -= 5.0;
346        }
347
348        // Process CPU penalty
349        let process_cpu = self.process_cpu.load(Ordering::Relaxed) as f64 / 100.0;
350        if process_cpu > 50.0 {
351            score -= 15.0;
352        } else if process_cpu > 25.0 {
353            score -= 8.0;
354        }
355
356        // Memory pressure (simplified - would need actual available memory)
357        let memory_gb = self.system_memory_mb.load(Ordering::Relaxed) as f64 / 1024.0;
358        if memory_gb > 16.0 {
359            // Assuming this is high usage
360            score -= 10.0;
361        } else if memory_gb > 8.0 {
362            score -= 5.0;
363        }
364
365        // Thread count penalty (too many threads can indicate issues)
366        let threads = self.thread_count.load(Ordering::Relaxed);
367        if threads > 1000 {
368            score -= 20.0;
369        } else if threads > 500 {
370            score -= 10.0;
371        } else if threads > 200 {
372            score -= 5.0;
373        }
374
375        // File descriptor penalty
376        let fds = self.fd_count.load(Ordering::Relaxed);
377        if fds > 10000 {
378            score -= 15.0;
379        } else if fds > 5000 {
380            score -= 8.0;
381        } else if fds > 1000 {
382            score -= 3.0;
383        }
384
385        score.max(0.0)
386    }
387
388    // Platform-specific implementations
389
390    #[cfg(target_os = "linux")]
391    fn get_system_cpu(&self) -> io::Result<f64> {
392        let contents = std::fs::read_to_string("/proc/stat")?;
393        if let Some(line) = contents.lines().next() {
394            let parts: Vec<&str> = line.split_whitespace().collect();
395            if parts.len() >= 5 && parts[0] == "cpu" {
396                let user: u64 = parts[1].parse().unwrap_or(0);
397                let nice: u64 = parts[2].parse().unwrap_or(0);
398                let system: u64 = parts[3].parse().unwrap_or(0);
399                let idle: u64 = parts[4].parse().unwrap_or(0);
400
401                let total = user + nice + system + idle;
402                let used = user + nice + system;
403
404                if total > 0 {
405                    return Ok(used as f64 / total as f64 * 100.0);
406                }
407            }
408        }
409        Ok(0.0)
410    }
411
412    #[cfg(not(target_os = "linux"))]
413    fn get_system_cpu(&self) -> io::Result<f64> {
414        // Cross-platform via sysinfo
415        let mut guard = self.sys.lock();
416        guard.refresh_cpu();
417        Ok(guard.global_cpu_info().cpu_usage() as f64)
418    }
419
420    #[cfg(target_os = "linux")]
421    fn get_system_memory_mb(&self) -> io::Result<u64> {
422        let contents = std::fs::read_to_string("/proc/meminfo")?;
423        let mut total_kb = 0u64;
424        let mut free_kb = 0u64;
425        let mut available_kb = 0u64;
426
427        for line in contents.lines() {
428            if line.starts_with("MemTotal:") {
429                total_kb = line
430                    .split_whitespace()
431                    .nth(1)
432                    .and_then(|s| s.parse().ok())
433                    .unwrap_or(0);
434            } else if line.starts_with("MemFree:") {
435                free_kb = line
436                    .split_whitespace()
437                    .nth(1)
438                    .and_then(|s| s.parse().ok())
439                    .unwrap_or(0);
440            } else if line.starts_with("MemAvailable:") {
441                available_kb = line
442                    .split_whitespace()
443                    .nth(1)
444                    .and_then(|s| s.parse().ok())
445                    .unwrap_or(0);
446            }
447        }
448
449        // Use available if present, otherwise fall back to free
450        let used_kb = if available_kb > 0 {
451            total_kb - available_kb
452        } else {
453            total_kb - free_kb
454        };
455
456        Ok(used_kb / 1024) // Convert to MB
457    }
458
459    #[cfg(not(target_os = "linux"))]
460    fn get_system_memory_mb(&self) -> io::Result<u64> {
461        let mut guard = self.sys.lock();
462        guard.refresh_memory();
463        // sysinfo reports memory in KiB
464        let used_kib = guard.used_memory();
465        Ok(used_kib / 1024)
466    }
467
468    #[cfg(target_os = "linux")]
469    fn get_load_average(&self) -> io::Result<f64> {
470        let contents = std::fs::read_to_string("/proc/loadavg")?;
471        if let Some(first) = contents.split_whitespace().next() {
472            return first
473                .parse()
474                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "Invalid load average"));
475        }
476        Ok(0.0)
477    }
478
479    #[cfg(not(target_os = "linux"))]
480    fn get_load_average(&self) -> io::Result<f64> {
481        let guard = self.sys.lock();
482        let la = guard.load_average();
483        Ok(la.one)
484    }
485
486    #[cfg(target_os = "linux")]
487    fn get_process_cpu(&self) -> io::Result<f64> {
488        // Delta sample: ((utime + stime) - prev) / (clock_ticks_per_sec * elapsed_s * cores) * 100
489        // First sample returns 0 (no prior baseline) and seeds the state.
490        let contents = std::fs::read_to_string("/proc/self/stat")?;
491        let parts: Vec<&str> = contents.split_whitespace().collect();
492        if parts.len() < 15 {
493            return Ok(0.0);
494        }
495        let utime: u64 = parts[13].parse().unwrap_or(0);
496        let stime: u64 = parts[14].parse().unwrap_or(0);
497        let total_ticks = utime.saturating_add(stime);
498        let now_ms = self.created_at.elapsed().as_millis() as u64;
499
500        let prev_ticks = self.proc_cpu_prev.load(Ordering::Relaxed);
501        let prev_ms = self.proc_cpu_prev_ms.load(Ordering::Relaxed);
502
503        // Store current sample for the next call.
504        self.proc_cpu_prev.store(total_ticks, Ordering::Relaxed);
505        self.proc_cpu_prev_ms.store(now_ms, Ordering::Relaxed);
506
507        if prev_ms == u64::MAX {
508            // First sample — no delta yet.
509            return Ok(0.0);
510        }
511        let elapsed_ms = now_ms.saturating_sub(prev_ms);
512        if elapsed_ms == 0 {
513            return Ok(0.0);
514        }
515        let delta_ticks = total_ticks.saturating_sub(prev_ticks) as f64;
516        // Linux clock ticks per second is conventionally 100; the precise value
517        // would come from `sysconf(_SC_CLK_TCK)`, but the standard kernel build
518        // uses USER_HZ=100 and this matches `/proc/stat` semantics.
519        let clk_tck: f64 = 100.0;
520        let elapsed_s = elapsed_ms as f64 / 1000.0;
521        let cores = num_cpus::get().max(1) as f64;
522        // Per-core percentage: 100% means one whole core saturated.
523        let pct = (delta_ticks / (clk_tck * elapsed_s * cores)) * 100.0;
524        Ok(pct.clamp(0.0, 100.0))
525    }
526
527    #[cfg(not(target_os = "linux"))]
528    fn get_process_cpu(&self) -> io::Result<f64> {
529        let mut guard = self.sys.lock();
530        if let Some(pid) = self.pid {
531            guard.refresh_process(pid);
532            if let Some(proc_) = guard.process(pid) {
533                // sysinfo's cpu_usage can exceed 100 on multi-core hosts.
534                // Normalize to per-core percentage (0..100).
535                let raw = proc_.cpu_usage() as f64;
536                let cores = num_cpus::get() as f64;
537                let norm = if cores > 0.0 { raw / cores } else { raw };
538                return Ok(norm.clamp(0.0, 100.0));
539            }
540        }
541        Ok(0.0)
542    }
543
544    #[cfg(target_os = "linux")]
545    fn get_process_memory_mb(&self) -> io::Result<u64> {
546        let contents = std::fs::read_to_string("/proc/self/status")?;
547        for line in contents.lines() {
548            if line.starts_with("VmRSS:") {
549                if let Some(kb_str) = line.split_whitespace().nth(1) {
550                    if let Ok(kb) = kb_str.parse::<u64>() {
551                        return Ok(kb / 1024); // Convert to MB
552                    }
553                }
554            }
555        }
556        Ok(0)
557    }
558
559    #[cfg(not(target_os = "linux"))]
560    fn get_process_memory_mb(&self) -> io::Result<u64> {
561        let mut guard = self.sys.lock();
562        if let Some(pid) = self.pid {
563            guard.refresh_process(pid);
564            if let Some(proc_) = guard.process(pid) {
565                // memory() in KiB
566                return Ok(proc_.memory() / 1024);
567            }
568        }
569        Ok(0)
570    }
571
572    #[cfg(target_os = "linux")]
573    fn get_thread_count(&self) -> io::Result<u32> {
574        let contents = std::fs::read_to_string("/proc/self/status")?;
575        for line in contents.lines() {
576            if line.starts_with("Threads:") {
577                if let Some(count_str) = line.split_whitespace().nth(1) {
578                    if let Ok(count) = count_str.parse() {
579                        return Ok(count);
580                    }
581                }
582            }
583        }
584        Ok(1) // At least 1 thread (current)
585    }
586
587    #[cfg(not(target_os = "linux"))]
588    fn get_thread_count(&self) -> io::Result<u32> {
589        // sysinfo doesn't expose per-process thread count uniformly; approximate with 1
590        // until a portable method is added.
591        Ok(1)
592    }
593
594    #[cfg(target_os = "linux")]
595    fn get_fd_count(&self) -> io::Result<u32> {
596        match std::fs::read_dir("/proc/self/fd") {
597            Ok(entries) => Ok(entries.count() as u32),
598            Err(_) => Ok(0),
599        }
600    }
601
602    #[cfg(not(target_os = "linux"))]
603    fn get_fd_count(&self) -> io::Result<u32> {
604        // Not portable via sysinfo; return 0 on non-Linux.
605        Ok(0)
606    }
607}
608
/// Coarse health classification derived from the health score.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum HealthStatus {
    /// Score of 80% or higher.
    Healthy,
    /// Score from 60% up to (not including) 80%.
    Warning,
    /// Score from 40% up to (not including) 60%.
    Degraded,
    /// Score below 40%.
    Critical,
}
621
622impl HealthStatus {
623    /// Check if status indicates system is degraded or worse
624    #[inline]
625    pub fn is_degraded(&self) -> bool {
626        matches!(self, Self::Degraded | Self::Critical)
627    }
628
629    /// Check if status indicates system is healthy
630    #[inline]
631    pub fn is_healthy(&self) -> bool {
632        matches!(self, Self::Healthy)
633    }
634
635    /// Check if status has warnings or worse
636    #[inline]
637    pub fn has_issues(&self) -> bool {
638        !matches!(self, Self::Healthy)
639    }
640}
641
642impl Default for SystemHealth {
643    fn default() -> Self {
644        Self::new()
645    }
646}
647
648impl std::fmt::Display for SystemHealth {
649    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
650        let snapshot = self.snapshot();
651        write!(
652            f,
653            "SystemHealth(CPU: {:.1}%, Mem: {} MB, Health: {:.1}%)",
654            snapshot.system_cpu_percent, snapshot.system_memory_mb, snapshot.health_score
655        )
656    }
657}
658
659impl std::fmt::Debug for SystemHealth {
660    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
661        let snapshot = self.snapshot();
662        f.debug_struct("SystemHealth")
663            .field("system_cpu", &snapshot.system_cpu_percent)
664            .field("process_cpu", &snapshot.process_cpu_percent)
665            .field("system_memory_mb", &snapshot.system_memory_mb)
666            .field("process_memory_mb", &snapshot.process_memory_mb)
667            .field("load_average", &snapshot.load_average)
668            .field("threads", &snapshot.thread_count)
669            .field("fds", &snapshot.fd_count)
670            .field("health_score", &snapshot.health_score)
671            .finish()
672    }
673}
674
675impl std::fmt::Display for HealthStatus {
676    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
677        match self {
678            Self::Healthy => write!(f, "Healthy"),
679            Self::Warning => write!(f, "Warning"),
680            Self::Degraded => write!(f, "Degraded"),
681            Self::Critical => write!(f, "Critical"),
682        }
683    }
684}
685
686// Thread safety
687// SystemHealth is composed of atomic types (Send + Sync), `Instant` (Send +
688// Sync), `parking_lot::Mutex<sysinfo::System>` (Send + Sync via the contained
689// `System: Send`), and a `sysinfo::Pid` (Send + Sync). The compiler derives
690// Send + Sync automatically; no explicit `unsafe impl` is required.
691
// Unit tests for `SystemHealth` and `HealthStatus`. They run against the real
// host, so most assertions check ranges and invariants rather than exact values.
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread;

    /// Every getter can be called on a fresh monitor and `quick_check`
    /// yields one of the four statuses.
    #[test]
    fn test_basic_functionality() {
        let health = SystemHealth::new();

        // Should be able to get all metrics
        let _cpu = health.cpu_used();
        let _mem = health.mem_used_mb();
        let _process_cpu = health.process_cpu_used();
        let _process_mem = health.process_mem_used_mb();
        let _load = health.load_avg();
        let _threads = health.thread_count();
        let _fds = health.fd_count();
        let _score = health.health_score();

        // Health check should work
        let status = health.quick_check();
        assert!(matches!(
            status,
            HealthStatus::Healthy
                | HealthStatus::Warning
                | HealthStatus::Degraded
                | HealthStatus::Critical
        ));
    }

    /// `cpu_used` and `cpu_free` are complementary.
    #[test]
    fn test_cpu_free() {
        let health = SystemHealth::new();

        // Both reads happen well inside the default 1 s throttle, so they are
        // expected to see the same cached sample.
        let used = health.cpu_used();
        let free = health.cpu_free();

        // Used + free should approximately equal 100%
        assert!((used + free - 100.0).abs() < 0.1);
    }

    /// `mem_used_gb` is `mem_used_mb` scaled by 1024.
    #[test]
    fn test_memory_units() {
        let health = SystemHealth::new();

        let mb = health.mem_used_mb();
        let gb = health.mem_used_gb();

        // GB should be approximately MB / 1024
        if mb > 0.0 {
            assert!((gb * 1024.0 - mb).abs() < 1.0);
        }
    }

    /// Snapshot percentages stay within 0-100 and the thread count is non-zero.
    #[test]
    fn test_snapshot() {
        let health = SystemHealth::new();

        let snapshot = health.snapshot();

        // Snapshot should have reasonable values
        assert!(snapshot.system_cpu_percent >= 0.0);
        assert!(snapshot.system_cpu_percent <= 100.0);
        assert!(snapshot.health_score >= 0.0);
        assert!(snapshot.health_score <= 100.0);
        assert!(snapshot.thread_count > 0); // Should have at least 1 thread
    }

    /// Process statistics report at least one thread and a positive uptime.
    #[test]
    fn test_process_stats() {
        let health = SystemHealth::new();

        let stats = health.process();

        assert!(stats.threads > 0); // Should have at least current thread
        assert!(stats.uptime > Duration::ZERO);
        assert!(stats.cpu_percent >= 0.0);
        assert!(stats.memory_mb >= 0.0);
    }

    /// Truth table for the three `HealthStatus` predicates.
    #[test]
    fn test_health_status() {
        let healthy = HealthStatus::Healthy;
        let warning = HealthStatus::Warning;
        let degraded = HealthStatus::Degraded;
        let critical = HealthStatus::Critical;

        assert!(healthy.is_healthy());
        assert!(!healthy.is_degraded());
        assert!(!healthy.has_issues());

        assert!(!warning.is_healthy());
        assert!(!warning.is_degraded());
        assert!(warning.has_issues());

        assert!(!degraded.is_healthy());
        assert!(degraded.is_degraded());
        assert!(degraded.has_issues());

        assert!(!critical.is_healthy());
        assert!(critical.is_degraded());
        assert!(critical.has_issues());
    }

    /// A monitor built with a custom interval is usable.
    #[test]
    fn test_custom_interval() {
        let health = SystemHealth::with_interval(Duration::from_millis(500));

        // Should still work with custom interval
        let _cpu = health.cpu_used();
        let _score = health.health_score();
    }

    /// The throttle lets a refresh through once the interval has elapsed.
    #[test]
    fn test_maybe_update_actually_refreshes_after_interval() {
        // 0.9.2 regression: maybe_update previously compared milliseconds
        // against a nanosecond-typed `last_update`, which froze the throttle
        // and pinned all values to their initial reads. After the fix,
        // `last_update_ms` is observed to advance once the interval elapses.
        let health = SystemHealth::with_interval(Duration::from_millis(50));
        let snap_before = health.snapshot();
        let last_ms_before = health.last_update_ms.load(Ordering::Relaxed);

        // Sleep beyond the interval and then poke the cache.
        thread::sleep(Duration::from_millis(120));
        let _ = health.cpu_used(); // triggers maybe_update path
        let snap_after = health.snapshot();
        let last_ms_after = health.last_update_ms.load(Ordering::Relaxed);

        assert!(
            last_ms_after > last_ms_before,
            "last_update_ms should advance after sleeping past the throttle interval \
             (before={last_ms_before}, after={last_ms_after})",
        );
        // The `last_update` Duration on the snapshot should be small (since
        // we just refreshed) rather than equal-to-monotonic-time-since-creation,
        // which was the prior bug.
        assert!(
            snap_after.last_update <= Duration::from_secs(1),
            "snapshot.last_update should be 'time since last refresh', \
             got {:?}",
            snap_after.last_update,
        );
        // Sanity: snapshot fields still produce finite values.
        assert!(snap_before.system_cpu_percent.is_finite());
        assert!(snap_after.system_cpu_percent.is_finite());
    }

    /// `update()` can be called explicitly and leaves a valid score behind.
    #[test]
    fn test_force_update() {
        let health = SystemHealth::new();

        let score_before = health.health_score();

        // Force update
        health.update();

        let score_after = health.health_score();

        // Scores might be different or the same, but both should be valid
        assert!(score_before >= 0.0);
        assert!(score_after >= 0.0);
    }

    /// Ten threads read the shared monitor concurrently without panicking.
    #[test]
    fn test_concurrent_access() {
        let health = std::sync::Arc::new(SystemHealth::new());
        let mut handles = vec![];

        // Spawn multiple threads accessing health metrics
        for _ in 0..10 {
            let health_clone = health.clone();
            let handle = thread::spawn(move || {
                for _ in 0..100 {
                    let _cpu = health_clone.cpu_used();
                    let _mem = health_clone.mem_used_mb();
                    let _status = health_clone.quick_check();
                }
            });
            handles.push(handle);
        }

        // Wait for all threads
        for handle in handles {
            handle.join().unwrap();
        }

        // Should still be functional
        let final_score = health.health_score();
        assert!((0.0..=100.0).contains(&final_score));
    }

    /// `Display` and `Debug` output contain the expected labels.
    #[test]
    fn test_display_formatting() {
        let health = SystemHealth::new();

        let display_str = format!("{health}");
        assert!(display_str.contains("SystemHealth"));
        assert!(display_str.contains("CPU"));
        assert!(display_str.contains("Mem"));

        let debug_str = format!("{health:?}");
        assert!(debug_str.contains("SystemHealth"));

        let status = health.quick_check();
        let status_str = format!("{status}");
        assert!(!status_str.is_empty());
    }
}
901
// Timing smoke tests. The module is compiled only for test builds with the
// `bench-tests` feature enabled and when not running under tarpaulin.
#[cfg(all(test, feature = "bench-tests", not(tarpaulin)))]
#[allow(unused_imports)]
mod benchmarks {
    use super::*;
    use std::time::Instant;

    /// Average cost of `quick_check` over one million calls; must stay
    /// below 200 ns per call.
    #[cfg_attr(not(feature = "bench-tests"), ignore)]
    #[test]
    fn bench_quick_check() {
        let health = SystemHealth::new();
        let iterations = 1_000_000;

        let start = Instant::now();
        for _ in 0..iterations {
            let _ = health.quick_check();
        }
        let elapsed = start.elapsed();

        println!(
            "SystemHealth quick_check: {:.2} ns/op",
            elapsed.as_nanos() as f64 / iterations as f64
        );

        // Should be extremely fast when cached (relaxed from 100ns to 200ns)
        assert!(elapsed.as_nanos() / iterations < 200);
    }

    /// Average cost of three getters per iteration; the whole iteration
    /// must stay below 1000 ns.
    #[cfg_attr(not(feature = "bench-tests"), ignore)]
    #[test]
    fn bench_cached_metrics() {
        let health = SystemHealth::new();
        let iterations = 1_000_000;

        let start = Instant::now();
        for _ in 0..iterations {
            let _ = health.cpu_used();
            let _ = health.mem_used_mb();
            let _ = health.health_score();
        }
        let elapsed = start.elapsed();

        // Printed figure is per getter call (three calls per iteration).
        println!(
            "SystemHealth cached metrics: {:.2} ns/op",
            elapsed.as_nanos() as f64 / iterations as f64 / 3.0
        );

        // Should be very fast when cached (relaxed from 500ns to 1000ns)
        assert!(elapsed.as_nanos() / iterations < 1000);
    }

    /// 1000 forced refreshes must complete within 2000 ms in total.
    #[cfg_attr(not(feature = "bench-tests"), ignore)]
    #[test]
    fn bench_force_update() {
        let health = SystemHealth::new();
        let iterations = 1000; // Less iterations since this does real work

        let start = Instant::now();
        for _ in 0..iterations {
            health.update();
        }
        let elapsed = start.elapsed();

        println!(
            "SystemHealth force update: {:.2} μs/op",
            elapsed.as_micros() as f64 / iterations as f64
        );

        // Should complete updates reasonably fast (relaxed from 1000ms to 2000ms)
        assert!(elapsed.as_millis() < 2000);
    }
}