Skip to main content

scirs2_core/profiling/
system_monitor.rs

//! # System Resource Monitoring
//!
//! This module provides comprehensive system resource monitoring capabilities
//! for correlating application performance with system-level metrics.
5
6use std::collections::VecDeque;
7use std::sync::{Arc, Mutex};
8use std::thread;
9use std::time::{Duration, Instant};
10use thiserror::Error;
11
/// Errors reported by the system monitoring facilities.
///
/// The type is cheap to clone and supports equality comparison so callers and
/// tests can match on specific failures with `assert_eq!` / `==`.
#[derive(Error, Debug, Clone, PartialEq, Eq)]
pub enum SystemMonitorError {
    /// Reading or parsing a system information source failed; the payload is
    /// the underlying error rendered as text.
    #[error("Failed to read system information: {0}")]
    SystemReadError(String),

    /// Monitoring is not supported on this platform.
    #[error("System monitoring not supported on this platform")]
    UnsupportedPlatform,

    /// The process lacks the permissions required for system monitoring.
    #[error("Permission denied for system monitoring")]
    PermissionDenied,

    /// An operation required a running monitor, but the monitor is stopped.
    #[error("System monitor is not running")]
    NotRunning,
}
31
/// A single snapshot of system-wide resource usage.
#[derive(Debug, Clone)]
pub struct SystemMetrics {
    /// Instant at which this snapshot was collected
    pub timestamp: Instant,
    /// CPU utilisation as a percentage (0.0 to 100.0)
    pub cpu_usage: f64,
    /// Memory currently in use, in bytes
    pub memory_usage: usize,
    /// Memory still available, in bytes
    pub memory_available: usize,
    /// Total installed memory, in bytes
    pub memory_total: usize,
    /// Disk read throughput, in bytes per second
    pub disk_read_bps: u64,
    /// Disk write throughput, in bytes per second
    pub disk_write_bps: u64,
    /// Network receive throughput, in bytes per second
    pub network_rx_bps: u64,
    /// Network transmit throughput, in bytes per second
    pub network_tx_bps: u64,
    /// Number of running processes
    pub process_count: usize,
    /// System load average over the last minute
    pub load_average: f64,
}

impl Default for SystemMetrics {
    /// Returns a snapshot stamped with the current instant in which every
    /// measurement is zero.
    fn default() -> Self {
        let collected_at = Instant::now();

        Self {
            timestamp: collected_at,
            cpu_usage: 0.0,
            load_average: 0.0,
            memory_usage: 0,
            memory_available: 0,
            memory_total: 0,
            process_count: 0,
            disk_read_bps: 0,
            disk_write_bps: 0,
            network_rx_bps: 0,
            network_tx_bps: 0,
        }
    }
}
76
/// Settings controlling what a system monitor samples and how often.
#[derive(Debug, Clone)]
pub struct SystemMonitorConfig {
    /// Pause between two consecutive samples
    pub sampling_interval: Duration,
    /// Upper bound on the number of samples retained in memory
    pub max_samples: usize,
    /// Whether CPU usage is sampled
    pub monitor_cpu: bool,
    /// Whether memory usage is sampled
    pub monitor_memory: bool,
    /// Whether disk I/O is sampled
    pub monitor_disk: bool,
    /// Whether network I/O is sampled
    pub monitor_network: bool,
    /// Whether the process count is sampled
    pub monitor_processes: bool,
}

impl Default for SystemMonitorConfig {
    /// Samples every 500 ms, retains up to 1000 samples and enables every
    /// metric category.
    fn default() -> Self {
        let enabled = true;

        Self {
            sampling_interval: Duration::from_millis(500),
            max_samples: 1000,
            monitor_cpu: enabled,
            monitor_memory: enabled,
            monitor_disk: enabled,
            monitor_network: enabled,
            monitor_processes: enabled,
        }
    }
}
109
110/// System resource monitor
111pub struct SystemMonitor {
112    config: SystemMonitorConfig,
113    metrics_history: Arc<Mutex<VecDeque<SystemMetrics>>>,
114    running: Arc<Mutex<bool>>,
115    handle: Option<thread::JoinHandle<()>>,
116}
117
118impl SystemMonitor {
119    /// Create a new system monitor
120    pub fn new(config: SystemMonitorConfig) -> Self {
121        Self {
122            config,
123            metrics_history: Arc::new(Mutex::new(VecDeque::new())),
124            running: Arc::new(Mutex::new(false)),
125            handle: None,
126        }
127    }
128
129    /// Start monitoring system resources
130    pub fn start(&mut self) -> Result<(), SystemMonitorError> {
131        let mut running = self.running.lock().expect("Operation failed");
132        if *running {
133            return Ok(()); // Already running
134        }
135        *running = true;
136
137        let config = self.config.clone();
138        let metrics_history = Arc::clone(&self.metrics_history);
139        let running_flag = Arc::clone(&self.running);
140
141        self.handle = Some(thread::spawn(move || {
142            Self::monitoring_loop(config, metrics_history, running_flag);
143        }));
144
145        Ok(())
146    }
147
148    /// Stop monitoring
149    pub fn stop(&mut self) {
150        if let Ok(mut running) = self.running.lock() {
151            *running = false;
152        }
153
154        if let Some(handle) = self.handle.take() {
155            let _ = handle.join();
156        }
157    }
158
159    /// Get current system metrics
160    pub fn get_current_metrics(&self) -> Result<SystemMetrics, SystemMonitorError> {
161        Self::collect_system_metrics(&self.config)
162    }
163
164    /// Get metrics history
165    pub fn get_metrics_history(&self) -> Vec<SystemMetrics> {
166        self.metrics_history
167            .lock()
168            .expect("Operation failed")
169            .iter()
170            .cloned()
171            .collect()
172    }
173
174    /// Get latest N metrics
175    pub fn get_latest_metrics(&self, n: usize) -> Vec<SystemMetrics> {
176        let history = self.metrics_history.lock().expect("Operation failed");
177        history.iter().rev().take(n).cloned().collect()
178    }
179
180    /// Get metrics within time range
181    pub fn get_metrics_in_range(&self, start: Instant, end: Instant) -> Vec<SystemMetrics> {
182        self.metrics_history
183            .lock()
184            .expect("Operation failed")
185            .iter()
186            .filter(|m| m.timestamp >= start && m.timestamp <= end)
187            .cloned()
188            .collect()
189    }
190
191    /// Calculate average metrics over time period
192    pub fn get_average_metrics(&self, duration: Duration) -> Option<SystemMetrics> {
193        let now = Instant::now();
194        let start = now - duration;
195        let metrics = self.get_metrics_in_range(start, now);
196
197        if metrics.is_empty() {
198            return None;
199        }
200
201        let count = metrics.len() as f64;
202        let avg_cpu = metrics.iter().map(|m| m.cpu_usage).sum::<f64>() / count;
203        let avg_memory =
204            (metrics.iter().map(|m| m.memory_usage).sum::<usize>() as f64 / count) as usize;
205        let avg_disk_read =
206            (metrics.iter().map(|m| m.disk_read_bps).sum::<u64>() as f64 / count) as u64;
207        let avg_disk_write =
208            (metrics.iter().map(|m| m.disk_write_bps).sum::<u64>() as f64 / count) as u64;
209        let avg_network_rx =
210            (metrics.iter().map(|m| m.network_rx_bps).sum::<u64>() as f64 / count) as u64;
211        let avg_network_tx =
212            (metrics.iter().map(|m| m.network_tx_bps).sum::<u64>() as f64 / count) as u64;
213        let avg_processes =
214            (metrics.iter().map(|m| m.process_count).sum::<usize>() as f64 / count) as usize;
215        let avg_load = metrics.iter().map(|m| m.load_average).sum::<f64>() / count;
216
217        Some(SystemMetrics {
218            timestamp: now,
219            cpu_usage: avg_cpu,
220            memory_usage: avg_memory,
221            memory_available: metrics.last()?.memory_available,
222            memory_total: metrics.last()?.memory_total,
223            disk_read_bps: avg_disk_read,
224            disk_write_bps: avg_disk_write,
225            network_rx_bps: avg_network_rx,
226            network_tx_bps: avg_network_tx,
227            process_count: avg_processes,
228            load_average: avg_load,
229        })
230    }
231
232    /// Monitoring loop (runs in background thread)
233    fn monitoring_loop(
234        config: SystemMonitorConfig,
235        metrics_history: Arc<Mutex<VecDeque<SystemMetrics>>>,
236        running: Arc<Mutex<bool>>,
237    ) {
238        while *running.lock().expect("Operation failed") {
239            if let Ok(metrics) = Self::collect_system_metrics(&config) {
240                let mut history = metrics_history.lock().expect("Operation failed");
241                history.push_back(metrics);
242
243                // Keep only the last max_samples
244                while history.len() > config.max_samples {
245                    history.pop_front();
246                }
247            }
248
249            thread::sleep(config.sampling_interval);
250        }
251    }
252
253    /// Collect current system metrics
254    fn collect_system_metrics(
255        config: &SystemMonitorConfig,
256    ) -> Result<SystemMetrics, SystemMonitorError> {
257        let mut metrics = SystemMetrics::default();
258
259        if config.monitor_cpu {
260            metrics.cpu_usage = Self::get_cpu_usage()?;
261        }
262
263        if config.monitor_memory {
264            let (used, available, total) = Self::get_memoryinfo()?;
265            metrics.memory_usage = used;
266            metrics.memory_available = available;
267            metrics.memory_total = total;
268        }
269
270        if config.monitor_disk {
271            let (read_bps, write_bps) = Self::get_disk_io()?;
272            metrics.disk_read_bps = read_bps;
273            metrics.disk_write_bps = write_bps;
274        }
275
276        if config.monitor_network {
277            let (rx_bps, tx_bps) = Self::get_network_io()?;
278            metrics.network_rx_bps = rx_bps;
279            metrics.network_tx_bps = tx_bps;
280        }
281
282        if config.monitor_processes {
283            metrics.process_count = Self::get_process_count()?;
284        }
285
286        metrics.load_average = Self::get_load_average()?;
287
288        Ok(metrics)
289    }
290
291    /// Get CPU usage percentage
292    fn get_cpu_usage() -> Result<f64, SystemMonitorError> {
293        #[cfg(target_os = "linux")]
294        {
295            Self::get_cpu_usage_linux()
296        }
297
298        #[cfg(target_os = "macos")]
299        {
300            Self::get_cpu_usage_macos()
301        }
302
303        #[cfg(target_os = "windows")]
304        {
305            Self::get_cpu_usage_windows()
306        }
307
308        #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
309        {
310            // Fallback for unsupported platforms
311            Ok(0.0)
312        }
313    }
314
315    /// Get memory information (used, available, total)
316    fn get_memoryinfo() -> Result<(usize, usize, usize), SystemMonitorError> {
317        #[cfg(target_os = "linux")]
318        {
319            Self::get_memoryinfo_linux()
320        }
321
322        #[cfg(target_os = "macos")]
323        {
324            Self::get_memoryinfo_macos()
325        }
326
327        #[cfg(target_os = "windows")]
328        {
329            Self::get_memoryinfo_windows()
330        }
331
332        #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
333        {
334            Ok((0, 0, 0))
335        }
336    }
337
338    /// Get disk I/O rates (read bps, write bps)
339    fn get_disk_io() -> Result<(u64, u64), SystemMonitorError> {
340        // Simplified implementation - real version would track deltas
341        Ok((0, 0))
342    }
343
344    /// Get network I/O rates (rx bps, tx bps)
345    fn get_network_io() -> Result<(u64, u64), SystemMonitorError> {
346        // Simplified implementation - real version would track deltas
347        Ok((0, 0))
348    }
349
350    /// Get number of running processes
351    fn get_process_count() -> Result<usize, SystemMonitorError> {
352        #[cfg(target_os = "linux")]
353        {
354            match std::fs::read_dir("/proc") {
355                Ok(entries) => {
356                    let count = entries
357                        .filter_map(|entry| entry.ok())
358                        .filter(|entry| {
359                            entry
360                                .file_name()
361                                .to_string_lossy()
362                                .chars()
363                                .all(|c| c.is_ascii_digit())
364                        })
365                        .count();
366                    Ok(count)
367                }
368                Err(e) => Err(SystemMonitorError::SystemReadError(e.to_string())),
369            }
370        }
371
372        #[cfg(not(target_os = "linux"))]
373        {
374            Ok(0)
375        }
376    }
377
378    /// Get system load average
379    fn get_load_average() -> Result<f64, SystemMonitorError> {
380        #[cfg(target_os = "linux")]
381        {
382            match std::fs::read_to_string("/proc/loadavg") {
383                Ok(content) => {
384                    let load = content
385                        .split_whitespace()
386                        .next()
387                        .and_then(|s| s.parse::<f64>().ok())
388                        .unwrap_or(0.0);
389                    Ok(load)
390                }
391                Err(e) => Err(SystemMonitorError::SystemReadError(e.to_string())),
392            }
393        }
394
395        #[cfg(not(target_os = "linux"))]
396        {
397            Ok(0.0)
398        }
399    }
400
401    // Platform-specific implementations
402
403    #[cfg(target_os = "linux")]
404    fn get_cpu_usage_linux() -> Result<f64, SystemMonitorError> {
405        use std::fs;
406
407        // Read /proc/stat for CPU usage
408        let stat1 = fs::read_to_string("/proc/stat")
409            .map_err(|e| SystemMonitorError::SystemReadError(e.to_string()))?;
410
411        thread::sleep(Duration::from_millis(100));
412
413        let stat2 = fs::read_to_string("/proc/stat")
414            .map_err(|e| SystemMonitorError::SystemReadError(e.to_string()))?;
415
416        let cpu1 = Self::parse_cpu_line(&stat1)?;
417        let cpu2 = Self::parse_cpu_line(&stat2)?;
418
419        let total1 = cpu1.iter().sum::<u64>();
420        let total2 = cpu2.iter().sum::<u64>();
421        let idle1 = cpu1[3]; // idle time
422        let idle2 = cpu2[3];
423
424        let total_diff = total2 - total1;
425        let idle_diff = idle2 - idle1;
426
427        if total_diff == 0 {
428            Ok(0.0)
429        } else {
430            let usage = 100.0 - (idle_diff as f64 / total_diff as f64) * 100.0;
431            Ok(usage.clamp(0.0, 100.0))
432        }
433    }
434
435    #[cfg(target_os = "linux")]
436    fn parse_cpu_line(stat: &str) -> Result<Vec<u64>, SystemMonitorError> {
437        let first_line = stat
438            .lines()
439            .next()
440            .ok_or_else(|| SystemMonitorError::SystemReadError("Empty /proc/stat".to_string()))?;
441
442        let values: Result<Vec<u64>, _> = first_line
443            .split_whitespace()
444            .skip(1) // Skip "cpu"
445            .map(|s| s.parse::<u64>())
446            .collect();
447
448        values.map_err(|e| SystemMonitorError::SystemReadError(e.to_string()))
449    }
450
451    #[cfg(target_os = "linux")]
452    fn get_memoryinfo_linux() -> Result<(usize, usize, usize), SystemMonitorError> {
453        use std::fs;
454
455        let meminfo = fs::read_to_string("/proc/meminfo")
456            .map_err(|e| SystemMonitorError::SystemReadError(e.to_string()))?;
457
458        let mut mem_total = 0;
459        let mut mem_available = 0;
460
461        for line in meminfo.lines() {
462            if line.starts_with("MemTotal:") {
463                mem_total = Self::parse_memory_line(line)?;
464            } else if line.starts_with("MemAvailable:") {
465                mem_available = Self::parse_memory_line(line)?;
466            }
467        }
468
469        let mem_used = mem_total.saturating_sub(mem_available);
470        Ok((mem_used, mem_available, mem_total))
471    }
472
473    #[cfg(target_os = "linux")]
474    fn parse_memory_line(line: &str) -> Result<usize, SystemMonitorError> {
475        let kb = line
476            .split_whitespace()
477            .nth(1)
478            .and_then(|s| s.parse::<usize>().ok())
479            .ok_or_else(|| {
480                SystemMonitorError::SystemReadError("Invalid memory line".to_string())
481            })?;
482
483        Ok(kb * 1024) // Convert from KB to bytes
484    }
485
486    #[cfg(target_os = "macos")]
487    fn get_cpu_usage_macos() -> Result<f64, SystemMonitorError> {
488        // Would use system APIs like host_processor_info
489        Ok(0.0)
490    }
491
492    #[cfg(target_os = "macos")]
493    fn get_memoryinfo_macos() -> Result<(usize, usize, usize), SystemMonitorError> {
494        // Would use system APIs like vm_statistics64
495        Ok((0, 0, 0))
496    }
497
498    #[cfg(target_os = "windows")]
499    fn get_cpu_usage_windows() -> Result<f64, SystemMonitorError> {
500        // Would use Windows APIs like GetSystemTimes
501        Ok(0.0)
502    }
503
504    #[cfg(target_os = "windows")]
505    fn get_memoryinfo_windows() -> Result<(usize, usize, usize), SystemMonitorError> {
506        // Would use Windows APIs like GlobalMemoryStatusEx
507        Ok((0, 0, 0))
508    }
509}
510
511impl Drop for SystemMonitor {
512    fn drop(&mut self) {
513        self.stop();
514    }
515}
516
/// Thresholds above which resource alerts are raised.
#[derive(Debug, Clone)]
pub struct AlertConfig {
    /// CPU usage limit, as a percentage
    pub cpu_threshold: f64,
    /// Memory usage limit, as a percentage of total memory
    pub memory_threshold: f64,
    /// Disk I/O limit, in bytes per second
    pub disk_io_threshold: u64,
    /// Network I/O limit, in bytes per second
    pub network_io_threshold: u64,
    /// Load average limit
    pub load_threshold: f64,
}

impl Default for AlertConfig {
    /// Defaults: 80 % CPU, 85 % memory, 100 MB/s disk I/O, 50 MB/s network
    /// I/O and a load average of 2.0.
    fn default() -> Self {
        const MIB: u64 = 1024 * 1024;

        Self {
            cpu_threshold: 80.0,
            memory_threshold: 85.0,
            disk_io_threshold: 100 * MIB,
            network_io_threshold: 50 * MIB,
            load_threshold: 2.0,
        }
    }
}
543
544/// System resource alert
545#[derive(Debug, Clone)]
546pub struct SystemAlert {
547    /// Alert type
548    pub alert_type: AlertType,
549    /// Current value that triggered the alert
550    pub current_value: f64,
551    /// Threshold that was exceeded
552    pub threshold: f64,
553    /// Timestamp when alert was triggered
554    pub timestamp: Instant,
555    /// Severity level
556    pub severity: AlertSeverity,
557    /// Human-readable message
558    pub message: String,
559}
560
/// Kind of resource condition an alert reports.
///
/// The enum is a plain fieldless tag, so it is `Copy` and can be used as a
/// hash-map / hash-set key (e.g. to count alerts per type).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AlertType {
    /// CPU usage exceeded its threshold
    HighCpuUsage,
    /// Memory usage exceeded its threshold
    HighMemoryUsage,
    /// Combined disk read and write throughput exceeded its threshold
    HighDiskIo,
    /// Combined network receive and transmit throughput exceeded its threshold
    HighNetworkIo,
    /// Load average exceeded its threshold
    HighLoadAverage,
}
570
/// Alert severity levels, declared from least to most severe.
///
/// The derived ordering follows the declaration order, so
/// `Info < Warning < Critical`; this allows filtering such as
/// `alert.severity >= AlertSeverity::Warning` or taking the `max()` of
/// several severities.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertSeverity {
    /// Threshold exceeded, but by a modest margin
    Info,
    /// Threshold exceeded by a large margin
    Warning,
    /// Threshold exceeded by a very large margin
    Critical,
}
578
579/// System resource alerting system
580pub struct SystemAlerter {
581    config: AlertConfig,
582    alert_history: VecDeque<SystemAlert>,
583    max_alert_history: usize,
584}
585
586impl SystemAlerter {
587    /// Create a new system alerter
588    pub fn new(config: AlertConfig) -> Self {
589        Self {
590            config,
591            alert_history: VecDeque::new(),
592            max_alert_history: 1000,
593        }
594    }
595
596    /// Check metrics against thresholds and generate alerts
597    pub fn check_alerts(&mut self, metrics: &SystemMetrics) -> Vec<SystemAlert> {
598        let mut alerts = Vec::new();
599
600        // Check CPU usage
601        if metrics.cpu_usage > self.config.cpu_threshold {
602            alerts.push(self.create_alert(
603                AlertType::HighCpuUsage,
604                metrics.cpu_usage,
605                self.config.cpu_threshold,
606                format!("High CPU usage: {:.1}%", metrics.cpu_usage),
607            ));
608        }
609
610        // Check memory usage
611        if metrics.memory_total > 0 {
612            let memory_percent =
613                (metrics.memory_usage as f64 / metrics.memory_total as f64) * 100.0;
614            if memory_percent > self.config.memory_threshold {
615                alerts.push(self.create_alert(
616                    AlertType::HighMemoryUsage,
617                    memory_percent,
618                    self.config.memory_threshold,
619                    format!("High memory usage: {memory_percent:.1}%"),
620                ));
621            }
622        }
623
624        // Check disk I/O
625        let total_disk_io = metrics.disk_read_bps + metrics.disk_write_bps;
626        if total_disk_io > self.config.disk_io_threshold {
627            alerts.push(self.create_alert(
628                AlertType::HighDiskIo,
629                total_disk_io as f64,
630                self.config.disk_io_threshold as f64,
631                format!(
632                    "High disk I/O: {:.1} MB/s",
633                    total_disk_io as f64 / (1024.0 * 1024.0)
634                ),
635            ));
636        }
637
638        // Check network I/O
639        let total_network_io = metrics.network_rx_bps + metrics.network_tx_bps;
640        if total_network_io > self.config.network_io_threshold {
641            alerts.push(self.create_alert(
642                AlertType::HighNetworkIo,
643                total_network_io as f64,
644                self.config.network_io_threshold as f64,
645                format!(
646                    "High network I/O: {:.1} MB/s",
647                    total_network_io as f64 / (1024.0 * 1024.0)
648                ),
649            ));
650        }
651
652        // Check load average
653        if metrics.load_average > self.config.load_threshold {
654            alerts.push(self.create_alert(
655                AlertType::HighLoadAverage,
656                metrics.load_average,
657                self.config.load_threshold,
658                format!("Load average: {:.2}", metrics.load_average),
659            ));
660        }
661
662        // Store alerts in history
663        for alert in &alerts {
664            self.alert_history.push_back(alert.clone());
665            while self.alert_history.len() > self.max_alert_history {
666                self.alert_history.pop_front();
667            }
668        }
669
670        alerts
671    }
672
673    /// Create an alert with appropriate severity
674    fn create_alert(
675        &self,
676        alert_type: AlertType,
677        current: f64,
678        threshold: f64,
679        message: String,
680    ) -> SystemAlert {
681        let severity = if current > threshold * 2.0 {
682            AlertSeverity::Critical
683        } else if current > threshold * 1.5 {
684            AlertSeverity::Warning
685        } else {
686            AlertSeverity::Info
687        };
688
689        SystemAlert {
690            alert_type,
691            current_value: current,
692            threshold,
693            timestamp: Instant::now(),
694            severity,
695            message,
696        }
697    }
698
699    /// Get alert history
700    pub fn get_alert_history(&self) -> Vec<SystemAlert> {
701        self.alert_history.iter().cloned().collect()
702    }
703
704    /// Get recent alerts
705    pub fn get_recent_alerts(&self, duration: Duration) -> Vec<SystemAlert> {
706        let cutoff = Instant::now() - duration;
707        self.alert_history
708            .iter()
709            .filter(|alert| alert.timestamp >= cutoff)
710            .cloned()
711            .collect()
712    }
713}
714
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created monitor is idle and has recorded nothing.
    #[test]
    fn test_systemmonitor_creation() {
        let config = SystemMonitorConfig::default();
        let monitor = SystemMonitor::new(config);
        assert!(!*monitor.running.lock().expect("Operation failed"));
        assert!(monitor.get_metrics_history().is_empty());
    }

    /// Only the CPU threshold is exceeded, so exactly one alert is raised and
    /// recorded; 90 % is below 1.5x the 80 % threshold, hence `Info`.
    #[test]
    fn test_alert_creation() {
        let config = AlertConfig::default();
        let mut alerter = SystemAlerter::new(config);

        let metrics = SystemMetrics {
            cpu_usage: 90.0, // Above threshold
            ..Default::default()
        };

        let alerts = alerter.check_alerts(&metrics);
        assert_eq!(alerts.len(), 1);
        assert_eq!(alerts[0].alert_type, AlertType::HighCpuUsage);
        assert_eq!(alerts[0].severity, AlertSeverity::Info);
        assert_eq!(alerter.get_alert_history().len(), 1);
    }

    /// Ten samples with CPU usage 0, 10, ..., 90 inside the window average to 45.
    #[test]
    fn test_metrics_averaging() {
        let config = SystemMonitorConfig::default();
        let monitor = SystemMonitor::new(config);

        // Simulate some metrics
        {
            let mut history = monitor.metrics_history.lock().expect("Operation failed");
            let now = Instant::now();
            for i in 0..10u64 {
                let metrics = SystemMetrics {
                    cpu_usage: i as f64 * 10.0,
                    // `checked_sub` avoids a panic if the clock cannot go back that far
                    timestamp: now.checked_sub(Duration::from_secs(i)).unwrap_or(now),
                    ..Default::default()
                };
                history.push_back(metrics);
            }
        }

        let avg = monitor
            .get_average_metrics(Duration::from_secs(100))
            .expect("samples inside the window must produce an average");
        assert!((avg.cpu_usage - 45.0).abs() < 1e-9);
        assert_eq!(avg.memory_usage, 0);
    }
}