// memscope_rs/lockfree/system_profiler.rs

//! System-wide resource profiler.
//!
//! Comprehensive system resource tracking: CPU, GPU, memory, I/O, and network.
//! Cross-platform support for Windows, Linux, and macOS.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::time::{Duration, Instant};
9
10/// Comprehensive system resource snapshot
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct SystemResourceSnapshot {
13    pub timestamp: u64,
14    pub cpu_metrics: CpuMetrics,
15    pub memory_metrics: MemoryMetrics,
16    pub gpu_metrics: Option<GpuMetrics>,
17    pub io_metrics: IoMetrics,
18    pub network_metrics: NetworkMetrics,
19    pub process_metrics: ProcessMetrics,
20    pub thread_metrics: HashMap<u64, ThreadMetrics>,
21}
22
23/// CPU utilization and performance metrics
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct CpuMetrics {
26    /// Overall CPU usage percentage (0-100)
27    pub overall_usage: f32,
28    /// Per-core CPU usage
29    pub core_usage: Vec<f32>,
30    /// CPU frequency in MHz
31    pub frequency: u64,
32    /// Load average (1, 5, 15 minutes) - Unix only
33    pub load_average: Option<(f64, f64, f64)>,
34    /// CPU temperature if available
35    pub temperature: Option<f32>,
36    /// Context switches per second
37    pub context_switches: u64,
38    /// CPU cache misses if available
39    pub cache_misses: Option<u64>,
40}
41
42/// Memory system metrics
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct MemoryMetrics {
45    /// Total physical memory in bytes
46    pub total_physical: u64,
47    /// Available physical memory in bytes
48    pub available_physical: u64,
49    /// Used physical memory in bytes
50    pub used_physical: u64,
51    /// Total virtual memory in bytes
52    pub total_virtual: u64,
53    /// Available virtual memory in bytes
54    pub available_virtual: u64,
55    /// Memory pressure indicator (0-100)
56    pub pressure: f32,
57    /// Page faults per second
58    pub page_faults: u64,
59    /// Memory bandwidth utilization if available
60    pub bandwidth_utilization: Option<f32>,
61}
62
63/// GPU utilization and memory metrics
64#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct GpuMetrics {
66    /// GPU name/model
67    pub device_name: String,
68    /// GPU utilization percentage (0-100)
69    pub gpu_usage: f32,
70    /// GPU memory usage in bytes
71    pub memory_used: u64,
72    /// Total GPU memory in bytes
73    pub memory_total: u64,
74    /// GPU temperature if available
75    pub temperature: Option<f32>,
76    /// GPU frequency in MHz
77    pub frequency: Option<u64>,
78    /// Power consumption in watts
79    pub power_usage: Option<f32>,
80    /// CUDA/OpenCL compute utilization
81    pub compute_usage: Option<f32>,
82}
83
84/// I/O subsystem metrics
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct IoMetrics {
87    /// Disk read bytes per second
88    pub disk_read_bps: u64,
89    /// Disk write bytes per second
90    pub disk_write_bps: u64,
91    /// Disk read operations per second
92    pub disk_read_ops: u64,
93    /// Disk write operations per second
94    pub disk_write_ops: u64,
95    /// Average disk latency in microseconds
96    pub disk_latency_us: Option<u64>,
97    /// Disk queue depth
98    pub disk_queue_depth: Option<u32>,
99}
100
101/// Network utilization metrics
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct NetworkMetrics {
104    /// Network receive bytes per second
105    pub rx_bps: u64,
106    /// Network transmit bytes per second
107    pub tx_bps: u64,
108    /// Network receive packets per second
109    pub rx_pps: u64,
110    /// Network transmit packets per second
111    pub tx_pps: u64,
112    /// Network latency if measurable
113    pub latency_ms: Option<f32>,
114    /// Active connections count
115    pub connections: u32,
116}
117
118/// Per-process resource metrics
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct ProcessMetrics {
121    /// Process ID
122    pub pid: u32,
123    /// Process name
124    pub name: String,
125    /// CPU usage percentage for this process
126    pub cpu_usage: f32,
127    /// Memory usage in bytes
128    pub memory_usage: u64,
129    /// Number of threads
130    pub thread_count: u32,
131    /// Number of file handles
132    pub handle_count: u32,
133    /// Process priority
134    pub priority: i32,
135}
136
137/// Per-thread resource metrics
138#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct ThreadMetrics {
140    /// Thread ID
141    pub thread_id: u64,
142    /// Thread name if available
143    pub thread_name: Option<String>,
144    /// CPU time consumed by this thread
145    pub cpu_time_ns: u64,
146    /// Thread state (Running, Sleeping, etc.)
147    pub state: ThreadState,
148    /// Thread priority
149    pub priority: i32,
150    /// CPU affinity mask
151    pub cpu_affinity: Option<u64>,
152}
153
154/// Thread execution state
155#[derive(Debug, Clone, Serialize, Deserialize)]
156pub enum ThreadState {
157    Running,
158    Sleeping,
159    Waiting,
160    Blocked,
161    Zombie,
162}
163
164/// System resource profiler
165pub struct SystemProfiler {
166    start_time: Instant,
167    #[allow(dead_code)]
168    sample_interval: Duration,
169    last_snapshot: Option<SystemResourceSnapshot>,
170    #[cfg(feature = "system-metrics")]
171    system: std::cell::RefCell<sysinfo::System>,
172}
173
174impl SystemProfiler {
175    /// Create new system profiler with specified sampling interval
176    pub fn new(sample_interval: Duration) -> Self {
177        Self {
178            start_time: Instant::now(),
179            sample_interval,
180            last_snapshot: None,
181            #[cfg(feature = "system-metrics")]
182            system: std::cell::RefCell::new(sysinfo::System::new_all()),
183        }
184    }
185
186    /// Take a comprehensive system resource snapshot
187    pub fn take_snapshot(&mut self) -> Result<SystemResourceSnapshot, Box<dyn std::error::Error>> {
188        let timestamp = self.start_time.elapsed().as_millis() as u64;
189
190        let snapshot = SystemResourceSnapshot {
191            timestamp,
192            cpu_metrics: self.collect_cpu_metrics()?,
193            memory_metrics: self.collect_memory_metrics()?,
194            gpu_metrics: self.collect_gpu_metrics()?,
195            io_metrics: self.collect_io_metrics()?,
196            network_metrics: self.collect_network_metrics()?,
197            process_metrics: self.collect_process_metrics()?,
198            thread_metrics: self.collect_thread_metrics()?,
199        };
200
201        self.last_snapshot = Some(snapshot.clone());
202        Ok(snapshot)
203    }
204
205    /// Collect CPU performance metrics
206    fn collect_cpu_metrics(&self) -> Result<CpuMetrics, Box<dyn std::error::Error>> {
207        #[cfg(feature = "system-metrics")]
208        {
209            let mut system = self.system.borrow_mut();
210            system.refresh_cpu_all();
211
212            let overall_usage = system.global_cpu_usage();
213            let core_usage: Vec<f32> = system.cpus().iter().map(|cpu| cpu.cpu_usage()).collect();
214            let load_average = sysinfo::System::load_average();
215
216            Ok(CpuMetrics {
217                overall_usage,
218                core_usage,
219                frequency: 0, // Would need platform-specific code
220                load_average: Some((load_average.one, load_average.five, load_average.fifteen)),
221                temperature: None,   // Would need platform-specific sensors
222                context_switches: 0, // Would need platform-specific code
223                cache_misses: None,
224            })
225        }
226
227        #[cfg(not(feature = "system-metrics"))]
228        {
229            // Fallback implementation
230            Ok(CpuMetrics {
231                overall_usage: 0.0,
232                core_usage: vec![0.0; num_cpus::get()],
233                frequency: 0,
234                load_average: None,
235                temperature: None,
236                context_switches: 0,
237                cache_misses: None,
238            })
239        }
240    }
241
242    /// Collect memory subsystem metrics
243    fn collect_memory_metrics(&self) -> Result<MemoryMetrics, Box<dyn std::error::Error>> {
244        #[cfg(feature = "system-metrics")]
245        {
246            let mut system = self.system.borrow_mut();
247            system.refresh_memory();
248
249            let total_physical = system.total_memory();
250            let available_physical = system.available_memory();
251            let used_physical = total_physical - available_physical;
252
253            let pressure = (used_physical as f32 / total_physical as f32) * 100.0;
254
255            Ok(MemoryMetrics {
256                total_physical,
257                available_physical,
258                used_physical,
259                total_virtual: system.total_swap(),
260                available_virtual: system.free_swap(),
261                pressure,
262                page_faults: 0, // Would need platform-specific code
263                bandwidth_utilization: None,
264            })
265        }
266
267        #[cfg(not(feature = "system-metrics"))]
268        {
269            Ok(MemoryMetrics {
270                total_physical: 0,
271                available_physical: 0,
272                used_physical: 0,
273                total_virtual: 0,
274                available_virtual: 0,
275                pressure: 0.0,
276                page_faults: 0,
277                bandwidth_utilization: None,
278            })
279        }
280    }
281
282    /// Collect GPU utilization metrics (platform-specific)
283    fn collect_gpu_metrics(&self) -> Result<Option<GpuMetrics>, Box<dyn std::error::Error>> {
284        // GPU metrics collection would require platform-specific implementations:
285        // - Windows: DirectX/DXGI APIs
286        // - Linux: nvidia-ml-py, ROCm, Intel GPU tools
287        // - macOS: Metal Performance Shaders, system_profiler
288
289        #[cfg(target_os = "linux")]
290        {
291            // Try to read NVIDIA GPU metrics
292            if let Ok(gpu_metrics) = self.collect_nvidia_gpu_metrics() {
293                return Ok(Some(gpu_metrics));
294            }
295        }
296
297        #[cfg(target_os = "windows")]
298        {
299            // Try to read GPU metrics via WMI/DXGI
300            if let Ok(gpu_metrics) = self.collect_windows_gpu_metrics() {
301                return Ok(Some(gpu_metrics));
302            }
303        }
304
305        #[cfg(target_os = "macos")]
306        {
307            // Try to read GPU metrics via Metal/IOKit
308            if let Ok(gpu_metrics) = self.collect_macos_gpu_metrics() {
309                return Ok(Some(gpu_metrics));
310            }
311        }
312
313        Ok(None)
314    }
315
316    /// Collect I/O subsystem metrics
317    fn collect_io_metrics(&self) -> Result<IoMetrics, Box<dyn std::error::Error>> {
318        // I/O metrics would be collected from:
319        // - Linux: /proc/diskstats, /sys/block/*/stat
320        // - Windows: Performance Counters
321        // - macOS: IOKit, system_profiler
322
323        Ok(IoMetrics {
324            disk_read_bps: 0,
325            disk_write_bps: 0,
326            disk_read_ops: 0,
327            disk_write_ops: 0,
328            disk_latency_us: None,
329            disk_queue_depth: None,
330        })
331    }
332
333    /// Collect network utilization metrics
334    fn collect_network_metrics(&self) -> Result<NetworkMetrics, Box<dyn std::error::Error>> {
335        #[cfg(feature = "system-metrics")]
336        {
337            // Network monitoring temporarily disabled due to sysinfo API changes
338            let total_rx = 0;
339            let total_tx = 0;
340
341            Ok(NetworkMetrics {
342                rx_bps: total_rx,
343                tx_bps: total_tx,
344                rx_pps: 0, // Would need more detailed monitoring
345                tx_pps: 0,
346                latency_ms: None,
347                connections: 0,
348            })
349        }
350
351        #[cfg(not(feature = "system-metrics"))]
352        {
353            Ok(NetworkMetrics {
354                rx_bps: 0,
355                tx_bps: 0,
356                rx_pps: 0,
357                tx_pps: 0,
358                latency_ms: None,
359                connections: 0,
360            })
361        }
362    }
363
364    /// Collect current process metrics
365    fn collect_process_metrics(&self) -> Result<ProcessMetrics, Box<dyn std::error::Error>> {
366        #[cfg(feature = "system-metrics")]
367        {
368            let mut system = self.system.borrow_mut();
369            system.refresh_processes(sysinfo::ProcessesToUpdate::All, true);
370
371            let current_pid = sysinfo::get_current_pid()?;
372
373            if let Some(process) = system.process(current_pid) {
374                Ok(ProcessMetrics {
375                    pid: current_pid.as_u32(),
376                    name: process.name().to_string_lossy().to_string(),
377                    cpu_usage: process.cpu_usage(),
378                    memory_usage: process.memory(),
379                    thread_count: 0, // Would need platform-specific code
380                    handle_count: 0,
381                    priority: 0,
382                })
383            } else {
384                Err("Could not find current process".into())
385            }
386        }
387
388        #[cfg(not(feature = "system-metrics"))]
389        {
390            Ok(ProcessMetrics {
391                pid: std::process::id(),
392                name: "unknown".to_string(),
393                cpu_usage: 0.0,
394                memory_usage: 0,
395                thread_count: 0,
396                handle_count: 0,
397                priority: 0,
398            })
399        }
400    }
401
402    /// Collect per-thread metrics
403    fn collect_thread_metrics(
404        &self,
405    ) -> Result<HashMap<u64, ThreadMetrics>, Box<dyn std::error::Error>> {
406        let mut thread_metrics = HashMap::new();
407
408        // Thread-level metrics would require platform-specific implementation:
409        // - Linux: /proc/[pid]/task/[tid]/* files
410        // - Windows: Thread performance counters
411        // - macOS: thread_info() system calls
412
413        // For now, return current thread info
414        let current_thread_id = get_current_thread_id();
415        thread_metrics.insert(
416            current_thread_id,
417            ThreadMetrics {
418                thread_id: current_thread_id,
419                thread_name: std::thread::current().name().map(String::from),
420                cpu_time_ns: 0,
421                state: ThreadState::Running,
422                priority: 0,
423                cpu_affinity: None,
424            },
425        );
426
427        Ok(thread_metrics)
428    }
429
430    // Platform-specific GPU collection methods
431    #[cfg(target_os = "linux")]
432    fn collect_nvidia_gpu_metrics(&self) -> Result<GpuMetrics, Box<dyn std::error::Error>> {
433        // Implementation would use nvidia-ml-py or similar
434        Err("NVIDIA GPU metrics not implemented".into())
435    }
436
437    #[cfg(target_os = "windows")]
438    fn collect_windows_gpu_metrics(&self) -> Result<GpuMetrics, Box<dyn std::error::Error>> {
439        // Implementation would use DXGI or WMI
440        Err("Windows GPU metrics not implemented".into())
441    }
442
443    #[cfg(target_os = "macos")]
444    fn collect_macos_gpu_metrics(&self) -> Result<GpuMetrics, Box<dyn std::error::Error>> {
445        // Implementation would use Metal or IOKit
446        Err("macOS GPU metrics not implemented".into())
447    }
448}
449
450/// Get current thread ID in a cross-platform way
451fn get_current_thread_id() -> u64 {
452    #[cfg(target_os = "linux")]
453    {
454        unsafe { libc::syscall(libc::SYS_gettid) as u64 }
455    }
456
457    #[cfg(target_os = "windows")]
458    {
459        unsafe { winapi::um::processthreadsapi::GetCurrentThreadId() as u64 }
460    }
461
462    #[cfg(target_os = "macos")]
463    {
464        unsafe { libc::pthread_self() as u64 }
465    }
466
467    #[cfg(not(any(target_os = "linux", target_os = "windows", target_os = "macos")))]
468    {
469        std::thread::current().id().as_u64()
470    }
471}
472
473/// Continuous system profiling manager
474pub struct ContinuousProfiler {
475    #[allow(dead_code)]
476    profiler: SystemProfiler,
477    snapshots: Vec<SystemResourceSnapshot>,
478    is_running: std::sync::Arc<std::sync::atomic::AtomicBool>,
479}
480
481impl ContinuousProfiler {
482    /// Start continuous profiling in background
483    pub fn start_background_profiling(interval: Duration) -> Self {
484        let profiler = SystemProfiler::new(interval);
485        let is_running = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(true));
486
487        Self {
488            profiler,
489            snapshots: Vec::new(),
490            is_running,
491        }
492    }
493
494    /// Stop profiling and return collected data
495    pub fn stop_and_collect(self) -> Vec<SystemResourceSnapshot> {
496        self.is_running
497            .store(false, std::sync::atomic::Ordering::SeqCst);
498        self.snapshots
499    }
500}