sklears_compose/resource_management/
monitoring.rs

//! Resource monitoring and alerting system
//!
//! This module provides real-time monitoring of resource utilization,
//! alert management, and metrics collection for the resource management system.
5
6use super::resource_types::AlertThresholds;
7use sklears_core::error::Result as SklResult;
8use std::collections::{HashMap, VecDeque};
9use std::sync::mpsc;
10use std::time::{Duration, SystemTime};
11
12/// Resource monitor for real-time resource tracking and alerting
13#[derive(Debug)]
14pub struct ResourceMonitor {
15    /// Monitoring configuration
16    config: MonitorConfig,
17    /// Current metrics
18    metrics: ResourceMetrics,
19    /// Alert system
20    alert_system: AlertSystem,
21    /// Metrics history
22    history: VecDeque<MetricsSnapshot>,
23    /// Active subscriptions
24    subscriptions: HashMap<String, MonitorSubscription>,
25}
26
/// Resource monitoring configuration.
///
/// No `Default` is derived on purpose: an all-zero configuration would not be
/// meaningful. `ResourceMonitor::new` supplies the default values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MonitorConfig {
    /// Interval between two metric samples.
    pub sample_interval: Duration,
    /// How long recorded history is meant to be retained.
    pub history_retention: Duration,
    /// Enable real-time alerts.
    pub alerts_enabled: bool,
    /// Enable detailed metrics collection.
    pub detailed_metrics: bool,
    /// Number of metric collection threads.
    pub collector_threads: usize,
}
41
42/// Current resource metrics
43#[derive(Debug, Clone)]
44pub struct ResourceMetrics {
45    /// CPU metrics
46    pub cpu: CpuMetrics,
47    /// Memory metrics
48    pub memory: MemoryMetrics,
49    /// GPU metrics
50    pub gpu: Vec<GpuMetrics>,
51    /// Network metrics
52    pub network: NetworkMetrics,
53    /// Storage metrics
54    pub storage: StorageMetrics,
55    /// System metrics
56    pub system: SystemMetrics,
57}
58
59/// CPU utilization metrics
60#[derive(Debug, Clone)]
61pub struct CpuMetrics {
62    /// Overall CPU utilization
63    pub utilization_percent: f64,
64    /// Per-core utilization
65    pub per_core_utilization: Vec<f64>,
66    /// Load average
67    pub load_average: LoadAverage,
68    /// Context switches per second
69    pub context_switches: f64,
70    /// Interrupts per second
71    pub interrupts: f64,
72    /// CPU temperature
73    pub temperature: Option<f64>,
74}
75
/// System load averages over the three conventional windows.
///
/// `Default` yields `0.0` for every window.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct LoadAverage {
    /// 1-minute load average.
    pub one_min: f64,
    /// 5-minute load average.
    pub five_min: f64,
    /// 15-minute load average.
    pub fifteen_min: f64,
}
86
87/// Memory utilization metrics
88#[derive(Debug, Clone)]
89pub struct MemoryMetrics {
90    /// Total memory
91    pub total: u64,
92    /// Used memory
93    pub used: u64,
94    /// Available memory
95    pub available: u64,
96    /// Buffer/cache memory
97    pub buffers: u64,
98    /// Swap metrics
99    pub swap: SwapMetrics,
100    /// Memory pressure
101    pub pressure: MemoryPressure,
102}
103
/// Swap memory metrics.
///
/// `Default` yields zero for every field.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct SwapMetrics {
    /// Total swap space.
    pub total: u64,
    /// Used swap space.
    pub used: u64,
    /// Swap-in rate.
    pub swap_in_rate: f64,
    /// Swap-out rate.
    pub swap_out_rate: f64,
}
116
/// Memory pressure indicators.
///
/// NOTE(review): the field names resemble Linux pressure-stall information
/// (`some` / `full` / `avg10` / `avg60`) — confirm against the collector.
/// `Default` yields `0.0` for every field.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct MemoryPressure {
    /// "Some" pressure (percentage).
    pub some: f64,
    /// "Full" pressure (percentage).
    pub full: f64,
    /// Average pressure over 10s.
    pub avg10: f64,
    /// Average pressure over 60s.
    pub avg60: f64,
}
129
130/// GPU utilization metrics
131#[derive(Debug, Clone)]
132pub struct GpuMetrics {
133    /// Device ID
134    pub device_id: String,
135    /// GPU utilization
136    pub utilization_percent: f64,
137    /// Memory utilization
138    pub memory_utilization_percent: f64,
139    /// Temperature
140    pub temperature: f64,
141    /// Power consumption
142    pub power_watts: f64,
143    /// Clock speeds
144    pub clocks: GpuClocks,
145    /// Throttle reasons
146    pub throttle_reasons: Vec<String>,
147}
148
/// GPU clock speeds, all in MHz.
///
/// `Default` yields zero for every clock.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct GpuClocks {
    /// Graphics clock (MHz).
    pub graphics: u32,
    /// Memory clock (MHz).
    pub memory: u32,
    /// SM clock (MHz).
    pub sm: u32,
}
159
160/// Network utilization metrics
161#[derive(Debug, Clone)]
162pub struct NetworkMetrics {
163    /// Bytes per second received
164    pub bytes_recv_per_sec: f64,
165    /// Bytes per second sent
166    pub bytes_sent_per_sec: f64,
167    /// Packets per second received
168    pub packets_recv_per_sec: f64,
169    /// Packets per second sent
170    pub packets_sent_per_sec: f64,
171    /// Network errors
172    pub errors: NetworkErrors,
173    /// Interface metrics
174    pub interfaces: HashMap<String, InterfaceMetrics>,
175}
176
/// Network error counters.
///
/// `Default` yields zero for every counter.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct NetworkErrors {
    /// Receive errors.
    pub rx_errors: u64,
    /// Transmit errors.
    pub tx_errors: u64,
    /// Dropped packets.
    pub dropped: u64,
    /// Collisions.
    pub collisions: u64,
}
189
/// Per-interface network metrics.
///
/// `Default` yields an empty name and zero for every numeric field.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct InterfaceMetrics {
    /// Interface name.
    pub name: String,
    /// Bytes received.
    pub bytes_recv: u64,
    /// Bytes sent.
    pub bytes_sent: u64,
    /// Utilization percentage.
    pub utilization_percent: f64,
    /// Interface speed (bps).
    pub speed: u64,
}
204
205/// Storage utilization metrics
206#[derive(Debug, Clone)]
207pub struct StorageMetrics {
208    /// Disk usage per mount
209    pub disk_usage: HashMap<String, DiskUsage>,
210    /// I/O metrics
211    pub io_metrics: IOMetrics,
212    /// Storage health
213    pub health: StorageHealth,
214}
215
/// Disk usage for a single mount point.
///
/// `Default` yields an empty mount point and zero for every numeric field.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct DiskUsage {
    /// Mount point.
    pub mount_point: String,
    /// Total space.
    pub total: u64,
    /// Used space.
    pub used: u64,
    /// Available space.
    pub available: u64,
    /// Usage percentage.
    pub usage_percent: f64,
}
230
/// Storage I/O metrics.
///
/// `Default` yields `0.0` for every field.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct IOMetrics {
    /// Read operations per second.
    pub read_ops_per_sec: f64,
    /// Write operations per second.
    pub write_ops_per_sec: f64,
    /// Read bandwidth (bytes/sec).
    pub read_bandwidth: f64,
    /// Write bandwidth (bytes/sec).
    pub write_bandwidth: f64,
    /// Average queue depth.
    pub avg_queue_depth: f64,
    /// I/O wait percentage.
    pub io_wait_percent: f64,
}
247
248/// Storage health indicators
249#[derive(Debug, Clone)]
250pub struct StorageHealth {
251    /// SMART status
252    pub smart_status: HashMap<String, SmartStatus>,
253    /// Temperature
254    pub temperature: Option<f64>,
255    /// Wear level (SSD)
256    pub wear_level: Option<f64>,
257}
258
/// SMART status for a storage device.
///
/// `Default` yields empty strings, no warnings and no temperature.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct SmartStatus {
    /// Device path.
    pub device: String,
    /// Health status.
    pub status: String,
    /// Critical warnings.
    pub warnings: Vec<String>,
    /// Temperature, when available.
    pub temperature: Option<f64>,
}
271
/// System-wide metrics.
///
/// `Default` yields a zero uptime and zero for every counter.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct SystemMetrics {
    /// Uptime.
    pub uptime: Duration,
    /// Process count.
    pub process_count: u32,
    /// Thread count.
    pub thread_count: u32,
    /// File descriptor count.
    pub fd_count: u32,
    /// System load.
    pub system_load: f64,
}
286
287/// Snapshot of metrics at a point in time
288#[derive(Debug, Clone)]
289pub struct MetricsSnapshot {
290    /// Timestamp
291    pub timestamp: SystemTime,
292    /// Metrics data
293    pub metrics: ResourceMetrics,
294}
295
296/// Alert system for resource monitoring
297#[derive(Debug)]
298pub struct AlertSystem {
299    /// Alert configuration
300    config: AlertConfig,
301    /// Active alerts
302    active_alerts: HashMap<String, Alert>,
303    /// Alert history
304    alert_history: VecDeque<AlertHistoryEntry>,
305    /// Alert channels
306    channels: Vec<Box<dyn AlertChannel>>,
307}
308
309/// Alert system configuration
310#[derive(Debug, Clone)]
311pub struct AlertConfig {
312    /// Enable alerts
313    pub enabled: bool,
314    /// Alert thresholds
315    pub thresholds: AlertThresholds,
316    /// Alert cooldown period
317    pub cooldown_period: Duration,
318    /// Maximum alerts per minute
319    pub rate_limit: u32,
320}
321
322/// Individual alert
323#[derive(Debug, Clone)]
324pub struct Alert {
325    /// Alert ID
326    pub id: String,
327    /// Alert type
328    pub alert_type: AlertType,
329    /// Severity level
330    pub severity: AlertSeverity,
331    /// Alert message
332    pub message: String,
333    /// Resource involved
334    pub resource: String,
335    /// Current value
336    pub current_value: f64,
337    /// Threshold value
338    pub threshold_value: f64,
339    /// Alert timestamp
340    pub timestamp: SystemTime,
341    /// Alert duration
342    pub duration: Duration,
343}
344
/// Types of alerts.
///
/// Implements `Eq` and `Hash` so alert types can be used as keys in hashed
/// collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum AlertType {
    /// CPU utilization is high.
    CpuHigh,
    /// Memory utilization is high.
    MemoryHigh,
    /// GPU utilization is high.
    GpuHigh,
    /// Network utilization is high.
    NetworkHigh,
    /// Storage utilization is high.
    StorageHigh,
    /// Storage is full.
    StorageFull,
    /// A resource is exhausted.
    ResourceExhaustion,
    /// Performance degradation.
    PerformanceDegradation,
    /// System error.
    SystemError,
    /// Custom alert type identified by a free-form name.
    Custom(String),
}
369
/// Alert severity levels, declared from least to most severe.
///
/// The derived ordering follows declaration order, so
/// `Info < Warning < Critical < Emergency`. Do not reorder the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertSeverity {
    /// Informational.
    Info,
    /// Warning.
    Warning,
    /// Critical.
    Critical,
    /// Emergency.
    Emergency,
}
382
383/// Alert history entry
384#[derive(Debug, Clone)]
385pub struct AlertHistoryEntry {
386    /// Alert
387    pub alert: Alert,
388    /// Resolution timestamp
389    pub resolved_at: Option<SystemTime>,
390    /// Resolution reason
391    pub resolution_reason: Option<String>,
392}
393
394/// Alert notification channel
395pub trait AlertChannel: Send + Sync + std::fmt::Debug {
396    /// Send an alert notification
397    fn send_alert(&self, alert: &Alert) -> SklResult<()>;
398
399    /// Get channel name
400    fn name(&self) -> &str;
401
402    /// Check if channel is enabled
403    fn is_enabled(&self) -> bool;
404}
405
406/// Monitor subscription for receiving metrics updates
407#[derive(Debug)]
408pub struct MonitorSubscription {
409    /// Subscription ID
410    pub id: String,
411    /// Metrics filter
412    pub filter: MetricsFilter,
413    /// Update interval
414    pub update_interval: Duration,
415    /// Callback channel
416    pub callback: mpsc::Sender<ResourceMetrics>,
417}
418
/// Filter for metrics subscriptions.
///
/// No `Default` is derived on purpose: an all-`false` filter would select
/// nothing, which is unlikely to be what a caller wants.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MetricsFilter {
    /// Include CPU metrics.
    pub include_cpu: bool,
    /// Include memory metrics.
    pub include_memory: bool,
    /// Include GPU metrics.
    pub include_gpu: bool,
    /// Include network metrics.
    pub include_network: bool,
    /// Include storage metrics.
    pub include_storage: bool,
    /// Specific resources to monitor; `None` applies no per-resource
    /// restriction (presumably — confirm against the code that applies it).
    pub resource_filter: Option<Vec<String>>,
}
435
436impl Default for ResourceMonitor {
437    fn default() -> Self {
438        Self::new()
439    }
440}
441
442impl ResourceMonitor {
443    /// Create a new resource monitor
444    #[must_use]
445    pub fn new() -> Self {
446        Self {
447            config: MonitorConfig {
448                sample_interval: Duration::from_secs(1),
449                history_retention: Duration::from_secs(24 * 60 * 60), // 24 hours
450                alerts_enabled: true,
451                detailed_metrics: true,
452                collector_threads: num_cpus::get(),
453            },
454            metrics: ResourceMetrics::default(),
455            alert_system: AlertSystem::new(),
456            history: VecDeque::new(),
457            subscriptions: HashMap::new(),
458        }
459    }
460
461    /// Start monitoring resources
462    pub fn start(&mut self) -> SklResult<()> {
463        // Implementation placeholder
464        Ok(())
465    }
466
467    /// Stop monitoring resources
468    pub fn stop(&mut self) -> SklResult<()> {
469        // Implementation placeholder
470        Ok(())
471    }
472
473    /// Get current resource metrics
474    #[must_use]
475    pub fn get_metrics(&self) -> &ResourceMetrics {
476        &self.metrics
477    }
478
479    /// Subscribe to metrics updates
480    pub fn subscribe(&mut self, subscription: MonitorSubscription) -> SklResult<String> {
481        let id = subscription.id.clone();
482        self.subscriptions.insert(id.clone(), subscription);
483        Ok(id)
484    }
485
486    /// Unsubscribe from metrics updates
487    pub fn unsubscribe(&mut self, subscription_id: &str) -> SklResult<()> {
488        self.subscriptions.remove(subscription_id);
489        Ok(())
490    }
491}
492
493impl Default for AlertSystem {
494    fn default() -> Self {
495        Self::new()
496    }
497}
498
499impl AlertSystem {
500    /// Create a new alert system
501    #[must_use]
502    pub fn new() -> Self {
503        Self {
504            config: AlertConfig {
505                enabled: true,
506                thresholds: AlertThresholds {
507                    cpu_threshold: 80.0,
508                    memory_threshold: 85.0,
509                    gpu_threshold: 90.0,
510                    network_threshold: 90.0,
511                    storage_threshold: 95.0,
512                },
513                cooldown_period: Duration::from_secs(300), // 5 minutes
514                rate_limit: 10,
515            },
516            active_alerts: HashMap::new(),
517            alert_history: VecDeque::new(),
518            channels: Vec::new(),
519        }
520    }
521
522    /// Add an alert channel
523    pub fn add_channel(&mut self, channel: Box<dyn AlertChannel>) {
524        self.channels.push(channel);
525    }
526
527    /// Process metrics and generate alerts
528    pub fn process_metrics(&mut self, metrics: &ResourceMetrics) -> SklResult<Vec<Alert>> {
529        let mut new_alerts = Vec::new();
530
531        // Check CPU alerts
532        if metrics.cpu.utilization_percent > self.config.thresholds.cpu_threshold {
533            let alert = Alert {
534                id: format!(
535                    "cpu-high-{}",
536                    SystemTime::now()
537                        .duration_since(SystemTime::UNIX_EPOCH)
538                        .unwrap()
539                        .as_secs()
540                ),
541                alert_type: AlertType::CpuHigh,
542                severity: AlertSeverity::Warning,
543                message: format!("CPU utilization is {:.1}%", metrics.cpu.utilization_percent),
544                resource: "CPU".to_string(),
545                current_value: metrics.cpu.utilization_percent,
546                threshold_value: self.config.thresholds.cpu_threshold,
547                timestamp: SystemTime::now(),
548                duration: Duration::from_secs(0),
549            };
550            new_alerts.push(alert);
551        }
552
553        // Check memory alerts
554        let memory_percent = (metrics.memory.used as f64 / metrics.memory.total as f64) * 100.0;
555        if memory_percent > self.config.thresholds.memory_threshold {
556            let alert = Alert {
557                id: format!(
558                    "memory-high-{}",
559                    SystemTime::now()
560                        .duration_since(SystemTime::UNIX_EPOCH)
561                        .unwrap()
562                        .as_secs()
563                ),
564                alert_type: AlertType::MemoryHigh,
565                severity: AlertSeverity::Warning,
566                message: format!("Memory usage is {memory_percent:.1}%"),
567                resource: "Memory".to_string(),
568                current_value: memory_percent,
569                threshold_value: self.config.thresholds.memory_threshold,
570                timestamp: SystemTime::now(),
571                duration: Duration::from_secs(0),
572            };
573            new_alerts.push(alert);
574        }
575
576        Ok(new_alerts)
577    }
578
579    /// Send alert through all channels
580    pub fn send_alert(&self, alert: &Alert) -> SklResult<()> {
581        for channel in &self.channels {
582            if channel.is_enabled() {
583                channel.send_alert(alert)?;
584            }
585        }
586        Ok(())
587    }
588}
589
590impl Default for ResourceMetrics {
591    fn default() -> Self {
592        Self {
593            cpu: CpuMetrics {
594                utilization_percent: 0.0,
595                per_core_utilization: Vec::new(),
596                load_average: LoadAverage {
597                    one_min: 0.0,
598                    five_min: 0.0,
599                    fifteen_min: 0.0,
600                },
601                context_switches: 0.0,
602                interrupts: 0.0,
603                temperature: None,
604            },
605            memory: MemoryMetrics {
606                total: 0,
607                used: 0,
608                available: 0,
609                buffers: 0,
610                swap: SwapMetrics {
611                    total: 0,
612                    used: 0,
613                    swap_in_rate: 0.0,
614                    swap_out_rate: 0.0,
615                },
616                pressure: MemoryPressure {
617                    some: 0.0,
618                    full: 0.0,
619                    avg10: 0.0,
620                    avg60: 0.0,
621                },
622            },
623            gpu: Vec::new(),
624            network: NetworkMetrics {
625                bytes_recv_per_sec: 0.0,
626                bytes_sent_per_sec: 0.0,
627                packets_recv_per_sec: 0.0,
628                packets_sent_per_sec: 0.0,
629                errors: NetworkErrors {
630                    rx_errors: 0,
631                    tx_errors: 0,
632                    dropped: 0,
633                    collisions: 0,
634                },
635                interfaces: HashMap::new(),
636            },
637            storage: StorageMetrics {
638                disk_usage: HashMap::new(),
639                io_metrics: IOMetrics {
640                    read_ops_per_sec: 0.0,
641                    write_ops_per_sec: 0.0,
642                    read_bandwidth: 0.0,
643                    write_bandwidth: 0.0,
644                    avg_queue_depth: 0.0,
645                    io_wait_percent: 0.0,
646                },
647                health: StorageHealth {
648                    smart_status: HashMap::new(),
649                    temperature: None,
650                    wear_level: None,
651                },
652            },
653            system: SystemMetrics {
654                uptime: Duration::from_secs(0),
655                process_count: 0,
656                thread_count: 0,
657                fd_count: 0,
658                system_load: 0.0,
659            },
660        }
661    }
662}