1use super::resource_types::AlertThresholds;
7use sklears_core::error::Result as SklResult;
8use std::collections::{HashMap, VecDeque};
9use std::sync::mpsc;
10use std::time::{Duration, SystemTime};
11
/// Holds the current resource metrics, a snapshot history, the alert system
/// and the registered metric subscriptions.
#[derive(Debug)]
pub struct ResourceMonitor {
    /// Sampling and retention settings.
    config: MonitorConfig,
    /// Current metrics; exposed read-only through `get_metrics`.
    metrics: ResourceMetrics,
    /// Alerting state owned by this monitor.
    alert_system: AlertSystem,
    /// Timestamped metric snapshots.
    /// NOTE(review): no code visible in this section pushes to or trims this queue.
    history: VecDeque<MetricsSnapshot>,
    /// Registered subscriptions, keyed by subscription id.
    subscriptions: HashMap<String, MonitorSubscription>,
}
26
/// Settings for a `ResourceMonitor`.
///
/// Derives `PartialEq`/`Eq` so configurations can be compared directly
/// (e.g. to detect a changed configuration or in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MonitorConfig {
    /// Time between samples (`ResourceMonitor::new` uses 1 second).
    pub sample_interval: Duration,
    /// How long history is meant to be kept (`ResourceMonitor::new` uses 24 hours).
    pub history_retention: Duration,
    /// Whether alerting is switched on.
    pub alerts_enabled: bool,
    /// Whether detailed metrics are requested.
    pub detailed_metrics: bool,
    /// Number of collector threads (`ResourceMonitor::new` uses the CPU count).
    pub collector_threads: usize,
}
41
/// One complete set of resource readings, grouped by subsystem.
#[derive(Debug, Clone)]
pub struct ResourceMetrics {
    /// CPU readings.
    pub cpu: CpuMetrics,
    /// Memory and swap readings.
    pub memory: MemoryMetrics,
    /// One entry per GPU; empty in the `Default` value.
    pub gpu: Vec<GpuMetrics>,
    /// Network readings.
    pub network: NetworkMetrics,
    /// Storage readings.
    pub storage: StorageMetrics,
    /// System-wide readings.
    pub system: SystemMetrics,
}
58
/// CPU readings.
#[derive(Debug, Clone)]
pub struct CpuMetrics {
    /// Overall CPU utilization in percent; `AlertSystem::process_metrics`
    /// compares this against the configured CPU threshold.
    pub utilization_percent: f64,
    /// Utilization per core (presumably percent, indexed by core — confirm).
    pub per_core_utilization: Vec<f64>,
    /// Load averages.
    pub load_average: LoadAverage,
    /// Context-switch figure (stored as `f64`; presumably a rate — confirm).
    pub context_switches: f64,
    /// Interrupt figure (stored as `f64`; presumably a rate — confirm).
    pub interrupts: f64,
    /// CPU temperature when available (unit not shown here — confirm).
    pub temperature: Option<f64>,
}
75
/// Load averages over three time windows.
///
/// Derives `Default` (all zeros) and `PartialEq` so values can be created
/// and compared without spelling out every field.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct LoadAverage {
    /// One-minute load average.
    pub one_min: f64,
    /// Five-minute load average.
    pub five_min: f64,
    /// Fifteen-minute load average.
    pub fifteen_min: f64,
}
86
/// Memory readings.
///
/// `AlertSystem::process_metrics` derives the usage percentage from
/// `used / total`, so both must be expressed in the same unit
/// (presumably bytes — confirm).
#[derive(Debug, Clone)]
pub struct MemoryMetrics {
    /// Total memory.
    pub total: u64,
    /// Memory in use.
    pub used: u64,
    /// Memory available.
    pub available: u64,
    /// Memory used for buffers.
    pub buffers: u64,
    /// Swap readings.
    pub swap: SwapMetrics,
    /// Memory-pressure readings.
    pub pressure: MemoryPressure,
}
103
/// Swap readings.
///
/// Derives `Default` (all zeros) and `PartialEq` for easy construction and
/// comparison.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SwapMetrics {
    /// Total swap (presumably bytes — confirm).
    pub total: u64,
    /// Swap in use (same unit as `total`).
    pub used: u64,
    /// Swap-in rate (unit not shown here — confirm).
    pub swap_in_rate: f64,
    /// Swap-out rate (unit not shown here — confirm).
    pub swap_out_rate: f64,
}
116
/// Memory-pressure readings.
///
/// NOTE(review): the field names resemble Linux pressure-stall information
/// (PSI) — confirm against the collector.
///
/// Derives `Default` (all zeros) and `PartialEq` for easy construction and
/// comparison.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct MemoryPressure {
    /// "some" pressure value.
    pub some: f64,
    /// "full" pressure value.
    pub full: f64,
    /// Average over a 10-unit window (presumably seconds — confirm).
    pub avg10: f64,
    /// Average over a 60-unit window (presumably seconds — confirm).
    pub avg60: f64,
}
129
/// Readings for a single GPU.
#[derive(Debug, Clone)]
pub struct GpuMetrics {
    /// Identifier of the device these readings belong to.
    pub device_id: String,
    /// GPU utilization in percent.
    pub utilization_percent: f64,
    /// GPU memory utilization in percent.
    pub memory_utilization_percent: f64,
    /// Device temperature (unit not shown here — confirm).
    pub temperature: f64,
    /// Power draw in watts.
    pub power_watts: f64,
    /// Clock readings.
    pub clocks: GpuClocks,
    /// Reasons the device is throttling, as free-form strings.
    pub throttle_reasons: Vec<String>,
}
148
/// GPU clock readings (unit not shown here; presumably MHz — confirm).
///
/// Derives `Default` (all zeros), `PartialEq` and `Eq` for easy construction
/// and comparison.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct GpuClocks {
    /// Graphics clock.
    pub graphics: u32,
    /// Memory clock.
    pub memory: u32,
    /// SM clock.
    pub sm: u32,
}
159
/// Network readings: aggregate rates, error counters and per-interface data.
#[derive(Debug, Clone)]
pub struct NetworkMetrics {
    /// Bytes received per second.
    pub bytes_recv_per_sec: f64,
    /// Bytes sent per second.
    pub bytes_sent_per_sec: f64,
    /// Packets received per second.
    pub packets_recv_per_sec: f64,
    /// Packets sent per second.
    pub packets_sent_per_sec: f64,
    /// Error counters.
    pub errors: NetworkErrors,
    /// Per-interface readings (presumably keyed by interface name — confirm).
    pub interfaces: HashMap<String, InterfaceMetrics>,
}
176
/// Network error counters.
///
/// Derives `Default` (all zeros), `PartialEq` and `Eq` for easy construction
/// and comparison.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct NetworkErrors {
    /// Receive errors.
    pub rx_errors: u64,
    /// Transmit errors.
    pub tx_errors: u64,
    /// Dropped packets.
    pub dropped: u64,
    /// Collisions.
    pub collisions: u64,
}
189
/// Readings for a single network interface.
///
/// Derives `PartialEq` so readings can be compared directly.
#[derive(Debug, Clone, PartialEq)]
pub struct InterfaceMetrics {
    /// Interface name.
    pub name: String,
    /// Bytes received.
    pub bytes_recv: u64,
    /// Bytes sent.
    pub bytes_sent: u64,
    /// Interface utilization in percent.
    pub utilization_percent: f64,
    /// Interface speed (unit not shown here — confirm).
    pub speed: u64,
}
204
/// Storage readings: per-disk usage, I/O activity and health.
#[derive(Debug, Clone)]
pub struct StorageMetrics {
    /// Usage per disk (presumably keyed by mount point or device — confirm).
    pub disk_usage: HashMap<String, DiskUsage>,
    /// I/O activity readings.
    pub io_metrics: IOMetrics,
    /// Health readings.
    pub health: StorageHealth,
}
215
/// Space usage of a single mounted filesystem.
///
/// Derives `PartialEq` so readings can be compared directly.
#[derive(Debug, Clone, PartialEq)]
pub struct DiskUsage {
    /// Mount point the figures refer to.
    pub mount_point: String,
    /// Total space (presumably bytes — confirm).
    pub total: u64,
    /// Used space (same unit as `total`).
    pub used: u64,
    /// Available space (same unit as `total`).
    pub available: u64,
    /// Usage in percent.
    pub usage_percent: f64,
}
230
/// Storage I/O activity readings.
///
/// Derives `Default` (all zeros) and `PartialEq` for easy construction and
/// comparison.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct IOMetrics {
    /// Read operations per second.
    pub read_ops_per_sec: f64,
    /// Write operations per second.
    pub write_ops_per_sec: f64,
    /// Read bandwidth (unit not shown here — confirm).
    pub read_bandwidth: f64,
    /// Write bandwidth (unit not shown here — confirm).
    pub write_bandwidth: f64,
    /// Average I/O queue depth.
    pub avg_queue_depth: f64,
    /// I/O wait in percent.
    pub io_wait_percent: f64,
}
247
/// Storage health readings.
#[derive(Debug, Clone)]
pub struct StorageHealth {
    /// SMART status per device (presumably keyed by device name — confirm).
    pub smart_status: HashMap<String, SmartStatus>,
    /// Storage temperature when available (unit not shown here — confirm).
    pub temperature: Option<f64>,
    /// Wear level when available (scale not shown here — confirm).
    pub wear_level: Option<f64>,
}
258
/// SMART report for a single storage device.
///
/// Derives `PartialEq` so reports can be compared directly.
#[derive(Debug, Clone, PartialEq)]
pub struct SmartStatus {
    /// Device the report refers to.
    pub device: String,
    /// Status as a free-form string.
    pub status: String,
    /// Warnings reported for the device.
    pub warnings: Vec<String>,
    /// Device temperature when available (unit not shown here — confirm).
    pub temperature: Option<f64>,
}
271
/// System-wide readings.
///
/// Derives `Default` (zero uptime, all counters zero) and `PartialEq` for
/// easy construction and comparison.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SystemMetrics {
    /// Time the system has been up.
    pub uptime: Duration,
    /// Number of processes.
    pub process_count: u32,
    /// Number of threads.
    pub thread_count: u32,
    /// Number of file descriptors.
    pub fd_count: u32,
    /// Overall system load figure (scale not shown here — confirm).
    pub system_load: f64,
}
286
/// A set of metrics paired with the time it refers to.
#[derive(Debug, Clone)]
pub struct MetricsSnapshot {
    /// Time associated with the snapshot.
    pub timestamp: SystemTime,
    /// The metrics captured in the snapshot.
    pub metrics: ResourceMetrics,
}
295
/// Threshold-based alerting: configuration, alert bookkeeping and the
/// channels alerts are delivered through.
#[derive(Debug)]
pub struct AlertSystem {
    /// Thresholds and alerting limits.
    config: AlertConfig,
    /// Currently active alerts (presumably keyed by alert id — confirm).
    /// NOTE(review): not written by any code visible in this section.
    active_alerts: HashMap<String, Alert>,
    /// Past alerts with their resolution details.
    /// NOTE(review): not written by any code visible in this section.
    alert_history: VecDeque<AlertHistoryEntry>,
    /// Delivery channels, in the order they were added via `add_channel`.
    channels: Vec<Box<dyn AlertChannel>>,
}
308
/// Alerting configuration.
#[derive(Debug, Clone)]
pub struct AlertConfig {
    /// Whether alerting is enabled.
    /// NOTE(review): `AlertSystem::process_metrics` does not consult this flag.
    pub enabled: bool,
    /// Per-resource thresholds; CPU and memory thresholds are compared against
    /// percentages in `AlertSystem::process_metrics`.
    pub thresholds: AlertThresholds,
    /// Cooldown between alerts (`AlertSystem::new` uses 300 seconds).
    /// NOTE(review): not enforced by any code visible in this section.
    pub cooldown_period: Duration,
    /// Alert rate limit (`AlertSystem::new` uses 10; time window not shown here).
    /// NOTE(review): not enforced by any code visible in this section.
    pub rate_limit: u32,
}
321
/// A single alert raised for a resource.
#[derive(Debug, Clone)]
pub struct Alert {
    /// Alert identifier; `AlertSystem::process_metrics` builds it as
    /// `<kind>-<seconds since the Unix epoch>` (e.g. `cpu-high-…`).
    pub id: String,
    /// Category of the alert.
    pub alert_type: AlertType,
    /// Severity of the alert.
    pub severity: AlertSeverity,
    /// Human-readable description.
    pub message: String,
    /// Name of the affected resource (e.g. `"CPU"`, `"Memory"`).
    pub resource: String,
    /// Measured value that triggered the alert.
    pub current_value: f64,
    /// Threshold the measured value was compared against.
    pub threshold_value: f64,
    /// Time the alert was created.
    pub timestamp: SystemTime,
    /// How long the condition has lasted; zero for alerts created by
    /// `AlertSystem::process_metrics`.
    pub duration: Duration,
}
344
/// Category of an alert.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq`, so alert types can be
/// used as `HashMap`/`HashSet` keys (e.g. to group or deduplicate alerts).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum AlertType {
    /// CPU utilization above its threshold.
    CpuHigh,
    /// Memory usage above its threshold.
    MemoryHigh,
    /// GPU-related high-usage alert.
    GpuHigh,
    /// Network-related high-usage alert.
    NetworkHigh,
    /// Storage-related high-usage alert.
    StorageHigh,
    /// Storage is full.
    StorageFull,
    /// A resource has been exhausted.
    ResourceExhaustion,
    /// Performance has degraded.
    PerformanceDegradation,
    /// A system error occurred.
    SystemError,
    /// Caller-defined alert category carrying its own label.
    Custom(String),
}
369
/// Severity of an alert, ordered from least to most severe.
///
/// The derived ordering follows declaration order, so
/// `Info < Warning < Critical < Emergency`. `Ord`/`Eq` make the type usable
/// with `max`, `min` and sorting; `Copy` and `Hash` are free for a fieldless
/// enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertSeverity {
    /// Informational.
    Info,
    /// Warning; used for the threshold alerts raised by
    /// `AlertSystem::process_metrics`.
    Warning,
    /// Critical.
    Critical,
    /// Emergency (highest severity).
    Emergency,
}
382
/// An alert together with its resolution details.
#[derive(Debug, Clone)]
pub struct AlertHistoryEntry {
    /// The recorded alert.
    pub alert: Alert,
    /// Time the alert was resolved, if it has been.
    pub resolved_at: Option<SystemTime>,
    /// Why the alert was resolved, if known.
    pub resolution_reason: Option<String>,
}
393
/// A destination alerts can be delivered to.
///
/// Implementors must be `Send + Sync` and implement `Debug`.
pub trait AlertChannel: Send + Sync + std::fmt::Debug {
    /// Delivers `alert` through this channel.
    ///
    /// # Errors
    ///
    /// Returns an error when the implementation fails to deliver the alert.
    fn send_alert(&self, alert: &Alert) -> SklResult<()>;

    /// Name of the channel.
    fn name(&self) -> &str;

    /// Whether the channel currently accepts alerts;
    /// `AlertSystem::send_alert` skips channels that return `false`.
    fn is_enabled(&self) -> bool;
}
405
/// A registration for receiving metrics over a channel.
#[derive(Debug)]
pub struct MonitorSubscription {
    /// Subscription id; used as the registry key by `ResourceMonitor::subscribe`.
    pub id: String,
    /// Which metric groups the subscriber asks for.
    pub filter: MetricsFilter,
    /// Requested time between updates.
    pub update_interval: Duration,
    /// Sending half of the channel metrics are delivered on.
    /// NOTE(review): no code visible in this section sends on it.
    pub callback: mpsc::Sender<ResourceMetrics>,
}
418
/// Selects which metric groups a subscriber wants.
///
/// Derives `PartialEq`/`Eq` so filters can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MetricsFilter {
    /// Include CPU metrics.
    pub include_cpu: bool,
    /// Include memory metrics.
    pub include_memory: bool,
    /// Include GPU metrics.
    pub include_gpu: bool,
    /// Include network metrics.
    pub include_network: bool,
    /// Include storage metrics.
    pub include_storage: bool,
    /// Optional list of resource names to restrict to; `None` means no
    /// restriction is given (matching rules are not shown here — confirm).
    pub resource_filter: Option<Vec<String>>,
}
435
436impl Default for ResourceMonitor {
437 fn default() -> Self {
438 Self::new()
439 }
440}
441
442impl ResourceMonitor {
443 #[must_use]
445 pub fn new() -> Self {
446 Self {
447 config: MonitorConfig {
448 sample_interval: Duration::from_secs(1),
449 history_retention: Duration::from_secs(24 * 60 * 60), alerts_enabled: true,
451 detailed_metrics: true,
452 collector_threads: num_cpus::get(),
453 },
454 metrics: ResourceMetrics::default(),
455 alert_system: AlertSystem::new(),
456 history: VecDeque::new(),
457 subscriptions: HashMap::new(),
458 }
459 }
460
461 pub fn start(&mut self) -> SklResult<()> {
463 Ok(())
465 }
466
467 pub fn stop(&mut self) -> SklResult<()> {
469 Ok(())
471 }
472
473 #[must_use]
475 pub fn get_metrics(&self) -> &ResourceMetrics {
476 &self.metrics
477 }
478
479 pub fn subscribe(&mut self, subscription: MonitorSubscription) -> SklResult<String> {
481 let id = subscription.id.clone();
482 self.subscriptions.insert(id.clone(), subscription);
483 Ok(id)
484 }
485
486 pub fn unsubscribe(&mut self, subscription_id: &str) -> SklResult<()> {
488 self.subscriptions.remove(subscription_id);
489 Ok(())
490 }
491}
492
493impl Default for AlertSystem {
494 fn default() -> Self {
495 Self::new()
496 }
497}
498
499impl AlertSystem {
500 #[must_use]
502 pub fn new() -> Self {
503 Self {
504 config: AlertConfig {
505 enabled: true,
506 thresholds: AlertThresholds {
507 cpu_threshold: 80.0,
508 memory_threshold: 85.0,
509 gpu_threshold: 90.0,
510 network_threshold: 90.0,
511 storage_threshold: 95.0,
512 },
513 cooldown_period: Duration::from_secs(300), rate_limit: 10,
515 },
516 active_alerts: HashMap::new(),
517 alert_history: VecDeque::new(),
518 channels: Vec::new(),
519 }
520 }
521
522 pub fn add_channel(&mut self, channel: Box<dyn AlertChannel>) {
524 self.channels.push(channel);
525 }
526
527 pub fn process_metrics(&mut self, metrics: &ResourceMetrics) -> SklResult<Vec<Alert>> {
529 let mut new_alerts = Vec::new();
530
531 if metrics.cpu.utilization_percent > self.config.thresholds.cpu_threshold {
533 let alert = Alert {
534 id: format!(
535 "cpu-high-{}",
536 SystemTime::now()
537 .duration_since(SystemTime::UNIX_EPOCH)
538 .unwrap()
539 .as_secs()
540 ),
541 alert_type: AlertType::CpuHigh,
542 severity: AlertSeverity::Warning,
543 message: format!("CPU utilization is {:.1}%", metrics.cpu.utilization_percent),
544 resource: "CPU".to_string(),
545 current_value: metrics.cpu.utilization_percent,
546 threshold_value: self.config.thresholds.cpu_threshold,
547 timestamp: SystemTime::now(),
548 duration: Duration::from_secs(0),
549 };
550 new_alerts.push(alert);
551 }
552
553 let memory_percent = (metrics.memory.used as f64 / metrics.memory.total as f64) * 100.0;
555 if memory_percent > self.config.thresholds.memory_threshold {
556 let alert = Alert {
557 id: format!(
558 "memory-high-{}",
559 SystemTime::now()
560 .duration_since(SystemTime::UNIX_EPOCH)
561 .unwrap()
562 .as_secs()
563 ),
564 alert_type: AlertType::MemoryHigh,
565 severity: AlertSeverity::Warning,
566 message: format!("Memory usage is {memory_percent:.1}%"),
567 resource: "Memory".to_string(),
568 current_value: memory_percent,
569 threshold_value: self.config.thresholds.memory_threshold,
570 timestamp: SystemTime::now(),
571 duration: Duration::from_secs(0),
572 };
573 new_alerts.push(alert);
574 }
575
576 Ok(new_alerts)
577 }
578
579 pub fn send_alert(&self, alert: &Alert) -> SklResult<()> {
581 for channel in &self.channels {
582 if channel.is_enabled() {
583 channel.send_alert(alert)?;
584 }
585 }
586 Ok(())
587 }
588}
589
590impl Default for ResourceMetrics {
591 fn default() -> Self {
592 Self {
593 cpu: CpuMetrics {
594 utilization_percent: 0.0,
595 per_core_utilization: Vec::new(),
596 load_average: LoadAverage {
597 one_min: 0.0,
598 five_min: 0.0,
599 fifteen_min: 0.0,
600 },
601 context_switches: 0.0,
602 interrupts: 0.0,
603 temperature: None,
604 },
605 memory: MemoryMetrics {
606 total: 0,
607 used: 0,
608 available: 0,
609 buffers: 0,
610 swap: SwapMetrics {
611 total: 0,
612 used: 0,
613 swap_in_rate: 0.0,
614 swap_out_rate: 0.0,
615 },
616 pressure: MemoryPressure {
617 some: 0.0,
618 full: 0.0,
619 avg10: 0.0,
620 avg60: 0.0,
621 },
622 },
623 gpu: Vec::new(),
624 network: NetworkMetrics {
625 bytes_recv_per_sec: 0.0,
626 bytes_sent_per_sec: 0.0,
627 packets_recv_per_sec: 0.0,
628 packets_sent_per_sec: 0.0,
629 errors: NetworkErrors {
630 rx_errors: 0,
631 tx_errors: 0,
632 dropped: 0,
633 collisions: 0,
634 },
635 interfaces: HashMap::new(),
636 },
637 storage: StorageMetrics {
638 disk_usage: HashMap::new(),
639 io_metrics: IOMetrics {
640 read_ops_per_sec: 0.0,
641 write_ops_per_sec: 0.0,
642 read_bandwidth: 0.0,
643 write_bandwidth: 0.0,
644 avg_queue_depth: 0.0,
645 io_wait_percent: 0.0,
646 },
647 health: StorageHealth {
648 smart_status: HashMap::new(),
649 temperature: None,
650 wear_level: None,
651 },
652 },
653 system: SystemMetrics {
654 uptime: Duration::from_secs(0),
655 process_count: 0,
656 thread_count: 0,
657 fd_count: 0,
658 system_load: 0.0,
659 },
660 }
661 }
662}