entrenar/monitor/gpu/alert/
system.rs1use super::thresholds::AndonThresholds;
4use super::types::GpuAlert;
5use crate::monitor::gpu::GpuMetrics;
6
7#[derive(Debug)]
9pub struct GpuAndonSystem {
10 thresholds: AndonThresholds,
12 alerts: Vec<GpuAlert>,
14 idle_samples: Vec<u32>,
16 sample_interval_secs: u32,
18}
19
20impl GpuAndonSystem {
21 pub fn new() -> Self {
23 Self::with_thresholds(AndonThresholds::default())
24 }
25
26 pub fn with_thresholds(thresholds: AndonThresholds) -> Self {
28 Self { thresholds, alerts: Vec::new(), idle_samples: Vec::new(), sample_interval_secs: 1 }
29 }
30
31 pub fn set_sample_interval(&mut self, secs: u32) {
33 self.sample_interval_secs = secs;
34 }
35
36 pub fn check(&mut self, metrics: &[GpuMetrics]) -> Vec<GpuAlert> {
38 self.alerts.clear();
39
40 if self.idle_samples.len() < metrics.len() {
42 self.idle_samples.resize(metrics.len(), 0);
43 }
44
45 for m in metrics {
46 if m.temperature_celsius >= self.thresholds.thermal_warning {
48 self.alerts.push(GpuAlert::ThermalThrottling {
49 device: m.device_id,
50 temp: m.temperature_celsius,
51 threshold: self.thresholds.thermal_warning,
52 });
53 }
54
55 let mem_percent = m.memory_percent() as u32;
57 if mem_percent >= self.thresholds.memory_warning {
58 self.alerts.push(GpuAlert::MemoryPressure {
59 device: m.device_id,
60 used_percent: mem_percent,
61 threshold: self.thresholds.memory_warning,
62 });
63 }
64
65 let power_percent = m.power_percent() as u32;
67 if power_percent >= self.thresholds.power_warning {
68 self.alerts.push(GpuAlert::PowerLimit {
69 device: m.device_id,
70 power_percent,
71 threshold: self.thresholds.power_warning,
72 });
73 }
74
75 let device_idx = m.device_id as usize;
77 if device_idx < self.idle_samples.len() {
78 if m.utilization_percent == 0 {
79 self.idle_samples[device_idx] += 1;
80 let idle_secs = self.idle_samples[device_idx] * self.sample_interval_secs;
81 if idle_secs >= self.thresholds.idle_threshold_secs {
82 self.alerts.push(GpuAlert::GpuIdle {
83 device: m.device_id,
84 duration_secs: idle_secs,
85 });
86 }
87 } else {
88 self.idle_samples[device_idx] = 0;
89 }
90 }
91 }
92
93 self.alerts.clone()
94 }
95
96 pub fn alerts(&self) -> &[GpuAlert] {
98 &self.alerts
99 }
100
101 pub fn thresholds(&self) -> &AndonThresholds {
103 &self.thresholds
104 }
105
106 pub fn has_critical_alerts(&self) -> bool {
108 self.alerts.iter().any(|a| a.severity() >= 80)
109 }
110}
111
112impl Default for GpuAndonSystem {
113 fn default() -> Self {
114 Self::new()
115 }
116}