Skip to main content

entrenar/monitor/gpu/alert/
system.rs

1//! GPU Andon system for alert management.
2
3use super::thresholds::AndonThresholds;
4use super::types::GpuAlert;
5use crate::monitor::gpu::GpuMetrics;
6
/// GPU Andon system for alert management.
///
/// Holds the configured thresholds, the alerts produced by the most recent
/// call to `check`, and per-device idle counters that persist across calls.
#[derive(Debug)]
pub struct GpuAndonSystem {
    /// Alert thresholds that every metrics sample is compared against.
    thresholds: AndonThresholds,
    /// Alerts produced by the most recent `check` call (cleared at the start
    /// of each call).
    alerts: Vec<GpuAlert>,
    /// Consecutive idle-sample counters, indexed by `device_id as usize`.
    /// A counter is reset to zero when its device reports non-zero utilization.
    idle_samples: Vec<u32>,
    /// Sample interval in seconds; multiplied by the idle-sample count to
    /// obtain the idle duration. Constructors initialise it to 1.
    sample_interval_secs: u32,
}
19
20impl GpuAndonSystem {
21    /// Create a new Andon system with default thresholds
22    pub fn new() -> Self {
23        Self::with_thresholds(AndonThresholds::default())
24    }
25
26    /// Create with custom thresholds
27    pub fn with_thresholds(thresholds: AndonThresholds) -> Self {
28        Self { thresholds, alerts: Vec::new(), idle_samples: Vec::new(), sample_interval_secs: 1 }
29    }
30
31    /// Set sample interval for idle calculation
32    pub fn set_sample_interval(&mut self, secs: u32) {
33        self.sample_interval_secs = secs;
34    }
35
36    /// Check metrics and generate alerts
37    pub fn check(&mut self, metrics: &[GpuMetrics]) -> Vec<GpuAlert> {
38        self.alerts.clear();
39
40        // Ensure idle_samples is sized correctly
41        if self.idle_samples.len() < metrics.len() {
42            self.idle_samples.resize(metrics.len(), 0);
43        }
44
45        for m in metrics {
46            // Thermal check
47            if m.temperature_celsius >= self.thresholds.thermal_warning {
48                self.alerts.push(GpuAlert::ThermalThrottling {
49                    device: m.device_id,
50                    temp: m.temperature_celsius,
51                    threshold: self.thresholds.thermal_warning,
52                });
53            }
54
55            // Memory pressure check
56            let mem_percent = m.memory_percent() as u32;
57            if mem_percent >= self.thresholds.memory_warning {
58                self.alerts.push(GpuAlert::MemoryPressure {
59                    device: m.device_id,
60                    used_percent: mem_percent,
61                    threshold: self.thresholds.memory_warning,
62                });
63            }
64
65            // Power limit check
66            let power_percent = m.power_percent() as u32;
67            if power_percent >= self.thresholds.power_warning {
68                self.alerts.push(GpuAlert::PowerLimit {
69                    device: m.device_id,
70                    power_percent,
71                    threshold: self.thresholds.power_warning,
72                });
73            }
74
75            // Idle check
76            let device_idx = m.device_id as usize;
77            if device_idx < self.idle_samples.len() {
78                if m.utilization_percent == 0 {
79                    self.idle_samples[device_idx] += 1;
80                    let idle_secs = self.idle_samples[device_idx] * self.sample_interval_secs;
81                    if idle_secs >= self.thresholds.idle_threshold_secs {
82                        self.alerts.push(GpuAlert::GpuIdle {
83                            device: m.device_id,
84                            duration_secs: idle_secs,
85                        });
86                    }
87                } else {
88                    self.idle_samples[device_idx] = 0;
89                }
90            }
91        }
92
93        self.alerts.clone()
94    }
95
96    /// Get current alerts
97    pub fn alerts(&self) -> &[GpuAlert] {
98        &self.alerts
99    }
100
101    /// Get thresholds
102    pub fn thresholds(&self) -> &AndonThresholds {
103        &self.thresholds
104    }
105
106    /// Check if any critical alerts are active
107    pub fn has_critical_alerts(&self) -> bool {
108        self.alerts.iter().any(|a| a.severity() >= 80)
109    }
110}
111
112impl Default for GpuAndonSystem {
113    fn default() -> Self {
114        Self::new()
115    }
116}