// trueno_gpu/monitor/stress_test/results.rs
use std::time::{Duration, Instant};

use super::config::StressTestConfig;

/// Running aggregate of what was observed during one stress-test run.
///
/// Every field is public and starts at its `Default` value (zero / empty).
/// The methods on this type only ever raise the peaks, bump the counters,
/// append to the error lists, or recompute the degradation percentage.
#[derive(Debug, Clone, Default)]
pub struct StressMetrics {
    /// Largest `cpu_util` sample seen by [`StressMetrics::update_peaks`].
    pub peak_cpu_utilization: f64,
    /// Largest `gpu_util` sample seen by [`StressMetrics::update_peaks`].
    pub peak_gpu_utilization: f64,
    /// Largest `mem_util` sample seen by [`StressMetrics::update_peaks`].
    pub peak_memory_utilization: f64,
    /// Largest `temp_c` sample seen (degrees Celsius, going by the name).
    pub peak_temperature_c: f64,
    /// Largest `power_w` sample seen (watts, going by the name).
    pub peak_power_watts: f64,
    /// Largest `pcie_gbps` sample seen by [`StressMetrics::update_peaks`].
    pub peak_pcie_bandwidth_gbps: f64,

    /// Number of [`StressMetrics::record_thermal_throttle`] calls.
    pub thermal_throttle_count: u32,
    /// Number of [`StressMetrics::record_power_throttle`] calls.
    pub power_throttle_count: u32,
    /// Number of [`StressMetrics::record_memory_pressure`] calls.
    pub memory_pressure_events: u32,

    /// Messages appended through [`StressMetrics::add_gpu_error`].
    pub gpu_errors: Vec<String>,
    /// Messages appended through [`StressMetrics::add_memory_error`].
    pub memory_errors: Vec<String>,
    /// Messages appended through [`StressMetrics::add_transfer_error`].
    pub transfer_errors: Vec<String>,

    /// Reference throughput; the denominator used by
    /// [`StressMetrics::calculate_degradation`]. Set by the caller.
    pub baseline_flops: f64,
    /// Throughput measured under load. Set by the caller.
    pub achieved_flops: f64,
    /// Result of the last successful
    /// [`StressMetrics::calculate_degradation`] call, in percent.
    pub performance_degradation_pct: f64,

    /// Not written by any method of this type; set by the caller.
    pub duration_actual: Duration,
    /// Number of [`StressMetrics::update_peaks`] calls (saturating).
    pub sample_count: u32,
}

impl StressMetrics {
    /// Creates an empty record; equivalent to `StressMetrics::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Folds one sample into the running peaks and counts the sample.
    ///
    /// Each peak becomes `max(previous, sample)`. `f64::max` ignores a NaN
    /// operand, so a NaN sample leaves the corresponding peak untouched.
    /// Because the peaks start at `0.0`, negative samples never register.
    pub fn update_peaks(
        &mut self,
        cpu_util: f64,
        gpu_util: f64,
        mem_util: f64,
        temp_c: f64,
        power_w: f64,
        pcie_gbps: f64,
    ) {
        self.peak_cpu_utilization = self.peak_cpu_utilization.max(cpu_util);
        self.peak_gpu_utilization = self.peak_gpu_utilization.max(gpu_util);
        self.peak_memory_utilization = self.peak_memory_utilization.max(mem_util);
        self.peak_temperature_c = self.peak_temperature_c.max(temp_c);
        self.peak_power_watts = self.peak_power_watts.max(power_w);
        self.peak_pcie_bandwidth_gbps = self.peak_pcie_bandwidth_gbps.max(pcie_gbps);
        // Saturate rather than overflow: `+= 1` at `u32::MAX` would panic in
        // debug builds and silently wrap to 0 in release builds.
        self.sample_count = self.sample_count.saturating_add(1);
    }

    /// Counts one thermal-throttle event (saturating at `u32::MAX`).
    pub fn record_thermal_throttle(&mut self) {
        self.thermal_throttle_count = self.thermal_throttle_count.saturating_add(1);
    }

    /// Counts one power-throttle event (saturating at `u32::MAX`).
    pub fn record_power_throttle(&mut self) {
        self.power_throttle_count = self.power_throttle_count.saturating_add(1);
    }

    /// Counts one memory-pressure event (saturating at `u32::MAX`).
    pub fn record_memory_pressure(&mut self) {
        self.memory_pressure_events = self.memory_pressure_events.saturating_add(1);
    }

    /// Appends a message to `gpu_errors`.
    pub fn add_gpu_error(&mut self, error: impl Into<String>) {
        self.gpu_errors.push(error.into());
    }

    /// Appends a message to `memory_errors`.
    pub fn add_memory_error(&mut self, error: impl Into<String>) {
        self.memory_errors.push(error.into());
    }

    /// Appends a message to `transfer_errors`.
    pub fn add_transfer_error(&mut self, error: impl Into<String>) {
        self.transfer_errors.push(error.into());
    }

    /// Recomputes `performance_degradation_pct` as
    /// `(baseline_flops - achieved_flops) / baseline_flops * 100`.
    ///
    /// When `baseline_flops` is not strictly positive (zero, negative or
    /// NaN) the stored percentage is left exactly as it was. The result is
    /// negative when `achieved_flops` exceeds the baseline.
    pub fn calculate_degradation(&mut self) {
        if self.baseline_flops > 0.0 {
            let shortfall = self.baseline_flops - self.achieved_flops;
            self.performance_degradation_pct = (shortfall / self.baseline_flops) * 100.0;
        }
    }

    /// Returns `true` when at least one of the three error lists is non-empty.
    #[must_use]
    pub fn has_errors(&self) -> bool {
        !self.gpu_errors.is_empty()
            || !self.memory_errors.is_empty()
            || !self.transfer_errors.is_empty()
    }

    /// Returns the combined length of the three error lists.
    #[must_use]
    pub fn total_errors(&self) -> usize {
        self.gpu_errors.len() + self.memory_errors.len() + self.transfer_errors.len()
    }
}

141#[derive(Debug, Clone)]
147pub struct StressTestReport {
148 pub config: StressTestConfig,
150 pub metrics: StressMetrics,
152 pub duration_actual: Duration,
154 pub verdict: StressTestVerdict,
156 pub recommendations: Vec<String>,
158 pub timestamp: Instant,
160}
161
162impl StressTestReport {
163 #[must_use]
165 pub fn new(config: StressTestConfig, metrics: StressMetrics, duration: Duration) -> Self {
166 let verdict = Self::calculate_verdict(&metrics);
167 let recommendations = Self::generate_recommendations(&metrics, verdict);
168
169 Self {
170 config,
171 metrics,
172 duration_actual: duration,
173 verdict,
174 recommendations,
175 timestamp: Instant::now(),
176 }
177 }
178
179 fn calculate_verdict(metrics: &StressMetrics) -> StressTestVerdict {
180 if metrics.has_errors() {
182 return StressTestVerdict::Fail;
183 }
184
185 if metrics.thermal_throttle_count > 10 {
187 return StressTestVerdict::Fail;
188 }
189 if metrics.peak_temperature_c > 95.0 {
190 return StressTestVerdict::Fail;
191 }
192 if metrics.performance_degradation_pct > 50.0 {
193 return StressTestVerdict::Fail;
194 }
195
196 if metrics.thermal_throttle_count > 0
198 || metrics.power_throttle_count > 0
199 || metrics.memory_pressure_events > 0
200 {
201 return StressTestVerdict::PassWithNotes;
202 }
203
204 StressTestVerdict::Pass
205 }
206
207 fn generate_recommendations(
208 metrics: &StressMetrics,
209 verdict: StressTestVerdict,
210 ) -> Vec<String> {
211 let mut recs = Vec::new();
212
213 if metrics.peak_temperature_c > 85.0 {
214 recs.push("Consider improving cooling - peak temperature exceeded 85°C".to_string());
215 }
216
217 if metrics.thermal_throttle_count > 0 {
218 recs.push(format!(
219 "Thermal throttling detected {} times - reduce workload or improve cooling",
220 metrics.thermal_throttle_count
221 ));
222 }
223
224 if metrics.power_throttle_count > 0 {
225 recs.push(format!(
226 "Power throttling detected {} times - check power supply capacity",
227 metrics.power_throttle_count
228 ));
229 }
230
231 if metrics.memory_pressure_events > 0 {
232 recs.push(format!(
233 "Memory pressure detected {} times - consider reducing parallel jobs",
234 metrics.memory_pressure_events
235 ));
236 }
237
238 if metrics.performance_degradation_pct > 10.0 {
239 recs.push(format!(
240 "Performance degraded by {:.1}% under load - investigate bottlenecks",
241 metrics.performance_degradation_pct
242 ));
243 }
244
245 if verdict == StressTestVerdict::Pass && recs.is_empty() {
246 recs.push("System passed all stress tests - no issues detected".to_string());
247 }
248
249 recs
250 }
251
252 #[must_use]
254 pub fn to_json(&self) -> String {
255 format!(
257 r#"{{
258 "verdict": "{}",
259 "duration_seconds": {:.1},
260 "peak_cpu_pct": {:.1},
261 "peak_gpu_pct": {:.1},
262 "peak_memory_pct": {:.1},
263 "peak_temp_c": {:.1},
264 "peak_power_w": {:.1},
265 "thermal_throttles": {},
266 "power_throttles": {},
267 "memory_pressure_events": {},
268 "total_errors": {},
269 "performance_degradation_pct": {:.1},
270 "recommendations": {:?}
271}}"#,
272 self.verdict,
273 self.duration_actual.as_secs_f64(),
274 self.metrics.peak_cpu_utilization,
275 self.metrics.peak_gpu_utilization,
276 self.metrics.peak_memory_utilization,
277 self.metrics.peak_temperature_c,
278 self.metrics.peak_power_watts,
279 self.metrics.thermal_throttle_count,
280 self.metrics.power_throttle_count,
281 self.metrics.memory_pressure_events,
282 self.metrics.total_errors(),
283 self.metrics.performance_degradation_pct,
284 self.recommendations
285 )
286 }
287}
288
/// Overall outcome assigned to a finished stress test.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StressTestVerdict {
    /// No problems were noted; displayed as `PASS`.
    Pass,
    /// Passed, but with something worth reporting; displayed as
    /// `PASS_WITH_NOTES`.
    PassWithNotes,
    /// The run failed; displayed as `FAIL`.
    Fail,
}

impl std::fmt::Display for StressTestVerdict {
    /// Writes the upper-case label for the verdict.
    ///
    /// Uses `Formatter::pad`, so width, fill and alignment flags such as
    /// `{:<16}` are honoured (a precision such as `{:.3}` truncates the
    /// label). Plain `{}` yields exactly the bare label.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Pass => "PASS",
            Self::PassWithNotes => "PASS_WITH_NOTES",
            Self::Fail => "FAIL",
        };
        f.pad(label)
    }
}

/// Lifecycle phase of a stress test.
///
/// NOTE(review): the transitions between phases are driven elsewhere; the
/// per-variant notes below describe only the display label.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StressTestState {
    /// Displayed as `Idle`.
    Idle,
    /// Displayed as `Ramp-Up`.
    RampUp,
    /// Displayed as `Running`.
    Running,
    /// Displayed as `Cool-Down`.
    CoolDown,
    /// Displayed as `Completed`.
    Completed,
    /// Displayed as `Aborted`.
    Aborted,
}

impl std::fmt::Display for StressTestState {
    /// Writes the human-readable label for the state.
    ///
    /// Uses `Formatter::pad`, so width, fill and alignment flags such as
    /// `{:<10}` are honoured (a precision such as `{:.3}` truncates the
    /// label). Plain `{}` yields exactly the bare label.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Idle => "Idle",
            Self::RampUp => "Ramp-Up",
            Self::Running => "Running",
            Self::CoolDown => "Cool-Down",
            Self::Completed => "Completed",
            Self::Aborted => "Aborted",
        };
        f.pad(label)
    }
}