Skip to main content

trueno_gpu/monitor/stress_test/
results.rs

//! Stress test results, metrics, and reporting.
//!
//! Contains [`StressMetrics`], [`StressTestReport`], [`StressTestVerdict`],
//! and [`StressTestState`].
5
6use std::time::{Duration, Instant};
7
8use super::config::StressTestConfig;
9
10// ============================================================================
11// Stress Test Metrics
12// ============================================================================
13
/// Measurements accumulated while a stress test is running.
///
/// All fields start at their zero/empty [`Default`] values. Because the peak
/// fields start at `0.0` and are only ever raised via [`f64::max`], a sample
/// below zero never becomes the recorded peak.
#[derive(Debug, Clone, Default)]
pub struct StressMetrics {
    // --- Peak values ---
    /// Highest CPU utilization seen in any sample
    pub peak_cpu_utilization: f64,
    /// Highest GPU utilization seen in any sample
    pub peak_gpu_utilization: f64,
    /// Highest memory utilization seen in any sample
    pub peak_memory_utilization: f64,
    /// Highest temperature seen in any sample, in Celsius
    pub peak_temperature_c: f64,
    /// Highest power draw seen in any sample, in Watts
    pub peak_power_watts: f64,
    /// Highest PCIe bandwidth seen in any sample, in GB/s
    pub peak_pcie_bandwidth_gbps: f64,

    // --- Throttling events ---
    /// How many thermal throttle events were recorded
    pub thermal_throttle_count: u32,
    /// How many power throttle events were recorded
    pub power_throttle_count: u32,
    /// How many memory pressure events were recorded
    pub memory_pressure_events: u32,

    // --- Errors ---
    /// Messages for GPU errors, in the order they were added
    pub gpu_errors: Vec<String>,
    /// Messages for memory errors, in the order they were added
    pub memory_errors: Vec<String>,
    /// Messages for transfer errors, in the order they were added
    pub transfer_errors: Vec<String>,

    // --- Performance ---
    /// FLOPS measured before the stress phase
    pub baseline_flops: f64,
    /// FLOPS measured during the stress phase
    pub achieved_flops: f64,
    /// Percentage drop from baseline to achieved FLOPS
    /// (filled in by [`StressMetrics::calculate_degradation`])
    pub performance_degradation_pct: f64,

    // --- Timing ---
    /// How long the test actually ran
    pub duration_actual: Duration,
    /// How many samples have been folded in via [`StressMetrics::update_peaks`]
    pub sample_count: u32,
}

impl StressMetrics {
    /// Build an empty metrics record (identical to [`Default::default`]).
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Fold one sample into the running peaks and bump the sample counter.
    ///
    /// Each peak keeps the larger of its current value and the new reading.
    /// [`f64::max`] ignores a NaN operand, so a NaN reading leaves the
    /// corresponding peak untouched.
    pub fn update_peaks(
        &mut self,
        cpu_util: f64,
        gpu_util: f64,
        mem_util: f64,
        temp_c: f64,
        power_w: f64,
        pcie_gbps: f64,
    ) {
        // Pair every peak slot with the reading that may raise it.
        let tracked = [
            (&mut self.peak_cpu_utilization, cpu_util),
            (&mut self.peak_gpu_utilization, gpu_util),
            (&mut self.peak_memory_utilization, mem_util),
            (&mut self.peak_temperature_c, temp_c),
            (&mut self.peak_power_watts, power_w),
            (&mut self.peak_pcie_bandwidth_gbps, pcie_gbps),
        ];
        for (peak, reading) in tracked {
            *peak = (*peak).max(reading);
        }

        self.sample_count += 1;
    }

    /// Count one more thermal throttle event.
    pub fn record_thermal_throttle(&mut self) {
        self.thermal_throttle_count += 1;
    }

    /// Count one more power throttle event.
    pub fn record_power_throttle(&mut self) {
        self.power_throttle_count += 1;
    }

    /// Count one more memory pressure event.
    pub fn record_memory_pressure(&mut self) {
        self.memory_pressure_events += 1;
    }

    /// Append a GPU error message.
    pub fn add_gpu_error(&mut self, error: impl Into<String>) {
        let message: String = error.into();
        self.gpu_errors.push(message);
    }

    /// Append a memory error message.
    pub fn add_memory_error(&mut self, error: impl Into<String>) {
        let message: String = error.into();
        self.memory_errors.push(message);
    }

    /// Append a transfer error message.
    pub fn add_transfer_error(&mut self, error: impl Into<String>) {
        let message: String = error.into();
        self.transfer_errors.push(message);
    }

    /// Recompute `performance_degradation_pct` from the FLOPS fields.
    ///
    /// The value is `(baseline - achieved) / baseline * 100`. When
    /// `baseline_flops` is not strictly positive (including NaN) the stored
    /// percentage is left as it was.
    pub fn calculate_degradation(&mut self) {
        let baseline = self.baseline_flops;
        if baseline > 0.0 {
            let lost = baseline - self.achieved_flops;
            self.performance_degradation_pct = (lost / baseline) * 100.0;
        }
    }

    /// Report whether any GPU, memory, or transfer error was recorded.
    #[must_use]
    pub fn has_errors(&self) -> bool {
        [&self.gpu_errors, &self.memory_errors, &self.transfer_errors]
            .iter()
            .any(|bucket| !bucket.is_empty())
    }

    /// Sum the number of recorded errors across all three categories.
    #[must_use]
    pub fn total_errors(&self) -> usize {
        [&self.gpu_errors, &self.memory_errors, &self.transfer_errors]
            .iter()
            .map(|bucket| bucket.len())
            .sum()
    }
}
140
141// ============================================================================
142// Stress Test Report
143// ============================================================================
144
145/// Stress test report
146#[derive(Debug, Clone)]
147pub struct StressTestReport {
148    /// Test configuration
149    pub config: StressTestConfig,
150    /// Collected metrics
151    pub metrics: StressMetrics,
152    /// Actual test duration
153    pub duration_actual: Duration,
154    /// Overall verdict
155    pub verdict: StressTestVerdict,
156    /// Recommendations
157    pub recommendations: Vec<String>,
158    /// Report timestamp
159    pub timestamp: Instant,
160}
161
162impl StressTestReport {
163    /// Create a new report
164    #[must_use]
165    pub fn new(config: StressTestConfig, metrics: StressMetrics, duration: Duration) -> Self {
166        let verdict = Self::calculate_verdict(&metrics);
167        let recommendations = Self::generate_recommendations(&metrics, verdict);
168
169        Self {
170            config,
171            metrics,
172            duration_actual: duration,
173            verdict,
174            recommendations,
175            timestamp: Instant::now(),
176        }
177    }
178
179    fn calculate_verdict(metrics: &StressMetrics) -> StressTestVerdict {
180        // Critical failures
181        if metrics.has_errors() {
182            return StressTestVerdict::Fail;
183        }
184
185        // Check for severe issues
186        if metrics.thermal_throttle_count > 10 {
187            return StressTestVerdict::Fail;
188        }
189        if metrics.peak_temperature_c > 95.0 {
190            return StressTestVerdict::Fail;
191        }
192        if metrics.performance_degradation_pct > 50.0 {
193            return StressTestVerdict::Fail;
194        }
195
196        // Minor issues
197        if metrics.thermal_throttle_count > 0
198            || metrics.power_throttle_count > 0
199            || metrics.memory_pressure_events > 0
200        {
201            return StressTestVerdict::PassWithNotes;
202        }
203
204        StressTestVerdict::Pass
205    }
206
207    fn generate_recommendations(
208        metrics: &StressMetrics,
209        verdict: StressTestVerdict,
210    ) -> Vec<String> {
211        let mut recs = Vec::new();
212
213        if metrics.peak_temperature_c > 85.0 {
214            recs.push("Consider improving cooling - peak temperature exceeded 85°C".to_string());
215        }
216
217        if metrics.thermal_throttle_count > 0 {
218            recs.push(format!(
219                "Thermal throttling detected {} times - reduce workload or improve cooling",
220                metrics.thermal_throttle_count
221            ));
222        }
223
224        if metrics.power_throttle_count > 0 {
225            recs.push(format!(
226                "Power throttling detected {} times - check power supply capacity",
227                metrics.power_throttle_count
228            ));
229        }
230
231        if metrics.memory_pressure_events > 0 {
232            recs.push(format!(
233                "Memory pressure detected {} times - consider reducing parallel jobs",
234                metrics.memory_pressure_events
235            ));
236        }
237
238        if metrics.performance_degradation_pct > 10.0 {
239            recs.push(format!(
240                "Performance degraded by {:.1}% under load - investigate bottlenecks",
241                metrics.performance_degradation_pct
242            ));
243        }
244
245        if verdict == StressTestVerdict::Pass && recs.is_empty() {
246            recs.push("System passed all stress tests - no issues detected".to_string());
247        }
248
249        recs
250    }
251
252    /// Export report to JSON
253    #[must_use]
254    pub fn to_json(&self) -> String {
255        // Simplified JSON format
256        format!(
257            r#"{{
258  "verdict": "{}",
259  "duration_seconds": {:.1},
260  "peak_cpu_pct": {:.1},
261  "peak_gpu_pct": {:.1},
262  "peak_memory_pct": {:.1},
263  "peak_temp_c": {:.1},
264  "peak_power_w": {:.1},
265  "thermal_throttles": {},
266  "power_throttles": {},
267  "memory_pressure_events": {},
268  "total_errors": {},
269  "performance_degradation_pct": {:.1},
270  "recommendations": {:?}
271}}"#,
272            self.verdict,
273            self.duration_actual.as_secs_f64(),
274            self.metrics.peak_cpu_utilization,
275            self.metrics.peak_gpu_utilization,
276            self.metrics.peak_memory_utilization,
277            self.metrics.peak_temperature_c,
278            self.metrics.peak_power_watts,
279            self.metrics.thermal_throttle_count,
280            self.metrics.power_throttle_count,
281            self.metrics.memory_pressure_events,
282            self.metrics.total_errors(),
283            self.metrics.performance_degradation_pct,
284            self.recommendations
285        )
286    }
287}
288
/// Overall outcome of a stress test.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StressTestVerdict {
    /// Everything passed
    Pass,
    /// Passed, but with minor issues worth noting
    PassWithNotes,
    /// The test failed
    Fail,
}

impl std::fmt::Display for StressTestVerdict {
    /// Writes the upper-case label for the verdict
    /// (`PASS`, `PASS_WITH_NOTES`, or `FAIL`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            StressTestVerdict::Pass => "PASS",
            StressTestVerdict::PassWithNotes => "PASS_WITH_NOTES",
            StressTestVerdict::Fail => "FAIL",
        };
        f.write_str(label)
    }
}
309
310// ============================================================================
311// Stress Test Runner State
312// ============================================================================
313
/// Lifecycle state of the stress test runner.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StressTestState {
    /// The test has not been started
    Idle,
    /// Intensity is being ramped up
    RampUp,
    /// Running at full intensity
    Running,
    /// Cooling down
    CoolDown,
    /// The test completed
    Completed,
    /// The test failed or was aborted
    Aborted,
}

impl std::fmt::Display for StressTestState {
    /// Writes the display label for the state (for example `Ramp-Up`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            StressTestState::Idle => "Idle",
            StressTestState::RampUp => "Ramp-Up",
            StressTestState::Running => "Running",
            StressTestState::CoolDown => "Cool-Down",
            StressTestState::Completed => "Completed",
            StressTestState::Aborted => "Aborted",
        };
        f.write_str(label)
    }
}