1use super::benchmarking::{benchmark, BenchmarkConfig, BenchmarkResults};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::time::{SystemTime, UNIX_EPOCH};
10use torsh_core::{Result as TorshResult, TorshError};
11use torsh_tensor::Tensor;
12
/// A recorded performance baseline for a single named operation.
///
/// Baselines are serialized to JSON on disk and later compared against fresh
/// benchmark runs to detect performance regressions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBaseline {
    /// Name of the benchmarked operation this baseline belongs to.
    pub operation: String,
    /// Unix timestamp (seconds since `UNIX_EPOCH`) when the baseline was captured.
    pub timestamp: u64,
    /// VCS commit hash at recording time, if supplied by the caller.
    pub commit_hash: Option<String>,
    /// Software version at recording time, if supplied by the caller.
    pub version: Option<String>,
    /// Aggregated benchmark statistics for the operation.
    pub baseline_summary: BaselineSummary,
    /// Description of the machine the baseline was recorded on.
    pub system_info: SystemInfo,
}
29
/// Aggregated statistics summarizing one benchmark run.
///
/// Field values are copied from the benchmark's own summary; units follow
/// whatever `BenchmarkResults` uses (presumably seconds for durations —
/// TODO confirm against the benchmarking module).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineSummary {
    /// Mean iteration duration.
    pub mean_duration: f64,
    /// Standard deviation of iteration durations.
    pub std_duration: f64,
    /// Fastest observed iteration duration.
    pub min_duration: f64,
    /// Slowest observed iteration duration.
    pub max_duration: f64,
    /// Mean throughput across iterations.
    pub mean_throughput: f64,
    /// Mean FLOPs per iteration (total FLOPs / sample count), when reported.
    pub mean_flops: Option<f64>,
    /// Mean memory bandwidth across iterations, when reported.
    pub mean_memory_bandwidth: Option<f64>,
    /// Number of benchmark samples the statistics were computed from.
    pub sample_count: usize,
}
41
/// Static description of the host system a baseline was captured on.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// Operating system name (taken from `std::env::consts::OS`).
    pub os: String,
    /// CPU architecture (taken from `std::env::consts::ARCH`).
    pub arch: String,
    /// Logical CPU count of the host.
    pub cpu_count: usize,
    /// Total system memory; this module never populates it (always `None`),
    /// since the standard library has no portable way to query it.
    pub total_memory: Option<usize>,
}
49
/// Outcome of comparing a fresh benchmark run against a stored baseline.
#[derive(Debug, Clone)]
pub struct RegressionTestResult {
    /// Name of the operation that was tested.
    pub operation: String,
    /// Summary statistics of the current benchmark run.
    pub current_performance: BaselineSummary,
    /// Summary statistics of the stored baseline being compared against.
    pub baseline_performance: BaselineSummary,
    /// True when the change is both statistically significant and exceeds the
    /// configured regression threshold.
    pub regression_detected: bool,
    /// Percentage increase in mean duration vs. the baseline (positive = slower).
    pub duration_regression_percent: f64,
    /// Percentage decrease in mean throughput vs. the baseline (positive = worse).
    pub throughput_regression_percent: f64,
    /// Significance level copied from the test configuration.
    pub significance_level: f64,
    /// Human-readable one-line description of the comparison.
    pub details: String,
}
61
/// Configuration for performance regression testing.
#[derive(Debug, Clone)]
pub struct RegressionTestConfig {
    /// Percentage change (in duration or throughput) above which a
    /// statistically significant difference counts as a regression.
    pub regression_threshold: f64,
    /// Significance level reported with test results.
    pub significance_level: f64,
    /// Minimum number of benchmark samples required before a regression test
    /// is allowed to run.
    pub min_samples: usize,
    /// Path of the JSON file baselines are loaded from and saved to.
    pub baseline_path: String,
    /// Whether baselines should be updated automatically; not consulted by
    /// this module's current logic — TODO confirm intended use.
    pub auto_update_baseline: bool,
}

impl Default for RegressionTestConfig {
    /// Defaults: 5% regression threshold, 0.05 significance level, at least
    /// 10 samples, and baselines stored under the system temp directory.
    fn default() -> Self {
        let default_baseline_file = std::env::temp_dir().join("torsh_performance_baselines.json");
        Self {
            regression_threshold: 5.0,
            significance_level: 0.05,
            min_samples: 10,
            baseline_path: default_baseline_file.display().to_string(),
            auto_update_baseline: false,
        }
    }
}
90
/// Detects performance regressions by comparing fresh benchmark results
/// against previously recorded, on-disk baselines.
pub struct PerformanceRegressionTester {
    /// Thresholds, significance level, and baseline-file location.
    config: RegressionTestConfig,
    /// In-memory baseline store, keyed by operation name.
    baselines: HashMap<String, PerformanceBaseline>,
}
95
96impl PerformanceRegressionTester {
97 pub fn new(config: RegressionTestConfig) -> Self {
99 Self {
100 config,
101 baselines: HashMap::new(),
102 }
103 }
104
105 pub fn load_baselines(&mut self) -> TorshResult<()> {
107 match std::fs::read_to_string(&self.config.baseline_path) {
108 Ok(content) => {
109 self.baselines = serde_json::from_str(&content)
110 .map_err(|e| TorshError::Other(format!("Failed to parse baselines: {}", e)))?;
111 Ok(())
112 }
113 Err(_) => {
114 self.baselines = HashMap::new();
116 Ok(())
117 }
118 }
119 }
120
121 pub fn save_baselines(&self) -> TorshResult<()> {
123 let content = serde_json::to_string_pretty(&self.baselines)
124 .map_err(|e| TorshError::Other(format!("Failed to serialize baselines: {}", e)))?;
125
126 std::fs::write(&self.config.baseline_path, content)
127 .map_err(|e| TorshError::Other(format!("Failed to write baselines file: {}", e)))?;
128
129 Ok(())
130 }
131
132 pub fn create_baseline(
134 &mut self,
135 operation: &str,
136 benchmark_results: &BenchmarkResults,
137 commit_hash: Option<String>,
138 version: Option<String>,
139 ) -> TorshResult<()> {
140 let timestamp = SystemTime::now()
141 .duration_since(UNIX_EPOCH)
142 .expect("system time should be after UNIX_EPOCH")
143 .as_secs();
144
145 let system_info = SystemInfo {
146 os: std::env::consts::OS.to_string(),
147 arch: std::env::consts::ARCH.to_string(),
148 cpu_count: 1, total_memory: None, };
151
152 let mean_memory_bandwidth = if !benchmark_results.metrics.is_empty() {
153 Some(
154 benchmark_results
155 .metrics
156 .iter()
157 .filter_map(|m| m.memory_bandwidth)
158 .sum::<f64>()
159 / benchmark_results.metrics.len() as f64,
160 )
161 } else {
162 None
163 };
164
165 let baseline_summary = BaselineSummary {
166 mean_duration: benchmark_results.summary.mean_duration,
167 std_duration: benchmark_results.summary.std_duration,
168 min_duration: benchmark_results.summary.min_duration,
169 max_duration: benchmark_results.summary.max_duration,
170 mean_throughput: benchmark_results.summary.mean_throughput,
171 mean_flops: benchmark_results
172 .summary
173 .total_flops
174 .map(|f| f as f64 / benchmark_results.summary.count as f64),
175 mean_memory_bandwidth,
176 sample_count: benchmark_results.summary.count,
177 };
178
179 let baseline = PerformanceBaseline {
180 operation: operation.to_string(),
181 timestamp,
182 commit_hash,
183 version,
184 baseline_summary,
185 system_info,
186 };
187
188 self.baselines.insert(operation.to_string(), baseline);
189 self.save_baselines()?;
190
191 Ok(())
192 }
193
194 pub fn test_regression(
196 &self,
197 operation: &str,
198 current_results: &BenchmarkResults,
199 ) -> TorshResult<RegressionTestResult> {
200 let baseline = self.baselines.get(operation).ok_or_else(|| {
201 TorshError::invalid_argument_with_context(
202 &format!("No baseline found for operation: {}", operation),
203 "test_regression",
204 )
205 })?;
206
207 if current_results.summary.count < self.config.min_samples {
208 return Err(TorshError::invalid_argument_with_context(
209 &format!(
210 "Insufficient samples: {} < {}",
211 current_results.summary.count, self.config.min_samples
212 ),
213 "test_regression",
214 ));
215 }
216
217 let current_memory_bandwidth = if !current_results.metrics.is_empty() {
218 Some(
219 current_results
220 .metrics
221 .iter()
222 .filter_map(|m| m.memory_bandwidth)
223 .sum::<f64>()
224 / current_results.metrics.len() as f64,
225 )
226 } else {
227 None
228 };
229
230 let current_summary = BaselineSummary {
231 mean_duration: current_results.summary.mean_duration,
232 std_duration: current_results.summary.std_duration,
233 min_duration: current_results.summary.min_duration,
234 max_duration: current_results.summary.max_duration,
235 mean_throughput: current_results.summary.mean_throughput,
236 mean_flops: current_results
237 .summary
238 .total_flops
239 .map(|f| f as f64 / current_results.summary.count as f64),
240 mean_memory_bandwidth: current_memory_bandwidth,
241 sample_count: current_results.summary.count,
242 };
243
244 let duration_regression_percent = ((current_summary.mean_duration
246 - baseline.baseline_summary.mean_duration)
247 / baseline.baseline_summary.mean_duration)
248 * 100.0;
249
250 let throughput_regression_percent = ((baseline.baseline_summary.mean_throughput
251 - current_summary.mean_throughput)
252 / baseline.baseline_summary.mean_throughput)
253 * 100.0;
254
255 let is_significant =
257 self.is_statistically_significant(&baseline.baseline_summary, ¤t_summary);
258
259 let regression_detected = is_significant
260 && (duration_regression_percent > self.config.regression_threshold
261 || throughput_regression_percent > self.config.regression_threshold);
262
263 let details = format!(
264 "Duration change: {:.2}%, Throughput change: {:.2}%, Significant: {}",
265 duration_regression_percent,
266 -throughput_regression_percent, is_significant
268 );
269
270 Ok(RegressionTestResult {
271 operation: operation.to_string(),
272 current_performance: current_summary,
273 baseline_performance: baseline.baseline_summary.clone(),
274 regression_detected,
275 duration_regression_percent,
276 throughput_regression_percent,
277 significance_level: self.config.significance_level,
278 details,
279 })
280 }
281
282 fn is_statistically_significant(
284 &self,
285 baseline: &BaselineSummary,
286 current: &BaselineSummary,
287 ) -> bool {
288 let pooled_std = ((baseline.std_duration.powi(2) / baseline.sample_count as f64)
290 + (current.std_duration.powi(2) / current.sample_count as f64))
291 .sqrt();
292
293 if pooled_std == 0.0 {
294 return false;
295 }
296
297 let t_statistic = (current.mean_duration - baseline.mean_duration).abs() / pooled_std;
298
299 let critical_value = 1.96;
301
302 t_statistic > critical_value
303 }
304
305 pub fn generate_report(&self, results: &[RegressionTestResult]) -> String {
307 let mut report = String::from("Performance Regression Test Report\n");
308 report.push_str("=====================================\n\n");
309
310 let total_tests = results.len();
311 let regressions = results.iter().filter(|r| r.regression_detected).count();
312 let passed = total_tests - regressions;
313
314 report.push_str(&format!(
315 "Summary: {} tests, {} passed, {} regressions detected\n\n",
316 total_tests, passed, regressions
317 ));
318
319 if regressions > 0 {
320 report.push_str("REGRESSIONS DETECTED:\n");
321 report.push_str("====================\n");
322
323 for result in results.iter().filter(|r| r.regression_detected) {
324 report.push_str(&format!("❌ {}\n", result.operation));
325 report.push_str(&format!(
326 " Duration regression: {:.2}%\n",
327 result.duration_regression_percent
328 ));
329 report.push_str(&format!(
330 " Throughput regression: {:.2}%\n",
331 result.throughput_regression_percent
332 ));
333 report.push_str(&format!(" Details: {}\n\n", result.details));
334 }
335 }
336
337 report.push_str("All Test Results:\n");
338 report.push_str("================\n");
339
340 for result in results {
341 let status = if result.regression_detected {
342 "❌ REGRESSION"
343 } else {
344 "✅ PASS"
345 };
346 report.push_str(&format!(
347 "{} {}: {}\n",
348 status, result.operation, result.details
349 ));
350 }
351
352 report
353 }
354
355 pub fn list_baselines(&self) -> Vec<&PerformanceBaseline> {
357 self.baselines.values().collect()
358 }
359
360 pub fn remove_baseline(&mut self, operation: &str) -> bool {
362 self.baselines.remove(operation).is_some()
363 }
364
365 pub fn get_baseline(&self, operation: &str) -> Option<&PerformanceBaseline> {
367 self.baselines.get(operation)
368 }
369}
370
371pub fn run_performance_regression_test<F>(
373 operation_name: &str,
374 operation: F,
375 inputs: &[&Tensor],
376 config: Option<RegressionTestConfig>,
377) -> TorshResult<RegressionTestResult>
378where
379 F: Fn(&[&Tensor]) -> TorshResult<Vec<Tensor>>,
380{
381 let config = config.unwrap_or_default();
382 let mut tester = PerformanceRegressionTester::new(config);
383 tester.load_baselines()?;
384
385 let benchmark_config = BenchmarkConfig::default();
386 let benchmark_results = benchmark(operation_name, operation, inputs, benchmark_config)?;
387
388 match tester.test_regression(operation_name, &benchmark_results) {
389 Ok(result) => Ok(result),
390 Err(_) => {
391 tester.create_baseline(operation_name, &benchmark_results, None, None)?;
393 Err(TorshError::invalid_argument_with_context(
394 "Created new baseline for operation",
395 "run_performance_regression_test",
396 ))
397 }
398 }
399}
400
#[cfg(test)]
mod tests {
    use super::*;
    use torsh_tensor::creation::randn;

    /// A freshly constructed tester starts with an empty baseline store.
    #[test]
    fn test_regression_tester_creation() {
        let tester = PerformanceRegressionTester::new(RegressionTestConfig::default());
        assert_eq!(tester.baselines.len(), 0);
    }

    /// Creating a baseline from a quick benchmark makes it retrievable.
    #[test]
    fn test_baseline_creation() -> TorshResult<()> {
        let input = randn(&[32, 32])?;
        let inputs = vec![&input];

        // Keep the benchmark tiny so the test stays fast.
        let bench_config = BenchmarkConfig {
            warmup_iters: 1,
            bench_iters: 2,
            min_duration: 0.1,
            max_duration: 1.0,
            detailed_metrics: false,
        };

        // Identity operation: just clone the first input tensor.
        let identity =
            |tensors: &[&Tensor]| -> TorshResult<Vec<Tensor>> { Ok(vec![tensors[0].clone()]) };
        let results = benchmark("test_baseline_op", identity, &inputs, bench_config)?;

        // Write baselines to a dedicated temp file so the test is isolated.
        let regression_config = RegressionTestConfig {
            baseline_path: std::env::temp_dir()
                .join("test_baselines.json")
                .display()
                .to_string(),
            ..Default::default()
        };

        let mut tester = PerformanceRegressionTester::new(regression_config);
        tester.create_baseline("test_baseline_op", &results, None, None)?;
        assert!(tester.get_baseline("test_baseline_op").is_some());

        Ok(())
    }
}