// torsh_functional/profiling/regression.rs
//! Performance regression testing framework
//!
//! This module provides comprehensive performance regression testing with
//! baseline storage, statistical significance testing, and automated reporting.

use super::benchmarking::{benchmark, BenchmarkConfig, BenchmarkResults};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{SystemTime, UNIX_EPOCH};
use torsh_core::{Result as TorshResult, TorshError};
use torsh_tensor::Tensor;

/// Stored performance baseline for a single operation.
///
/// Serialized to JSON (keyed by operation name in the tester's baseline map)
/// so later benchmark runs can be compared against it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBaseline {
    /// Operation name
    pub operation: String,
    /// Unix timestamp (seconds since epoch) when the baseline was created
    pub timestamp: u64,
    /// Git commit hash (if available)
    pub commit_hash: Option<String>,
    /// Version information
    pub version: Option<String>,
    /// Baseline performance metrics
    pub baseline_summary: BaselineSummary,
    /// Host environment captured when the baseline was recorded
    pub system_info: SystemInfo,
}

/// Aggregated performance statistics for a benchmarked operation.
///
/// Durations use whatever unit the benchmarking layer reports in
/// `BenchmarkResults::summary` (not re-scaled here — TODO confirm unit).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaselineSummary {
    /// Mean execution duration across samples
    pub mean_duration: f64,
    /// Standard deviation of execution duration
    pub std_duration: f64,
    /// Fastest observed duration
    pub min_duration: f64,
    /// Slowest observed duration
    pub max_duration: f64,
    /// Mean throughput across samples
    pub mean_throughput: f64,
    /// Mean floating-point operations per sample, when measured
    pub mean_flops: Option<f64>,
    /// Mean memory bandwidth, when measured
    pub mean_memory_bandwidth: Option<f64>,
    /// Number of benchmark samples the statistics were computed from
    pub sample_count: usize,
}

/// Host environment captured alongside a baseline, so results recorded on
/// different machines are not compared blindly.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// Operating system name (populated from `std::env::consts::OS`)
    pub os: String,
    /// CPU architecture (populated from `std::env::consts::ARCH`)
    pub arch: String,
    /// Number of logical CPUs
    pub cpu_count: usize,
    /// Total system memory; currently always `None` (see `create_baseline`)
    pub total_memory: Option<usize>,
}

/// Outcome of comparing current benchmark results against a stored baseline.
#[derive(Debug, Clone)]
pub struct RegressionTestResult {
    /// Name of the benchmarked operation
    pub operation: String,
    /// Summary statistics for the current run
    pub current_performance: BaselineSummary,
    /// Summary statistics from the stored baseline
    pub baseline_performance: BaselineSummary,
    /// True when a statistically significant degradation beyond the
    /// configured threshold was observed
    pub regression_detected: bool,
    /// Percentage increase in mean duration vs. baseline (positive = slower)
    pub duration_regression_percent: f64,
    /// Percentage decrease in mean throughput vs. baseline (positive = worse)
    pub throughput_regression_percent: f64,
    /// Significance level used for the statistical test
    pub significance_level: f64,
    /// Human-readable description of the comparison
    pub details: String,
}

/// Configuration for the regression tester (thresholds, storage location).
#[derive(Debug, Clone)]
pub struct RegressionTestConfig {
    /// Acceptable performance degradation threshold (as percentage)
    pub regression_threshold: f64,
    /// Statistical significance level for detecting regressions
    /// (e.g. 0.05 corresponds to 95% confidence)
    pub significance_level: f64,
    /// Minimum number of samples required for reliable testing
    pub min_samples: usize,
    /// Path to store baseline data (JSON file)
    pub baseline_path: String,
    /// Whether to update baselines automatically
    /// NOTE(review): not read anywhere in this module — confirm intended use
    pub auto_update_baseline: bool,
}

76impl Default for RegressionTestConfig {
77    fn default() -> Self {
78        Self {
79            regression_threshold: 5.0, // 5% degradation threshold
80            significance_level: 0.05,  // 95% confidence
81            min_samples: 10,
82            baseline_path: std::env::temp_dir()
83                .join("torsh_performance_baselines.json")
84                .display()
85                .to_string(),
86            auto_update_baseline: false,
87        }
88    }
89}
90
/// Runs performance regression tests against baselines persisted on disk.
pub struct PerformanceRegressionTester {
    /// Thresholds, significance level, and baseline storage path
    config: RegressionTestConfig,
    /// In-memory baselines, keyed by operation name
    baselines: HashMap<String, PerformanceBaseline>,
}

96impl PerformanceRegressionTester {
97    /// Create a new regression tester
98    pub fn new(config: RegressionTestConfig) -> Self {
99        Self {
100            config,
101            baselines: HashMap::new(),
102        }
103    }
104
105    /// Load baselines from file
106    pub fn load_baselines(&mut self) -> TorshResult<()> {
107        match std::fs::read_to_string(&self.config.baseline_path) {
108            Ok(content) => {
109                self.baselines = serde_json::from_str(&content)
110                    .map_err(|e| TorshError::Other(format!("Failed to parse baselines: {}", e)))?;
111                Ok(())
112            }
113            Err(_) => {
114                // File doesn't exist, start with empty baselines
115                self.baselines = HashMap::new();
116                Ok(())
117            }
118        }
119    }
120
121    /// Save baselines to file
122    pub fn save_baselines(&self) -> TorshResult<()> {
123        let content = serde_json::to_string_pretty(&self.baselines)
124            .map_err(|e| TorshError::Other(format!("Failed to serialize baselines: {}", e)))?;
125
126        std::fs::write(&self.config.baseline_path, content)
127            .map_err(|e| TorshError::Other(format!("Failed to write baselines file: {}", e)))?;
128
129        Ok(())
130    }
131
132    /// Create or update baseline for an operation
133    pub fn create_baseline(
134        &mut self,
135        operation: &str,
136        benchmark_results: &BenchmarkResults,
137        commit_hash: Option<String>,
138        version: Option<String>,
139    ) -> TorshResult<()> {
140        let timestamp = SystemTime::now()
141            .duration_since(UNIX_EPOCH)
142            .expect("system time should be after UNIX_EPOCH")
143            .as_secs();
144
145        let system_info = SystemInfo {
146            os: std::env::consts::OS.to_string(),
147            arch: std::env::consts::ARCH.to_string(),
148            cpu_count: 1, // Simplified - would use num_cpus crate in real implementation
149            total_memory: None, // TODO: Implement memory detection
150        };
151
152        let mean_memory_bandwidth = if !benchmark_results.metrics.is_empty() {
153            Some(
154                benchmark_results
155                    .metrics
156                    .iter()
157                    .filter_map(|m| m.memory_bandwidth)
158                    .sum::<f64>()
159                    / benchmark_results.metrics.len() as f64,
160            )
161        } else {
162            None
163        };
164
165        let baseline_summary = BaselineSummary {
166            mean_duration: benchmark_results.summary.mean_duration,
167            std_duration: benchmark_results.summary.std_duration,
168            min_duration: benchmark_results.summary.min_duration,
169            max_duration: benchmark_results.summary.max_duration,
170            mean_throughput: benchmark_results.summary.mean_throughput,
171            mean_flops: benchmark_results
172                .summary
173                .total_flops
174                .map(|f| f as f64 / benchmark_results.summary.count as f64),
175            mean_memory_bandwidth,
176            sample_count: benchmark_results.summary.count,
177        };
178
179        let baseline = PerformanceBaseline {
180            operation: operation.to_string(),
181            timestamp,
182            commit_hash,
183            version,
184            baseline_summary,
185            system_info,
186        };
187
188        self.baselines.insert(operation.to_string(), baseline);
189        self.save_baselines()?;
190
191        Ok(())
192    }
193
194    /// Test for performance regression
195    pub fn test_regression(
196        &self,
197        operation: &str,
198        current_results: &BenchmarkResults,
199    ) -> TorshResult<RegressionTestResult> {
200        let baseline = self.baselines.get(operation).ok_or_else(|| {
201            TorshError::invalid_argument_with_context(
202                &format!("No baseline found for operation: {}", operation),
203                "test_regression",
204            )
205        })?;
206
207        if current_results.summary.count < self.config.min_samples {
208            return Err(TorshError::invalid_argument_with_context(
209                &format!(
210                    "Insufficient samples: {} < {}",
211                    current_results.summary.count, self.config.min_samples
212                ),
213                "test_regression",
214            ));
215        }
216
217        let current_memory_bandwidth = if !current_results.metrics.is_empty() {
218            Some(
219                current_results
220                    .metrics
221                    .iter()
222                    .filter_map(|m| m.memory_bandwidth)
223                    .sum::<f64>()
224                    / current_results.metrics.len() as f64,
225            )
226        } else {
227            None
228        };
229
230        let current_summary = BaselineSummary {
231            mean_duration: current_results.summary.mean_duration,
232            std_duration: current_results.summary.std_duration,
233            min_duration: current_results.summary.min_duration,
234            max_duration: current_results.summary.max_duration,
235            mean_throughput: current_results.summary.mean_throughput,
236            mean_flops: current_results
237                .summary
238                .total_flops
239                .map(|f| f as f64 / current_results.summary.count as f64),
240            mean_memory_bandwidth: current_memory_bandwidth,
241            sample_count: current_results.summary.count,
242        };
243
244        // Calculate regression percentages
245        let duration_regression_percent = ((current_summary.mean_duration
246            - baseline.baseline_summary.mean_duration)
247            / baseline.baseline_summary.mean_duration)
248            * 100.0;
249
250        let throughput_regression_percent = ((baseline.baseline_summary.mean_throughput
251            - current_summary.mean_throughput)
252            / baseline.baseline_summary.mean_throughput)
253            * 100.0;
254
255        // Perform statistical significance test (simplified t-test)
256        let is_significant =
257            self.is_statistically_significant(&baseline.baseline_summary, &current_summary);
258
259        let regression_detected = is_significant
260            && (duration_regression_percent > self.config.regression_threshold
261                || throughput_regression_percent > self.config.regression_threshold);
262
263        let details = format!(
264            "Duration change: {:.2}%, Throughput change: {:.2}%, Significant: {}",
265            duration_regression_percent,
266            -throughput_regression_percent, // Negative because higher throughput is better
267            is_significant
268        );
269
270        Ok(RegressionTestResult {
271            operation: operation.to_string(),
272            current_performance: current_summary,
273            baseline_performance: baseline.baseline_summary.clone(),
274            regression_detected,
275            duration_regression_percent,
276            throughput_regression_percent,
277            significance_level: self.config.significance_level,
278            details,
279        })
280    }
281
282    /// Simplified statistical significance test
283    fn is_statistically_significant(
284        &self,
285        baseline: &BaselineSummary,
286        current: &BaselineSummary,
287    ) -> bool {
288        // Simplified two-sample t-test assumption
289        let pooled_std = ((baseline.std_duration.powi(2) / baseline.sample_count as f64)
290            + (current.std_duration.powi(2) / current.sample_count as f64))
291            .sqrt();
292
293        if pooled_std == 0.0 {
294            return false;
295        }
296
297        let t_statistic = (current.mean_duration - baseline.mean_duration).abs() / pooled_std;
298
299        // Simplified critical value for 95% confidence (approximately 1.96)
300        let critical_value = 1.96;
301
302        t_statistic > critical_value
303    }
304
305    /// Generate regression test report
306    pub fn generate_report(&self, results: &[RegressionTestResult]) -> String {
307        let mut report = String::from("Performance Regression Test Report\n");
308        report.push_str("=====================================\n\n");
309
310        let total_tests = results.len();
311        let regressions = results.iter().filter(|r| r.regression_detected).count();
312        let passed = total_tests - regressions;
313
314        report.push_str(&format!(
315            "Summary: {} tests, {} passed, {} regressions detected\n\n",
316            total_tests, passed, regressions
317        ));
318
319        if regressions > 0 {
320            report.push_str("REGRESSIONS DETECTED:\n");
321            report.push_str("====================\n");
322
323            for result in results.iter().filter(|r| r.regression_detected) {
324                report.push_str(&format!("❌ {}\n", result.operation));
325                report.push_str(&format!(
326                    "   Duration regression: {:.2}%\n",
327                    result.duration_regression_percent
328                ));
329                report.push_str(&format!(
330                    "   Throughput regression: {:.2}%\n",
331                    result.throughput_regression_percent
332                ));
333                report.push_str(&format!("   Details: {}\n\n", result.details));
334            }
335        }
336
337        report.push_str("All Test Results:\n");
338        report.push_str("================\n");
339
340        for result in results {
341            let status = if result.regression_detected {
342                "❌ REGRESSION"
343            } else {
344                "✅ PASS"
345            };
346            report.push_str(&format!(
347                "{} {}: {}\n",
348                status, result.operation, result.details
349            ));
350        }
351
352        report
353    }
354
355    /// List all available baselines
356    pub fn list_baselines(&self) -> Vec<&PerformanceBaseline> {
357        self.baselines.values().collect()
358    }
359
360    /// Remove a baseline
361    pub fn remove_baseline(&mut self, operation: &str) -> bool {
362        self.baselines.remove(operation).is_some()
363    }
364
365    /// Get baseline for an operation
366    pub fn get_baseline(&self, operation: &str) -> Option<&PerformanceBaseline> {
367        self.baselines.get(operation)
368    }
369}
370
371/// Convenience function to create and run a regression test
372pub fn run_performance_regression_test<F>(
373    operation_name: &str,
374    operation: F,
375    inputs: &[&Tensor],
376    config: Option<RegressionTestConfig>,
377) -> TorshResult<RegressionTestResult>
378where
379    F: Fn(&[&Tensor]) -> TorshResult<Vec<Tensor>>,
380{
381    let config = config.unwrap_or_default();
382    let mut tester = PerformanceRegressionTester::new(config);
383    tester.load_baselines()?;
384
385    let benchmark_config = BenchmarkConfig::default();
386    let benchmark_results = benchmark(operation_name, operation, inputs, benchmark_config)?;
387
388    match tester.test_regression(operation_name, &benchmark_results) {
389        Ok(result) => Ok(result),
390        Err(_) => {
391            // Create baseline if it doesn't exist
392            tester.create_baseline(operation_name, &benchmark_results, None, None)?;
393            Err(TorshError::invalid_argument_with_context(
394                "Created new baseline for operation",
395                "run_performance_regression_test",
396            ))
397        }
398    }
399}
400
#[cfg(test)]
mod tests {
    use super::*;
    use torsh_tensor::creation::randn;

    /// A freshly constructed tester starts with no baselines loaded.
    #[test]
    fn test_regression_tester_creation() {
        let config = RegressionTestConfig::default();
        let tester = PerformanceRegressionTester::new(config);
        assert_eq!(tester.baselines.len(), 0);
    }

    /// `create_baseline` stores a baseline retrievable via `get_baseline`.
    #[test]
    fn test_baseline_creation() -> TorshResult<()> {
        let input = randn(&[32, 32])?;
        let inputs = vec![&input];

        // Keep the benchmark tiny so the test stays fast.
        let config = BenchmarkConfig {
            warmup_iters: 1,
            bench_iters: 2,
            min_duration: 0.1,
            max_duration: 1.0,
            detailed_metrics: false,
        };

        // Identity operation: only benchmarking overhead is measured.
        let results = benchmark(
            "test_baseline_op",
            |inputs| -> TorshResult<Vec<Tensor>> { Ok(vec![inputs[0].clone()]) },
            &inputs,
            config,
        )?;

        // Use a test-specific file in the temp dir so the shared default
        // baseline file is not touched by this test.
        let regression_config = RegressionTestConfig {
            baseline_path: std::env::temp_dir()
                .join("test_baselines.json")
                .display()
                .to_string(),
            ..Default::default()
        };

        let mut tester = PerformanceRegressionTester::new(regression_config);
        tester.create_baseline("test_baseline_op", &results, None, None)?;

        assert!(tester.get_baseline("test_baseline_op").is_some());
        Ok(())
    }
}