// scirs2_core/benchmarking/regression.rs

//! # Performance Regression Testing
//!
//! This module provides automated regression testing to detect performance
//! degradation over time and across different versions of the codebase.
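//!
//! A minimal end-to-end sketch (illustrative, not a tested example; it assumes
//! a finalized [`BenchmarkResult`] named `result` and, for persistence, the
//! `serialization` feature):
//!
//! ```ignore
//! let detector = RegressionDetector::new(RegressionConfig::default());
//! detector.store_result(&result)?; // persist for comparison in future runs
//! let analysis = detector.analyze_regression(&result)?;
//! if analysis.regression_detected {
//!     eprintln!(
//!         "{} regressed by {:.1}%",
//!         analysis.benchmark_name,
//!         (analysis.performance_ratio - 1.0) * 100.0
//!     );
//! }
//! ```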

use crate::benchmarking::{BenchmarkResult, BenchmarkRunner};
use crate::error::{CoreError, CoreResult, ErrorContext};
#[cfg(feature = "serialization")]
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Performance regression detection configuration
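///
/// Built fluently via the `with_*` methods; a sketch (the directory path here
/// is illustrative):
///
/// ```ignore
/// let config = RegressionConfig::new()
///     .with_regression_threshold(1.2) // flag runs that are >= 20% slower
///     .with_min_historical_samples(10)
///     .with_results_directory("target/benchmark_results");
/// ```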
#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct RegressionConfig {
    /// Threshold for considering a regression (e.g., 1.1 = 10% slower)
    pub regression_threshold: f64,
    /// Minimum number of historical results needed for comparison
    pub min_historical_samples: usize,
    /// Statistical confidence level for regression detection
    pub confidence_level: f64,
    /// Enable automatic baseline updates
    pub auto_update_baseline: bool,
    /// Directory to store historical results
    pub results_directory: PathBuf,
}

impl Default for RegressionConfig {
    fn default() -> Self {
        Self {
            regression_threshold: 1.1, // 10% slower
            min_historical_samples: 5,
            confidence_level: 0.95,
            auto_update_baseline: false,
            results_directory: PathBuf::from("benchmark_results"),
        }
    }
}

impl RegressionConfig {
    /// Create a new regression configuration
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the regression threshold
    pub fn with_regression_threshold(mut self, threshold: f64) -> Self {
        self.regression_threshold = threshold;
        self
    }

    /// Set the minimum historical samples
    pub fn with_min_historical_samples(mut self, samples: usize) -> Self {
        self.min_historical_samples = samples;
        self
    }

    /// Set the confidence level
    pub fn with_confidence_level(mut self, level: f64) -> Self {
        self.confidence_level = level;
        self
    }

    /// Enable automatic baseline updates
    pub fn with_auto_update_baseline(mut self, enable: bool) -> Self {
        self.auto_update_baseline = enable;
        self
    }

    /// Set the results directory
    pub fn with_results_directory<P: AsRef<Path>>(mut self, dir: P) -> Self {
        self.results_directory = dir.as_ref().to_path_buf();
        self
    }
}

/// Historical benchmark result for regression analysis
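///
/// A minimal capture sketch (assumes a finalized [`BenchmarkResult`] named
/// `result`):
///
/// ```ignore
/// let historical = HistoricalResult::from_result(&result);
/// println!("{} took {:?}", historical.benchmark_name, historical.execution_time());
/// ```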
#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct HistoricalResult {
    /// Timestamp when benchmark was run
    pub timestamp: u64,
    /// Git commit hash (if available)
    pub commit_hash: Option<String>,
    /// Version string
    pub version: Option<String>,
    /// Benchmark name
    pub benchmark_name: String,
    /// Mean execution time in nanoseconds
    pub mean_execution_time_nanos: u64,
    /// Standard deviation in nanoseconds
    pub std_dev_nanos: u64,
    /// Coefficient of variation
    pub coefficient_of_variation: f64,
    /// Mean memory usage in bytes
    pub mean_memory_usage: usize,
    /// Sample count
    pub sample_count: usize,
    /// Additional metadata
    pub metadata: HashMap<String, String>,
}

impl HistoricalResult {
    /// Create from a benchmark result
    pub fn from_result(result: &BenchmarkResult) -> Self {
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        Self {
            timestamp,
            commit_hash: Self::get_git_commit_hash(),
            version: Some(env!("CARGO_PKG_VERSION").to_string()),
            benchmark_name: result.name.clone(),
            mean_execution_time_nanos: result.statistics.mean_execution_time.as_nanos() as u64,
            std_dev_nanos: result.statistics.std_dev_execution_time.as_nanos() as u64,
            coefficient_of_variation: result.statistics.coefficient_of_variation,
            mean_memory_usage: result.statistics.mean_memory_usage,
            sample_count: result.statistics.sample_count,
            metadata: HashMap::new(),
        }
    }

    /// Get the current git commit hash, if running inside a git checkout
    fn get_git_commit_hash() -> Option<String> {
        // Best-effort: run `git rev-parse HEAD` and capture the hash,
        // returning None if git is unavailable or this is not a repository.
        std::process::Command::new("git")
            .args(["rev-parse", "HEAD"])
            .output()
            .ok()
            .filter(|output| output.status.success())
            .and_then(|output| String::from_utf8(output.stdout).ok())
            .map(|hash| hash.trim().to_string())
    }

    /// Get execution time as Duration
    pub fn execution_time(&self) -> Duration {
        Duration::from_nanos(self.mean_execution_time_nanos)
    }

    /// Get standard deviation as Duration
    pub fn std_dev(&self) -> Duration {
        Duration::from_nanos(self.std_dev_nanos)
    }
}

/// Regression detection result
#[derive(Debug, Clone)]
pub struct RegressionAnalysis {
    /// Benchmark name
    pub benchmark_name: String,
    /// Current result
    pub current_result: HistoricalResult,
    /// Baseline for comparison
    pub baseline: HistoricalResult,
    /// Historical results used for analysis
    pub historical_results: Vec<HistoricalResult>,
    /// Whether a regression was detected
    pub regression_detected: bool,
    /// Performance ratio (current / baseline; e.g., 1.15 means 15% slower)
    pub performance_ratio: f64,
    /// Statistical significance of the difference
    pub statistical_significance: f64,
    /// Trend analysis
    pub trend: PerformanceTrend,
    /// Confidence in the analysis
    pub confidence: f64,
}

/// Performance trend analysis
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PerformanceTrend {
    /// Performance is improving over time
    Improving,
    /// Performance is stable
    Stable,
    /// Performance is degrading
    Degrading,
    /// Insufficient data for trend analysis
    Unknown,
}

/// Regression detector for performance benchmarks
pub struct RegressionDetector {
    config: RegressionConfig,
}

impl RegressionDetector {
    /// Create a new regression detector
    pub fn new(config: RegressionConfig) -> Self {
        Self { config }
    }

    /// Analyze a benchmark result for regressions
    pub fn analyze_regression(&self, result: &BenchmarkResult) -> CoreResult<RegressionAnalysis> {
        let current_result = HistoricalResult::from_result(result);

        // Load historical results
        let historical_results = self.load_historical_results(&result.name)?;

        if historical_results.len() < self.config.min_historical_samples {
            return Ok(RegressionAnalysis {
                benchmark_name: result.name.clone(),
                current_result: current_result.clone(),
                baseline: current_result.clone(),
                historical_results,
                regression_detected: false,
                performance_ratio: 1.0,
                statistical_significance: 0.0,
                trend: PerformanceTrend::Unknown,
                confidence: 0.0,
            });
        }

        // Calculate baseline from historical results
        let baseline = self.calculate_baseline(&historical_results)?;

        // Detect regression
        let performance_ratio = current_result.mean_execution_time_nanos as f64
            / baseline.mean_execution_time_nanos as f64;

        let regression_detected = performance_ratio > self.config.regression_threshold;

        // Calculate statistical significance
        let statistical_significance =
            self.calculate_statistical_significance(&current_result, &historical_results)?;

        // Analyze trend
        let trend = self.analyze_trend(&historical_results)?;

        // Calculate confidence based on sample size and variance
        let confidence = self.calculate_confidence(&historical_results, &current_result)?;

        Ok(RegressionAnalysis {
            benchmark_name: result.name.clone(),
            current_result,
            baseline,
            historical_results,
            regression_detected,
            performance_ratio,
            statistical_significance,
            trend,
            confidence,
        })
    }

    /// Run regression analysis on multiple benchmarks
    pub fn analyze_multiple_regressions(
        &self,
        results: &[BenchmarkResult],
    ) -> CoreResult<Vec<RegressionAnalysis>> {
        let mut analyses = Vec::new();

        for result in results {
            let analysis = self.analyze_regression(result)?;
            analyses.push(analysis);
        }

        Ok(analyses)
    }

    /// Store a benchmark result for future regression analysis
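    ///
    /// Results are appended to `<results_directory>/<sanitized_name>.json`,
    /// where every non-alphanumeric character in the benchmark name is
    /// replaced with `_`. Note that persistence goes through `serde_json`, so
    /// the `serialization` feature must be enabled for the serde derives on
    /// [`HistoricalResult`] to exist.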
    pub fn store_result(&self, result: &BenchmarkResult) -> CoreResult<()> {
        let historical_result = HistoricalResult::from_result(result);

        // Ensure results directory exists
        fs::create_dir_all(&self.config.results_directory).map_err(|e| {
            CoreError::IoError(ErrorContext::new(format!(
                "Failed to create results directory: {e}"
            )))
        })?;

        // Load existing results
        let mut historical_results = self.load_historical_results(&result.name)?;

        // Add new result
        historical_results.push(historical_result);

        // Sort by timestamp
        historical_results.sort_by_key(|r| r.timestamp);

        // Limit history size (keep last 1000 results)
        if historical_results.len() > 1000 {
            historical_results.drain(0..historical_results.len() - 1000);
        }

        // Save results
        let file_path = self.get_results_file_path(&result.name);
        let serialized = serde_json::to_string_pretty(&historical_results).map_err(|e| {
            CoreError::IoError(ErrorContext::new(format!(
                "Failed to serialize results: {e}"
            )))
        })?;

        fs::write(&file_path, serialized).map_err(|e| {
            CoreError::IoError(ErrorContext::new(format!(
                "Failed to write results file: {e}"
            )))
        })?;

        Ok(())
    }

    /// Load historical results for a benchmark
    fn load_historical_results(&self, benchmark_name: &str) -> CoreResult<Vec<HistoricalResult>> {
        let file_path = self.get_results_file_path(benchmark_name);

        if !file_path.exists() {
            return Ok(Vec::new());
        }

        let content = fs::read_to_string(&file_path).map_err(|e| {
            CoreError::IoError(ErrorContext::new(format!(
                "Failed to read results file: {e}"
            )))
        })?;

        let results: Vec<HistoricalResult> = serde_json::from_str(&content).map_err(|e| {
            CoreError::IoError(ErrorContext::new(format!(
                "Failed to parse results file: {e}"
            )))
        })?;

        Ok(results)
    }

    /// Calculate baseline performance from historical results
    fn calculate_baseline(
        &self,
        historical_results: &[HistoricalResult],
    ) -> CoreResult<HistoricalResult> {
        if historical_results.is_empty() {
            return Err(CoreError::ValidationError(ErrorContext::new(
                "No historical results for baseline calculation",
            )));
        }

        // Use the median of recent results as the baseline
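        // (e.g., sorted recent times [90, 100, 240] give a median of 100, so a
        // single outlier cannot skew the baseline the way a mean would)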
        let recent_count = (historical_results.len() / 3)
            .max(self.config.min_historical_samples)
            .min(historical_results.len()); // never exceed the available history
        let recent_results = &historical_results[historical_results.len() - recent_count..];

        let mut execution_times: Vec<u64> = recent_results
            .iter()
            .map(|r| r.mean_execution_time_nanos)
            .collect();
        execution_times.sort_unstable();

        let median_time = if execution_times.len() % 2 == 0 {
            let mid = execution_times.len() / 2;
            (execution_times[mid - 1] + execution_times[mid]) / 2
        } else {
            execution_times[execution_times.len() / 2]
        };

        // Create a synthetic baseline result carrying the median time
        let mut baseline = recent_results[recent_results.len() / 2].clone();
        baseline.mean_execution_time_nanos = median_time;

        Ok(baseline)
    }

    /// Calculate statistical significance of performance difference
    fn calculate_statistical_significance(
        &self,
        current: &HistoricalResult,
        historical: &[HistoricalResult],
    ) -> CoreResult<f64> {
        if historical.len() < 2 {
            return Ok(0.0);
        }

        // Calculate mean and standard deviation of historical results
        let historical_times: Vec<f64> = historical
            .iter()
            .map(|r| r.mean_execution_time_nanos as f64)
            .collect();

        let historical_mean = historical_times.iter().sum::<f64>() / historical_times.len() as f64;
        let historical_variance = historical_times
            .iter()
            .map(|&x| (x - historical_mean).powi(2))
            .sum::<f64>()
            / (historical_times.len() - 1) as f64;
        let historical_std = historical_variance.sqrt();

        // Calculate z-score
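        // z = (x_current - mu_hist) / (sigma_hist / sqrt(n)): the current mean
        // is tested against the sampling distribution of the historical mean.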
        let current_time = current.mean_execution_time_nanos as f64;

        // Guard against zero variance (all historical times identical)
        if historical_std == 0.0 {
            return Ok(if current_time > historical_mean { 1.0 } else { 0.0 });
        }

        let z_score =
            (current_time - historical_mean) / (historical_std / (historical.len() as f64).sqrt());

        // One-sided p-value P(Z > z); since erf is odd, the positive and
        // negative cases reduce to the same expression.
        let p_value = 0.5 * (1.0 - erf(z_score / std::f64::consts::SQRT_2));

        Ok(1.0 - p_value) // Return significance level
    }

    /// Analyze performance trend over time
    fn analyze_trend(
        &self,
        historical_results: &[HistoricalResult],
    ) -> CoreResult<PerformanceTrend> {
        if historical_results.len() < 5 {
            return Ok(PerformanceTrend::Unknown);
        }

        // Calculate linear regression slope
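        // Ordinary least squares over (run index, mean time) pairs:
        //   slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x_sq - sum_x^2)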
        let n = historical_results.len() as f64;
        let sum_x: f64 = (0..historical_results.len()).map(|i| i as f64).sum();
        let sum_y: f64 = historical_results
            .iter()
            .map(|r| r.mean_execution_time_nanos as f64)
            .sum();
        let sum_xy: f64 = historical_results
            .iter()
            .enumerate()
            .map(|(i, r)| i as f64 * r.mean_execution_time_nanos as f64)
            .sum();
        let sum_x_sq: f64 = (0..historical_results.len())
            .map(|i| (i as f64).powi(2))
            .sum();

        let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x_sq - sum_x.powi(2));

        // Classify the trend by the slope normalized by the mean execution
        // time, i.e. relative drift per run; more than 1% per run counts
        let relative_slope = slope / (sum_y / n);

        if relative_slope > 0.01 {
            Ok(PerformanceTrend::Degrading)
        } else if relative_slope < -0.01 {
            Ok(PerformanceTrend::Improving)
        } else {
            Ok(PerformanceTrend::Stable)
        }
    }

    /// Calculate confidence in the regression analysis
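    ///
    /// Heuristic: confidence grows linearly with history size up to 10
    /// samples, then is discounted when the current run is noisy (coefficient
    /// of variation above 0.1). E.g., 10+ samples with CV < 0.1 yields 1.0.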
    fn calculate_confidence(
        &self,
        historical_results: &[HistoricalResult],
        current: &HistoricalResult,
    ) -> CoreResult<f64> {
        let sample_size_factor = (historical_results.len() as f64 / 10.0).min(1.0);
        let variance_factor = if current.coefficient_of_variation < 0.1 {
            1.0
        } else {
            (0.1 / current.coefficient_of_variation).min(1.0)
        };

        Ok(sample_size_factor * variance_factor)
    }

    /// Get the file path for storing results
    fn get_results_file_path(&self, benchmark_name: &str) -> PathBuf {
        let safe_name = benchmark_name.replace(|c: char| !c.is_alphanumeric(), "_");
        self.config
            .results_directory
            .join(format!("{safe_name}.json"))
    }
}

/// Regression testing utilities
pub struct RegressionTestUtils;

impl RegressionTestUtils {
    /// Run a complete regression test suite
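    ///
    /// A usage sketch (benchmark names here are illustrative):
    ///
    /// ```ignore
    /// let analyses = RegressionTestUtils::run_regression_tests(&["vector_add", "matmul"])?;
    /// println!("{}", RegressionTestUtils::generate_report(&analyses));
    /// ```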
    pub fn run_regression_tests(benchmark_names: &[&str]) -> CoreResult<Vec<RegressionAnalysis>> {
        let mut analyses = Vec::new();
        let benchmark_runner =
            BenchmarkRunner::new(crate::benchmarking::BenchmarkConfig::default());
        let detector = RegressionDetector::new(RegressionConfig::default());

        for &name in benchmark_names {
            // Run the benchmark (simplified: in practice you would pass the
            // actual benchmark functions instead of this placeholder body)
            let result = benchmark_runner.run(name, || {
                // Placeholder benchmark - replace with the real workload
                std::thread::sleep(Duration::from_micros(100));
                Ok(())
            })?;

            // Store result for future analysis
            detector.store_result(&result)?;

            // Analyze for regressions
            let analysis = detector.analyze_regression(&result)?;
            analyses.push(analysis);
        }

        Ok(analyses)
    }

    /// Generate a regression report
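    ///
    /// The output is Markdown, shaped roughly like:
    ///
    /// ```text
    /// # Performance Regression Report
    ///
    /// ⚠️ 1 performance regression(s) detected:
    ///
    /// - **matmul**: 15.0% slower (ratio: 1.150, confidence: 80%)
    ///
    /// ## Summary
    /// ...
    /// ```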
    pub fn generate_report(analyses: &[RegressionAnalysis]) -> String {
        let mut report = String::new();

        report.push_str("# Performance Regression Report\n\n");

        let regressions: Vec<_> = analyses.iter().filter(|a| a.regression_detected).collect();

        if regressions.is_empty() {
            report.push_str("✅ No performance regressions detected.\n\n");
        } else {
            report.push_str(&format!(
                "⚠️ {} performance regression(s) detected:\n\n",
                regressions.len()
            ));

            for regression in &regressions {
                report.push_str(&format!(
                    "- **{}**: {:.1}% slower (ratio: {:.3}, confidence: {:.0}%)\n",
                    regression.benchmark_name,
                    (regression.performance_ratio - 1.0) * 100.0,
                    regression.performance_ratio,
                    regression.confidence * 100.0
                ));
            }
            report.push('\n');
        }

        // Summary statistics
        report.push_str("## Summary\n\n");
        report.push_str(&format!("- Total benchmarks: {}\n", analyses.len()));
        report.push_str(&format!("- Regressions detected: {}\n", regressions.len()));

        let improving = analyses
            .iter()
            .filter(|a| a.trend == PerformanceTrend::Improving)
            .count();
        let stable = analyses
            .iter()
            .filter(|a| a.trend == PerformanceTrend::Stable)
            .count();
        let degrading = analyses
            .iter()
            .filter(|a| a.trend == PerformanceTrend::Degrading)
            .count();

        report.push_str(&format!("- Improving trends: {improving}\n"));
        report.push_str(&format!("- Stable trends: {stable}\n"));
        report.push_str(&format!("- Degrading trends: {degrading}\n"));

        report
    }
}

// Simplified error function approximation
fn erf(x: f64) -> f64 {
    // Abramowitz and Stegun approximation
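    // (formula 7.1.26; maximum absolute error about 1.5e-7)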
    let a1 = 0.254829592;
    let a2 = -0.284496736;
    let a3 = 1.421413741;
    let a4 = -1.453152027;
    let a5 = 1.061405429;
    let p = 0.3275911;

    let sign = if x < 0.0 { -1.0 } else { 1.0 };
    let x = x.abs();

    let t = 1.0 / (1.0 + p * x);
    let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp();

    sign * y
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[test]
    fn test_regression_config() {
        let config = RegressionConfig::new()
            .with_regression_threshold(1.2)
            .with_min_historical_samples(10)
            .with_confidence_level(0.99)
            .with_auto_update_baseline(true);

        assert_eq!(config.regression_threshold, 1.2);
        assert_eq!(config.min_historical_samples, 10);
        assert_eq!(config.confidence_level, 0.99);
        assert!(config.auto_update_baseline);
    }

    #[test]
    fn test_historical_result() {
        let benchmark_config = crate::benchmarking::BenchmarkConfig::default();
        let mut result = BenchmarkResult::new("test_benchmark".to_string(), benchmark_config);
        result.add_measurement(crate::benchmarking::BenchmarkMeasurement::new(
            Duration::from_millis(100),
        ));
        result.finalize().expect("failed to finalize benchmark result");

        let historical = HistoricalResult::from_result(&result);

        assert_eq!(historical.benchmark_name, "test_benchmark");
        assert!(historical.mean_execution_time_nanos > 0);
        assert_eq!(historical.sample_count, 1);
    }

    #[test]
    fn test_regression_detector() {
        let temp_dir = TempDir::new().expect("failed to create temp dir");
        let config = RegressionConfig::new()
            .with_results_directory(temp_dir.path())
            .with_min_historical_samples(1);

        let detector = RegressionDetector::new(config);

        // Create a test benchmark result
        let benchmark_config = crate::benchmarking::BenchmarkConfig::default();
        let mut result = BenchmarkResult::new("test_regression".to_string(), benchmark_config);
        result.add_measurement(crate::benchmarking::BenchmarkMeasurement::new(
            Duration::from_millis(100),
        ));
        result.finalize().expect("failed to finalize benchmark result");

        // Store and analyze
        detector.store_result(&result).expect("failed to store result");
        let analysis = detector
            .analyze_regression(&result)
            .expect("regression analysis failed");

        assert_eq!(analysis.benchmark_name, "test_regression");
        assert!(!analysis.regression_detected); // First result can't be a regression
    }

    #[test]
    fn test_performance_trend() {
        assert_eq!(PerformanceTrend::Improving, PerformanceTrend::Improving);
        assert_ne!(PerformanceTrend::Improving, PerformanceTrend::Degrading);
    }

    #[test]
    fn test_erf_function() {
        // Test a few known values
        assert!((erf(0.0) - 0.0).abs() < 1e-6);
        assert!((erf(1.0) - 0.8427007929).abs() < 1e-6);
        assert!((erf(-1.0) + 0.8427007929).abs() < 1e-6);
    }
    }
}