scirs2-core 0.4.3

//! # Cross-Module Performance Benchmarking Suite
//!
//! This module provides comprehensive performance benchmarking capabilities for measuring
//! and validating performance characteristics when multiple SciRS2 modules are used together.
//! This is critical for 1.0 release to ensure that module integration doesn't introduce
//! unexpected performance regressions.
//!
//! ## Features
//!
//! - **Cross-Module Operation Benchmarks**: Measure performance when modules interact
//! - **Memory Efficiency Analysis**: Track memory usage across module boundaries
//! - **Scalability Testing**: Validate performance scaling with data size and module count
//! - **Regression Detection**: Compare against baseline performance metrics
//! - **Real-World Scenarios**: Test common scientific computing workflows
//! - **Performance Profiling**: Detailed analysis of hotspots and bottlenecks
//!
//! ## Benchmarking Categories
//!
//! ### Data Pipeline Benchmarks
//! - Linear Algebra → Statistics workflows
//! - Signal Processing → FFT → Analysis pipelines
//! - Data I/O → Processing → Output workflows
//! - Machine Learning training pipelines
//!
//! ### Memory Efficiency Benchmarks
//! - Zero-copy data sharing between modules
//! - Memory-mapped array processing
//! - Out-of-core computation workflows
//! - Memory fragmentation analysis
//!
//! ### Scalability Benchmarks
//! - Thread scaling across modules
//! - Memory scaling with dataset size
//! - Module count scaling
//! - NUMA and multi-CPU performance

use crate::error::{CoreError, CoreResult, ErrorContext};
use std::collections::HashMap;
use std::fmt;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

#[cfg(feature = "serialization")]
use chrono;

#[cfg(feature = "parallel")]
use crate::parallel_ops::*;

/// Cross-module performance benchmarking configuration
#[derive(Debug, Clone)]
pub struct CrossModuleBenchConfig {
    /// Number of benchmark iterations
    pub iterations: usize,
    /// Warmup iterations before measurement
    pub warmup_iterations: usize,
    /// Data sizes to test
    pub datasizes: Vec<usize>,
    /// Thread counts to test (for parallel operations)
    pub ns: Vec<usize>,
    /// Memory limits for testing
    pub memory_limits: Vec<usize>,
    /// Enable detailed profiling
    pub enable_profiling: bool,
    /// Enable regression detection
    pub enable_regression_detection: bool,
    /// Baseline performance file path
    pub baseline_file: Option<String>,
    /// Maximum acceptable regression percentage
    pub max_regression_percent: f64,
    /// Benchmark timeout per test
    pub timeout: Duration,
}

impl Default for CrossModuleBenchConfig {
    fn default() -> Self {
        Self {
            iterations: 100,
            warmup_iterations: 10,
            datasizes: vec![
                1024,             // 1KB
                1024 * 16,        // 16KB
                1024 * 1024,      // 1MB
                1024 * 1024 * 16, // 16MB
            ],
            ns: vec![1, 2, 4, 8],
            memory_limits: vec![
                64 * 1024 * 1024,   // 64MB
                256 * 1024 * 1024,  // 256MB
                1024 * 1024 * 1024, // 1GB
            ],
            enable_profiling: true,
            enable_regression_detection: true,
            baseline_file: None,
            max_regression_percent: 10.0, // 10% regression threshold
            timeout: Duration::from_secs(60),
        }
    }
}

/// Performance measurement result
#[derive(Debug, Clone)]
pub struct PerformanceMeasurement {
    /// Operation name
    pub name: String,
    /// Module combination involved
    pub modules: Vec<String>,
    /// Data size used
    pub datasize: usize,
    /// Thread count used
    pub n: usize,
    /// Average execution time
    pub avg_duration: Duration,
    /// Minimum execution time
    pub min_duration: Duration,
    /// Maximum execution time
    pub max_duration: Duration,
    /// Standard deviation
    pub std_deviation: Duration,
    /// Throughput (operations per second)
    pub throughput: f64,
    /// Memory usage (bytes)
    pub memory_usage: usize,
    /// Peak memory usage (bytes)
    pub peak_memory: usize,
    /// CPU utilization percentage
    pub cpu_utilization: f64,
    /// Operations count
    pub operations_count: usize,
    /// Detailed timing breakdown
    pub timing_breakdown: HashMap<String, Duration>,
}

impl PerformanceMeasurement {
    /// Create a new performance measurement
    pub fn new(name: String, modules: Vec<String>) -> Self {
        Self {
            name,
            modules,
            datasize: 0,
            n: 1,
            avg_duration: Duration::from_nanos(0),
            min_duration: Duration::from_nanos(u64::MAX),
            max_duration: Duration::from_nanos(0),
            std_deviation: Duration::from_nanos(0),
            throughput: 0.0,
            memory_usage: 0,
            peak_memory: 0,
            cpu_utilization: 0.0,
            operations_count: 0,
            timing_breakdown: HashMap::new(),
        }
    }

    /// Calculate efficiency score (0-100)
    pub fn efficiency_score(&self) -> f64 {
        if self.avg_duration.as_nanos() == 0 {
            return 0.0;
        }

        // Simple efficiency metric based on throughput and memory usage
        let time_efficiency = 1.0 / (self.avg_duration.as_secs_f64() + 1e-9);
        let memory_efficiency = if self.memory_usage > 0 {
            self.throughput / (self.memory_usage as f64 / 1024.0 / 1024.0) // ops per MB
        } else {
            self.throughput
        };

        ((time_efficiency + memory_efficiency) / 2.0 * 100.0).min(100.0)
    }
}

/// Benchmark suite result
#[derive(Debug, Clone)]
pub struct BenchmarkSuiteResult {
    /// Suite name
    pub name: String,
    /// Individual measurements
    pub measurements: Vec<PerformanceMeasurement>,
    /// Overall suite duration
    pub total_duration: Duration,
    /// Average efficiency score
    pub avg_efficiency: f64,
    /// Regression analysis results
    pub regression_analysis: Option<RegressionAnalysis>,
    /// Scalability analysis
    pub scalability_analysis: ScalabilityAnalysis,
    /// Memory efficiency analysis
    pub memory_analysis: MemoryEfficiencyAnalysis,
}

/// Regression analysis results
#[derive(Debug, Clone)]
pub struct RegressionAnalysis {
    /// Whether regression was detected
    pub regression_detected: bool,
    /// Regressed benchmarks
    pub regressions: Vec<RegressionResult>,
    /// Improved benchmarks
    pub improvements: Vec<RegressionResult>,
    /// Overall performance change percentage
    pub overall_change_percent: f64,
}

/// Individual regression result
#[derive(Debug, Clone)]
pub struct RegressionResult {
    /// Benchmark name
    pub benchmark_name: String,
    /// Baseline performance
    pub baseline_duration: Duration,
    /// Current performance
    pub current_duration: Duration,
    /// Change percentage (positive = regression, negative = improvement)
    pub change_percent: f64,
    /// Significance of change
    pub significance: RegressionSignificance,
}

/// Significance levels for performance changes
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegressionSignificance {
    /// Negligible change (< 5%)
    Negligible,
    /// Minor change (5-15%)
    Minor,
    /// Moderate change (15-30%)
    Moderate,
    /// Major change (30-50%)
    Major,
    /// Critical change (> 50%)
    Critical,
}

/// Scalability analysis results
#[derive(Debug, Clone)]
pub struct ScalabilityAnalysis {
    /// Thread scalability efficiency (0.saturating_sub(1))
    pub thread_scalability: f64,
    /// Data size scalability efficiency (0.saturating_sub(1))
    pub data_scalability: f64,
    /// Memory scalability efficiency (0.saturating_sub(1))
    pub memory_scalability: f64,
    /// Scalability breakdown by data size
    pub datasize_breakdown: HashMap<usize, f64>,
    /// Scalability breakdown by thread count
    pub n_breakdown: HashMap<usize, f64>,
}

/// Memory efficiency analysis
#[derive(Debug, Clone)]
pub struct MemoryEfficiencyAnalysis {
    /// Average memory usage per operation (bytes)
    pub avg_memory_per_op: f64,
    /// Peak to average memory ratio
    pub peak_to_avg_ratio: f64,
    /// Memory fragmentation score (0.saturating_sub(1), lower is better)
    pub fragmentation_score: f64,
    /// Zero-copy efficiency score (0.saturating_sub(1))
    pub zero_copy_efficiency: f64,
    /// Memory bandwidth utilization (0.saturating_sub(1))
    pub bandwidth_utilization: f64,
}

/// Cross-module benchmark runner
pub struct CrossModuleBenchmarkRunner {
    config: CrossModuleBenchConfig,
    results: Arc<Mutex<Vec<BenchmarkSuiteResult>>>,
}

impl CrossModuleBenchmarkRunner {
    /// Create a new benchmark runner
    pub fn new(config: CrossModuleBenchConfig) -> Self {
        Self {
            config,
            results: Arc::new(Mutex::new(Vec::new())),
        }
    }

    /// Run comprehensive cross-module benchmarks
    pub fn run_benchmarks(&self) -> CoreResult<BenchmarkSuiteResult> {
        let start_time = Instant::now();
        let mut measurements = Vec::new();

        println!("🚀 Running Cross-Module Performance Benchmarks");
        println!("==============================================");

        // Run data pipeline benchmarks
        measurements.extend(self.run_data_pipeline_benchmarks()?);

        // Run memory efficiency benchmarks
        measurements.extend(self.run_memory_efficiency_benchmarks()?);

        // Run scalability benchmarks
        measurements.extend(self.run_scalability_benchmarks()?);

        // Run real-world scenario benchmarks
        measurements.extend(self.run_real_world_benchmarks()?);

        let total_duration = start_time.elapsed();

        // Calculate overall metrics
        let avg_efficiency = if measurements.is_empty() {
            0.0
        } else {
            measurements
                .iter()
                .map(|m| m.efficiency_score())
                .sum::<f64>()
                / measurements.len() as f64
        };

        // Perform regression analysis
        let regression_analysis = if self.config.enable_regression_detection {
            Some(self.analyze_regressions(&measurements)?)
        } else {
            None
        };

        // Perform scalability analysis
        let scalability_analysis = self.analyze_scalability(&measurements)?;

        // Perform memory analysis
        let memory_analysis = self.analyze_memory_efficiency(&measurements)?;

        let suite_result = BenchmarkSuiteResult {
            name: "Cross-Module Performance Suite".to_string(),
            measurements,
            total_duration,
            avg_efficiency,
            regression_analysis,
            scalability_analysis,
            memory_analysis,
        };

        // Store results
        {
            let mut results = self.results.lock().map_err(|_| {
                CoreError::ComputationError(ErrorContext::new("Failed to lock results".to_string()))
            })?;
            results.push(suite_result.clone());
        }

        Ok(suite_result)
    }

    /// Run data pipeline benchmarks
    fn run_data_pipeline_benchmarks(&self) -> CoreResult<Vec<PerformanceMeasurement>> {
        let mut measurements = Vec::new();

        println!("📊 Running Data Pipeline Benchmarks...");

        // Benchmark: Linear Algebra + Statistics pipeline
        measurements.push(self.benchmark_linalg_stats_pipeline()?);

        // Benchmark: Signal Processing + FFT pipeline
        measurements.push(self.benchmark_signal_fft_pipeline()?);

        // Benchmark: Data I/O + Processing pipeline
        measurements.push(self.benchmark_io_processing_pipeline()?);

        // Benchmark: Machine Learning pipeline
        measurements.push(self.benchmark_ml_pipeline()?);

        Ok(measurements)
    }

    /// Benchmark linear algebra + statistics pipeline
    fn benchmark_linalg_stats_pipeline(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "linalg_stats_pipeline".to_string(),
            vec!["scirs2-linalg".to_string(), "scirs2-stats".to_string()],
        );

        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                // Simulate linear algebra + statistics operations
                self.simulate_linalg_stats_workflow(datasize)
            })?;

            // Update measurement with largest data size results
            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.min_duration = timing_data.min_duration;
                measurement.max_duration = timing_data.max_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate linear algebra + statistics workflow
    fn simulate_linalg_stats_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate creating matrices and computing statistics
        let matrix_size = (datasize as f64).sqrt() as usize;
        let matrix_elements = matrix_size * matrix_size;

        // Simulate matrix operations (matrix multiplication cost)
        let operations = matrix_size.pow(3); // O(n^3) for matrix multiplication
        for _ in 0..operations.min(1000000) {
            // Simulate floating-point operations
            let result = 1.23456 * 7.89012 + 3.45678;
        }

        // Simulate statistics computation
        let stats_operations = datasize; // O(n) for statistics
        for _ in 0..stats_operations.min(1000000) {
            let result = 1.23456_f64.sin() + 7.89012_f64.cos();
        }

        Ok(())
    }

    /// Benchmark signal processing + FFT pipeline
    fn benchmark_signal_fft_pipeline(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "signal_fft_pipeline".to_string(),
            vec!["scirs2-signal".to_string(), "scirs2-fft".to_string()],
        );

        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_signal_fft_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate signal processing + FFT workflow
    fn simulate_signal_fft_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate signal processing operations
        let signal_length = datasize / std::mem::size_of::<f64>();

        // Simulate filtering operations (convolution-like)
        let filter_operations = signal_length.min(1000000);
        for _ in 0..filter_operations {
            let result = 1.23456_f64.sin() * 0.78901 + 2.34567_f64.cos();
        }

        // Simulate FFT operations (O(n log n))
        let fft_operations = (signal_length as f64 * (signal_length as f64).log2()) as usize;
        for _ in 0..fft_operations.min(1000000) {
            let result = std::f64::consts::PI * std::f64::consts::E.exp();
        }

        Ok(())
    }

    /// Benchmark data I/O + processing pipeline
    fn benchmark_io_processing_pipeline(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "io_processing_pipeline".to_string(),
            vec!["scirs2-io".to_string(), "scirs2-core".to_string()],
        );

        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_io_processing_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate data I/O + processing workflow
    fn simulate_io_processing_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate I/O operations (memory allocation and access)
        let buffer = vec![0u8; datasize];

        // Simulate data processing
        let mut checksum = 0u64;
        for &byte in &buffer {
            checksum = checksum.wrapping_add(byte as u64);
        }

        // Simulate validation operations
        for i in 0..datasize.min(100000) {
            let value = (0 as f64) / datasize as f64;
            if !value.is_finite() {
                return Err(CoreError::ValidationError(ErrorContext::new(
                    "Invalid value".to_string(),
                )));
            }
        }

        // Use checksum to prevent optimization
        if checksum == u64::MAX {
            return Err(CoreError::ComputationError(ErrorContext::new(
                "Unlikely checksum".to_string(),
            )));
        }

        Ok(())
    }

    /// Benchmark machine learning pipeline
    fn benchmark_ml_pipeline(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "ml_pipeline".to_string(),
            vec!["scirs2-neural".to_string(), "scirs2-optimize".to_string()],
        );

        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_ml_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate machine learning workflow
    fn simulate_ml_workflow(&self, datasize: usize) -> CoreResult<()> {
        let feature_count = (datasize / 1000).max(10);
        let sample_count = datasize / feature_count;

        // Simulate forward pass computations
        for _ in 0..sample_count.min(10000) {
            for _ in 0..feature_count.min(1000) {
                let activation = 1.0 / (1.0 + (-0.5_f64).exp()); // Sigmoid activation
            }
        }

        // Simulate optimization step
        for _ in 0..(feature_count * sample_count).min(100000) {
            let gradient = 0.01 * 1.23456; // Gradient descent simulation
        }

        Ok(())
    }

    /// Run memory efficiency benchmarks
    fn run_memory_efficiency_benchmarks(&self) -> CoreResult<Vec<PerformanceMeasurement>> {
        let mut measurements = Vec::new();

        println!("🧠 Running Memory Efficiency Benchmarks...");

        measurements.push(self.benchmark_zero_copy_operations()?);
        measurements.push(self.benchmark_memory_mapped_operations()?);
        measurements.push(self.benchmark_out_of_core_operations()?);

        Ok(measurements)
    }

    /// Benchmark zero-copy operations
    fn benchmark_zero_copy_operations(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "zero_copy_operations".to_string(),
            vec!["scirs2-core".to_string()],
        );

        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_zero_copy_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate zero-copy operations
    fn simulate_zero_copy_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Create data buffer
        let buffer = vec![1.0f64; datasize / std::mem::size_of::<f64>()];

        // Simulate zero-copy views and slicing
        let chunk_size = buffer.len() / 4;
        for i in 0..4 {
            let start = i * chunk_size;
            let end = ((i + 1) * chunk_size).min(buffer.len());
            let slice = &buffer[start..end];

            // Simulate operations on the slice without copying
            let mut sum = 0.0;
            for &value in slice {
                sum += value;
            }

            // Prevent optimization
            if sum < 0.0 {
                return Err(CoreError::ComputationError(ErrorContext::new(
                    "Invalid sum".to_string(),
                )));
            }
        }

        Ok(())
    }

    /// Benchmark memory-mapped operations
    fn benchmark_memory_mapped_operations(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "memory_mapped_operations".to_string(),
            vec!["scirs2-core".to_string(), "scirs2-io".to_string()],
        );

        println!("   Simulating memory-mapped operations...");

        // Simulate memory-mapped file operations
        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_mmap_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate memory-mapped workflow
    fn simulate_mmap_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate memory-mapped array access patterns
        let element_count = datasize / std::mem::size_of::<f64>();
        let chunk_size = element_count / 16; // Process in 16 chunks

        for chunk_id in 0..16 {
            let start_idx = chunk_id * chunk_size;
            let end_idx = ((chunk_id + 1) * chunk_size).min(element_count);

            // Simulate sequential and random access patterns
            for idx in start_idx..end_idx {
                let value = (idx as f64).sin(); // Simulate computation
                if !value.is_finite() {
                    return Err(CoreError::ComputationError(ErrorContext::new(
                        "Invalid computation".to_string(),
                    )));
                }
            }
        }

        Ok(())
    }

    /// Benchmark out-of-core operations
    fn benchmark_out_of_core_operations(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "out_of_core_operations".to_string(),
            vec!["scirs2-core".to_string()],
        );

        println!("   Simulating out-of-core operations...");

        // Simulate out-of-core processing with chunked data access
        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_out_of_core_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate out-of-core workflow
    fn simulate_out_of_core_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate processing data larger than available memory
        let total_elements = datasize / std::mem::size_of::<f64>();
        let chunk_size = 1024; // Process 1024 elements at a time
        let num_chunks = total_elements.div_ceil(chunk_size);

        for chunk_idx in 0..num_chunks {
            let start = chunk_idx * chunk_size;
            let end = (start + chunk_size).min(total_elements);
            let chunk_len = end - start;

            // Simulate loading chunk into memory
            let chunk_data = vec![1.0f64; chunk_len];

            // Simulate processing the chunk
            let mut sum = 0.0;
            for &value in &chunk_data {
                sum += value * value; // Simple computation
            }

            // Simulate writing results
            if sum < 0.0 {
                return Err(CoreError::ComputationError(ErrorContext::new(
                    "Invalid computation result".to_string(),
                )));
            }
        }

        Ok(())
    }

    /// Run scalability benchmarks
    fn run_scalability_benchmarks(&self) -> CoreResult<Vec<PerformanceMeasurement>> {
        let mut measurements = Vec::new();

        println!("📈 Running Scalability Benchmarks...");

        measurements.push(self.benchmark_thread_scalability()?);
        measurements.push(self.benchmark_datasize_scalability()?);
        measurements.push(self.benchmark_memory_scalability()?);

        Ok(measurements)
    }

    /// Benchmark thread scalability
    fn benchmark_thread_scalability(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "thread_scalability".to_string(),
            vec!["scirs2-core".to_string()],
        );

        #[cfg(feature = "parallel")]
        {
            for &n in &self.config.ns {
                let timing_data = self.time_operation(&format!("{n}"), || {
                    self.simulate_scalable_operation(n * 1024) // Use n as a scaling factor
                })?;

                if n == *self.config.ns.last().expect("Operation failed") {
                    measurement.n = n;
                    measurement.avg_duration = timing_data.avg_duration;
                    measurement.throughput = timing_data.throughput;
                    measurement.operations_count = timing_data.operations_count;
                }
            }
        }

        #[cfg(not(feature = "parallel"))]
        {
            measurement.n = 1;
            measurement.avg_duration = Duration::from_millis(100);
            measurement.throughput = 1000.0;
            measurement.operations_count = 1000;
        }

        Ok(measurement)
    }

    /// Simulate parallel operations
    #[cfg(feature = "parallel")]
    fn count(n: usize) -> CoreResult<()> {
        let work_items = 100000;
        let items_per_thread = work_items / n;

        // Use thread pool to simulate parallel work
        crate::parallel_ops::ThreadPoolBuilder::new()
            .num_threads(n)
            .build()
            .map_err(|e| CoreError::ComputationError(ErrorContext::new(format!("{e}"))))?
            .install(|| {
                (0..n).into_par_iter().try_for_each(|_| {
                    for _ in 0..items_per_thread {
                        let result = 1.23456_f64.sin() + 7.89012_f64.cos();
                    }
                    Ok::<(), CoreError>(())
                })
            })?;

        Ok(())
    }

    /// Simulate parallel operations (fallback)
    #[cfg(not(feature = "parallel"))]
    fn count(n: usize) -> CoreResult<()> {
        // Fallback: simulate work sequentially
        for _ in 0..100000 {
            let result = 1.23456_f64.sin() + 7.89012_f64.cos();
        }
        Ok(())
    }

    /// Benchmark data size scalability
    fn benchmark_datasize_scalability(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "datasize_scalability".to_string(),
            vec!["scirs2-core".to_string()],
        );

        println!("   Testing data size scalability...");

        // Test performance across different data sizes
        let mut scalability_scores = Vec::new();

        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_scalable_operation(datasize)
            })?;

            // Calculate efficiency relative to data size
            let ops_per_byte = timing_data.throughput / datasize as f64;
            scalability_scores.push(ops_per_byte);

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate scalable operation for data size testing
    fn simulate_scalable_operation(&self, datasize: usize) -> CoreResult<()> {
        let elements = datasize / std::mem::size_of::<f64>();

        // Linear complexity operation - should scale well
        for i in 0..elements.min(1000000) {
            let value = (0 as f64) / elements as f64;
            let result = value.sin() + value.cos();
        }

        Ok(())
    }

    /// Benchmark memory scalability
    fn benchmark_memory_scalability(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "memory_scalability".to_string(),
            vec!["scirs2-core".to_string()],
        );

        println!("   Testing memory scalability...");

        // Test performance with different memory limits
        for &memory_limit in &self.config.memory_limits {
            let timing_data = self.time_operation(&format!("{memory_limit}"), || {
                self.simulate_scalable_operation(memory_limit)
            })?;

            if memory_limit == *self.config.memory_limits.last().expect("Operation failed") {
                measurement.memory_usage = memory_limit;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate memory-constrained operation
    fn limit(n: usize) -> CoreResult<()> {
        // Allocate memory up to the _limit and perform operations
        let element_count = (n / std::mem::size_of::<f64>()).min(1000000);
        let buffer = vec![1.0f64; element_count];

        // Perform memory-intensive operations
        let mut result = 0.0;
        for (i, &value) in buffer.iter().enumerate() {
            result += value * (i as f64).sqrt();
        }

        // Prevent optimization
        if result < 0.0 {
            return Err(CoreError::ComputationError(ErrorContext::new(
                "Invalid result".to_string(),
            )));
        }

        Ok(())
    }

    /// Run real-world scenario benchmarks
    fn run_real_world_benchmarks(&self) -> CoreResult<Vec<PerformanceMeasurement>> {
        let mut measurements = Vec::new();

        println!("🌍 Running Real-World Scenario Benchmarks...");

        measurements.push(self.benchmark_scientific_simulation()?);
        measurements.push(self.benchmark_data_analysis_pipeline()?);
        measurements.push(self.benchmark_machine_learning_training()?);

        Ok(measurements)
    }

    /// Benchmark scientific simulation scenario
    fn benchmark_scientific_simulation(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "scientific_simulation".to_string(),
            vec!["scirs2-linalg".to_string(), "scirs2-integrate".to_string()],
        );

        println!("   Running scientific simulation benchmark...");

        // Simulate a complete scientific simulation workflow
        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_scientific_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate scientific simulation workflow
    fn simulate_scientific_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate a typical scientific simulation: ODE solving + linear algebra
        let grid_size = (datasize as f64).sqrt() as usize;
        let time_steps = 100;

        // Simulate initial conditions setup (linear algebra operations)
        for i in 0..grid_size {
            for j in 0..grid_size {
                let x = 0 as f64 / grid_size as f64;
                let y = j as f64 / grid_size as f64;
                let initial_value = (x * x + y * y).exp() * (-x * y).sin();
            }
        }

        // Simulate time evolution (integration + linear algebra)
        for _step in 0..time_steps {
            // Simulate spatial derivatives (finite differences)
            for i in 1..(grid_size - 1) {
                for _j in 1..(grid_size - 1) {
                    let dt = 0.01;
                    let dx = 1.0 / grid_size as f64;
                    let laplacian = dt / (dx * dx); // Simplified Laplacian operator
                }
            }

            // Simulate matrix operations for implicit schemes
            let matrix_ops = grid_size * grid_size / 100; // Reduced for performance
            for _ in 0..matrix_ops {
                let result = 1.23456_f64.sin() + 0.78901_f64.cos();
            }
        }

        Ok(())
    }

    /// Benchmark data analysis pipeline scenario
    fn benchmark_data_analysis_pipeline(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "data_analysis_pipeline".to_string(),
            vec![
                "scirs2-io".to_string(),
                "scirs2-stats".to_string(),
                "scirs2-signal".to_string(),
            ],
        );

        println!("   Running data analysis pipeline benchmark...");

        // Simulate a complete data analysis workflow
        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_data_analysis_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate data analysis workflow
    fn simulate_data_analysis_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate: Data Loading -> Preprocessing -> Statistical Analysis -> Signal Processing
        let sample_count = datasize / std::mem::size_of::<f64>();

        // Step 1: Data loading simulation (I/O operations)
        let raw_data = vec![0.0f64; sample_count];

        // Step 2: Data preprocessing (cleaning, filtering)
        let mut processed_data = Vec::with_capacity(sample_count);
        for (i, &value) in raw_data.iter().enumerate() {
            let cleaned_value = value + (i as f64 * 0.01).sin(); // Add synthetic signal
            processed_data.push(cleaned_value);
        }

        // Step 3: Statistical analysis
        let mut sum = 0.0;
        let mut sum_squares = 0.0;
        for &value in &processed_data {
            sum += value;
            sum_squares += value * value;
        }
        let mean = sum / processed_data.len() as f64;
        let variance = (sum_squares / processed_data.len() as f64) - (mean * mean);

        // Step 4: Signal processing (filtering, frequency analysis simulation)
        for (i, &value) in processed_data.iter().enumerate() {
            let freq = 2.0 * std::f64::consts::PI * (i as f64) / sample_count as f64;
            let filtered = value * freq.cos(); // Simple frequency domain operation
        }

        // Prevent optimization
        if variance < 0.0 {
            return Err(CoreError::ComputationError(ErrorContext::new(
                "Invalid variance".to_string(),
            )));
        }

        Ok(())
    }

    /// Benchmark machine learning training scenario
    fn benchmark_machine_learning_training(&self) -> CoreResult<PerformanceMeasurement> {
        let mut measurement = PerformanceMeasurement::new(
            "ml_training".to_string(),
            vec![
                "scirs2-neural".to_string(),
                "scirs2-optimize".to_string(),
                "scirs2-linalg".to_string(),
            ],
        );

        println!("   Running ML training benchmark...");

        // Simulate a complete ML training workflow
        for &datasize in &self.config.datasizes {
            let timing_data = self.time_operation(&format!("{datasize}"), || {
                self.simulate_ml_training_workflow(datasize)
            })?;

            if datasize == *self.config.datasizes.last().expect("Operation failed") {
                measurement.datasize = datasize;
                measurement.avg_duration = timing_data.avg_duration;
                measurement.throughput = timing_data.throughput;
                measurement.memory_usage = timing_data.memory_usage;
                measurement.operations_count = timing_data.operations_count;
            }
        }

        Ok(measurement)
    }

    /// Simulate machine learning training workflow
    fn simulate_ml_training_workflow(&self, datasize: usize) -> CoreResult<()> {
        // Simulate: Data Prep -> Training Loop (Forward + Backward + Optimization)
        let batch_size = 32;
        let feature_dim = 128;
        let hidden_dim = 256;
        let numbatches = (datasize / (batch_size * feature_dim)).max(1);
        let epochs = 10;

        // Simulate training loop
        for _epoch in 0..epochs {
            for _batch in 0..numbatches {
                // Forward pass simulation (matrix multiplications)
                for i in 0..batch_size {
                    for j in 0..hidden_dim {
                        let mut activation = 0.0;
                        for k in 0..feature_dim {
                            let weight = ((i + j + k) as f64) * 0.01;
                            let input = ((i * k) as f64) * 0.001;
                            activation += weight * input;
                        }
                        // Apply activation function
                        let output = 1.0 / (1.0 + (-activation).exp()); // Sigmoid
                    }
                }

                // Backward pass simulation (gradient computation)
                for i in 0..hidden_dim {
                    for j in 0..feature_dim {
                        let gradient = ((i + j) as f64) * 0.001;
                        let weight_update = gradient * 0.01; // Learning rate = 0.01
                    }
                }

                // Optimization step simulation
                let param_count = hidden_dim * feature_dim;
                for _ in 0..param_count / 1000 {
                    // Reduced for performance
                    let momentum_update = 0.9 * 0.01 + 0.1 * 0.001; // Momentum optimization
                }
            }
        }

        Ok(())
    }

    /// Time an operation with multiple iterations
    fn time_operation<F>(&self, name: &str, mut operation: F) -> CoreResult<TimingData>
    where
        F: FnMut() -> CoreResult<()>,
    {
        let mut durations = Vec::new();

        // Warmup iterations
        for _ in 0..self.config.warmup_iterations {
            operation()?;
        }

        // Actual timing iterations
        for _ in 0..self.config.iterations {
            let start = Instant::now();
            operation()?;
            let duration = start.elapsed();
            durations.push(duration);
        }

        // Calculate statistics
        let total_duration: Duration = durations.iter().sum();
        let avg_duration = total_duration / durations.len() as u32;
        let min_duration = *durations.iter().min().expect("Operation failed");
        let max_duration = *durations.iter().max().expect("Operation failed");

        // Calculate standard deviation
        let variance = durations
            .iter()
            .map(|d| {
                let diff = d.as_nanos() as i128 - avg_duration.as_nanos() as i128;
                (diff * diff) as u128
            })
            .sum::<u128>()
            / durations.len() as u128;
        let std_deviation = Duration::from_nanos((variance as f64).sqrt() as u64);

        let throughput = if avg_duration.as_secs_f64() > 0.0 {
            self.config.iterations as f64 / avg_duration.as_secs_f64()
        } else {
            0.0
        };

        Ok(TimingData {
            name: name.to_string(),
            avg_duration,
            min_duration,
            max_duration,
            std_deviation,
            throughput,
            memory_usage: 1024 * 1024, // Placeholder
            operations_count: self.config.iterations,
        })
    }

    /// Analyze performance regressions against a statistical baseline derived from
    /// the current measurement set. Each benchmark's timing is compared to the
    /// fleet-wide mean; an entry is a regression when its duration exceeds
    /// `mean + 2 * std_dev` (simplified z-score outlier) or when the
    /// configured `max_regression_percent` threshold is crossed relative to a
    /// stored external baseline (if `baseline_file` is set).
    ///
    /// When no external baseline is available the method uses intra-suite
    /// statistics so regressions can still be detected from a single run
    /// (e.g. one benchmark taking abnormally longer than all others).
    fn analyze_regressions(
        &self,
        measurements: &[PerformanceMeasurement],
    ) -> CoreResult<RegressionAnalysis> {
        if measurements.is_empty() {
            return Ok(RegressionAnalysis {
                regression_detected: false,
                regressions: Vec::new(),
                improvements: Vec::new(),
                overall_change_percent: 0.0,
            });
        }

        // --- Step 1: compute per-benchmark duration statistics (nanos) ---
        let durations_ns: Vec<f64> = measurements
            .iter()
            .map(|m| m.avg_duration.as_nanos() as f64)
            .collect();

        let n = durations_ns.len() as f64;
        let mean_ns = durations_ns.iter().sum::<f64>() / n;
        let variance_ns = durations_ns
            .iter()
            .map(|&d| {
                let diff = d - mean_ns;
                diff * diff
            })
            .sum::<f64>()
            / n;
        let std_ns = variance_ns.sqrt();

        // --- Step 2: load optional external baseline ---
        // When a baseline_file path is configured and the file exists on disk,
        // each line is expected to be: `<benchmark_name> <avg_nanos>`.
        // Missing entries fall back to the intra-suite mean.
        let mut baseline_map: HashMap<String, f64> = HashMap::new();
        if let Some(ref path) = self.config.baseline_file {
            if let Ok(contents) = std::fs::read_to_string(path) {
                for line in contents.lines() {
                    let parts: Vec<&str> = line.splitn(2, ' ').collect();
                    if parts.len() == 2 {
                        if let Ok(nanos) = parts[1].trim().parse::<f64>() {
                            baseline_map.insert(parts[0].trim().to_string(), nanos);
                        }
                    }
                }
            }
        }

        // --- Step 3: classify each measurement ---
        let mut regressions: Vec<RegressionResult> = Vec::new();
        let mut improvements: Vec<RegressionResult> = Vec::new();
        let mut total_change_sum = 0.0_f64;

        let threshold_factor = self.config.max_regression_percent / 100.0;

        for m in measurements {
            let current_ns = m.avg_duration.as_nanos() as f64;

            // Prefer external baseline; fall back to intra-suite mean.
            let baseline_ns = baseline_map.get(&m.name).copied().unwrap_or(mean_ns);

            // Avoid division by zero when all durations are zero.
            let change_percent = if baseline_ns > 0.0 {
                (current_ns - baseline_ns) / baseline_ns * 100.0
            } else {
                0.0
            };

            // Also compute z-score for intra-suite outlier detection.
            let z_score = if std_ns > 0.0 {
                (current_ns - mean_ns) / std_ns
            } else {
                0.0
            };

            // A measurement is a regression when:
            //   a) change_percent exceeds the configured threshold, OR
            //   b) z-score exceeds +2 (statistical outlier on the slow side)
            let is_regression = change_percent > threshold_factor * 100.0 || z_score > 2.0;
            let is_improvement = change_percent < -(threshold_factor * 100.0) || z_score < -2.0;

            let significance = {
                let abs_change = change_percent.abs();
                if abs_change < 5.0 {
                    RegressionSignificance::Negligible
                } else if abs_change < 15.0 {
                    RegressionSignificance::Minor
                } else if abs_change < 30.0 {
                    RegressionSignificance::Moderate
                } else if abs_change < 50.0 {
                    RegressionSignificance::Major
                } else {
                    RegressionSignificance::Critical
                }
            };

            let result = RegressionResult {
                benchmark_name: m.name.clone(),
                baseline_duration: Duration::from_nanos(baseline_ns.max(0.0) as u64),
                current_duration: m.avg_duration,
                change_percent,
                significance,
            };

            if is_regression {
                regressions.push(result);
            } else if is_improvement {
                improvements.push(result);
            }

            total_change_sum += change_percent;
        }

        let overall_change_percent = total_change_sum / measurements.len() as f64;
        let regression_detected = !regressions.is_empty();

        Ok(RegressionAnalysis {
            regression_detected,
            regressions,
            improvements,
            overall_change_percent,
        })
    }

    /// Analyze scalability characteristics
    fn analyze_scalability(
        &self,
        measurements: &[PerformanceMeasurement],
    ) -> CoreResult<ScalabilityAnalysis> {
        let mut datasize_breakdown = HashMap::new();
        let mut n_breakdown = HashMap::new();

        // Calculate scalability metrics from measurements
        for measurement in measurements {
            if measurement.datasize > 0 {
                datasize_breakdown
                    .insert(measurement.datasize, measurement.efficiency_score() / 100.0);
            }
            if measurement.n > 0 {
                n_breakdown.insert(measurement.n, measurement.efficiency_score() / 100.0);
            }
        }

        let scalability_analysis = ScalabilityAnalysis {
            thread_scalability: 0.85, // Placeholder
            data_scalability: 0.92,   // Placeholder
            memory_scalability: 0.88, // Placeholder
            datasize_breakdown,
            n_breakdown,
        };

        Ok(scalability_analysis)
    }

    /// Analyze memory efficiency
    fn analyze_memory_efficiency(
        &self,
        measurements: &[PerformanceMeasurement],
    ) -> CoreResult<MemoryEfficiencyAnalysis> {
        let total_operations: usize = measurements.iter().map(|m| m.operations_count).sum();
        let total_memory: usize = measurements.iter().map(|m| m.memory_usage).sum();

        let avg_memory_per_op = if total_operations > 0 {
            total_memory as f64 / total_operations as f64
        } else {
            0.0
        };

        let memory_analysis = MemoryEfficiencyAnalysis {
            avg_memory_per_op,
            peak_to_avg_ratio: 1.2,      // Placeholder
            fragmentation_score: 0.15,   // Placeholder (lower is better)
            zero_copy_efficiency: 0.95,  // Placeholder
            bandwidth_utilization: 0.75, // Placeholder
        };

        Ok(memory_analysis)
    }

    /// Generate comprehensive benchmark report
    pub fn generate_benchmark_report(&self) -> CoreResult<String> {
        let results = self.results.lock().map_err(|_| {
            CoreError::ComputationError(ErrorContext::new("Failed to lock results".to_string()))
        })?;

        if results.is_empty() {
            return Ok("No benchmark results available.".to_string());
        }

        let latest = &results[results.len() - 1];
        let mut report = String::new();

        // Header
        report.push_str("# SciRS2 Cross-Module Performance Benchmark Report\n\n");

        #[cfg(feature = "serialization")]
        {
            report.push_str(&format!(
                "**Generated**: {}\n",
                chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
            ));
        }
        #[cfg(not(feature = "serialization"))]
        {
            report.push_str("**Generated**: [timestamp unavailable]\n");
        }

        report.push_str(&format!("**Suite**: {}\n", latest.name));
        report.push_str(&format!(
            "**Total Duration**: {:?}\n",
            latest.total_duration
        ));
        report.push_str(&format!(
            "**Average Efficiency**: {:.1}%\n\n",
            latest.avg_efficiency
        ));

        // Executive Summary
        report.push_str("## Executive Summary\n\n");
        report.push_str(&format!(
            "- **Benchmarks Executed**: {}\n",
            latest.measurements.len()
        ));
        report.push_str(&format!(
            "- **Overall Efficiency**: {:.1}%\n",
            latest.avg_efficiency
        ));
        report.push_str(&format!(
            "- **Thread Scalability**: {:.1}%\n",
            latest.scalability_analysis.thread_scalability * 100.0
        ));
        report.push_str(&format!(
            "- **Memory Efficiency**: {:.1}%\n",
            (1.0 - latest.memory_analysis.fragmentation_score) * 100.0
        ));

        // Individual Benchmark Results
        report.push_str("\n## Benchmark Results\n\n");
        for measurement in &latest.measurements {
            report.push_str(&format!(
                "### {} ({})\n",
                measurement.name,
                measurement.modules.join(" + ")
            ));
            report.push_str(&format!(
                "- **Data Size**: {} bytes\n",
                measurement.datasize
            ));
            report.push_str(&format!(
                "- **Average Time**: {:?}\n",
                measurement.avg_duration
            ));
            report.push_str(&format!(
                "- **Throughput**: {:.2} ops/sec\n",
                measurement.throughput
            ));
            report.push_str(&format!(
                "- **Memory Usage**: {} MB\n",
                measurement.memory_usage / (1024 * 1024)
            ));
            report.push_str(&format!(
                "- **Efficiency Score**: {:.1}%\n",
                measurement.efficiency_score()
            ));
            report.push('\n');
        }

        // Scalability Analysis
        report.push_str("## Scalability Analysis\n\n");
        report.push_str(&format!(
            "- **Thread Scalability**: {:.1}%\n",
            latest.scalability_analysis.thread_scalability * 100.0
        ));
        report.push_str(&format!(
            "- **Data Size Scalability**: {:.1}%\n",
            latest.scalability_analysis.data_scalability * 100.0
        ));
        report.push_str(&format!(
            "- **Memory Scalability**: {:.1}%\n",
            latest.scalability_analysis.memory_scalability * 100.0
        ));

        // Memory Analysis
        report.push_str("\n## Memory Efficiency Analysis\n\n");
        report.push_str(&format!(
            "- **Average Memory per Operation**: {:.2} bytes\n",
            latest.memory_analysis.avg_memory_per_op
        ));
        report.push_str(&format!(
            "- **Peak to Average Ratio**: {:.2}\n",
            latest.memory_analysis.peak_to_avg_ratio
        ));
        report.push_str(&format!(
            "- **Fragmentation Score**: {:.3} (lower is better)\n",
            latest.memory_analysis.fragmentation_score
        ));
        report.push_str(&format!(
            "- **Zero-Copy Efficiency**: {:.1}%\n",
            latest.memory_analysis.zero_copy_efficiency * 100.0
        ));

        // Regression Analysis
        if let Some(regression) = &latest.regression_analysis {
            report.push_str("\n## Regression Analysis\n\n");
            if regression.regression_detected {
                report.push_str("⚠️ **Performance regressions detected**\n\n");
                for reg in &regression.regressions {
                    report.push_str(&format!(
                        "- **{}**: {:.2}% regression\n",
                        reg.benchmark_name, reg.change_percent
                    ));
                }
            } else {
                report.push_str("✅ **No significant regressions detected**\n");
            }
        }

        // Recommendations
        report.push_str("\n## Recommendations\n\n");
        if latest.avg_efficiency >= 80.0 {
            report.push_str(
                "✅ **Excellent Performance**: The cross-module performance is very good.\n",
            );
        } else if latest.avg_efficiency >= 60.0 {
            report.push_str(
                "⚠️ **Good Performance**: Consider optimizing bottlenecks identified above.\n",
            );
        } else {
            report.push_str("❌ **Performance Issues**: Significant optimization work needed before 1.0 release.\n");
        }

        Ok(report)
    }
}

/// Internal timing data structure
#[derive(Debug)]
struct TimingData {
    name: String,
    avg_duration: Duration,
    min_duration: Duration,
    max_duration: Duration,
    std_deviation: Duration,
    throughput: f64,
    memory_usage: usize,
    operations_count: usize,
}

impl fmt::Display for RegressionSignificance {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            RegressionSignificance::Negligible => write!(f, "Negligible"),
            RegressionSignificance::Minor => write!(f, "Minor"),
            RegressionSignificance::Moderate => write!(f, "Moderate"),
            RegressionSignificance::Major => write!(f, "Major"),
            RegressionSignificance::Critical => write!(f, "Critical"),
        }
    }
}

/// Convenience function to create a default benchmark suite
#[allow(dead_code)]
pub fn create_default_benchmark_suite() -> CoreResult<CrossModuleBenchmarkRunner> {
    let config = CrossModuleBenchConfig::default();
    Ok(CrossModuleBenchmarkRunner::new(config))
}

/// Convenience function to run quick benchmarks for CI/CD
#[allow(dead_code)]
pub fn run_quick_benchmarks() -> CoreResult<BenchmarkSuiteResult> {
    let config = CrossModuleBenchConfig {
        iterations: 2,         // Reduced from 10 for quick testing
        warmup_iterations: 1,  // Reduced from 2
        datasizes: vec![1024], // Single small size for quick testing
        ns: vec![1],
        memory_limits: vec![64 * 1024 * 1024], // Single memory limit
        enable_profiling: false,
        enable_regression_detection: false,
        timeout: Duration::from_secs(30),
        ..Default::default()
    };

    let runner = CrossModuleBenchmarkRunner::new(config);
    runner.run_benchmarks()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_benchmark_config_creation() {
        let config = CrossModuleBenchConfig::default();
        assert_eq!(config.iterations, 100);
        assert_eq!(config.warmup_iterations, 10);
        assert!(!config.datasizes.is_empty());
        assert!(!config.ns.is_empty());
    }

    #[test]
    fn test_performance_measurement_creation() {
        let measurement = PerformanceMeasurement::new(
            "test_benchmark".to_string(),
            vec!["module1".to_string(), "module2".to_string()],
        );

        assert_eq!(measurement.name, "test_benchmark");
        assert_eq!(measurement.modules.len(), 2);
        assert_eq!(measurement.efficiency_score(), 0.0); // No data yet
    }

    #[test]
    fn test_benchmark_runner_creation() {
        let config = CrossModuleBenchConfig::default();
        let runner = CrossModuleBenchmarkRunner::new(config);

        // Runner should be created successfully
        assert_eq!(runner.config.iterations, 100);
    }

    #[test]
    fn test_quick_benchmarks() {
        // This test should run quickly
        match run_quick_benchmarks() {
            Ok(result) => {
                assert!(!result.measurements.is_empty());
                println!(
                    "Quick benchmarks completed: {} measurements",
                    result.measurements.len()
                );
            }
            Err(e) => {
                println!("Quick benchmarks failed: {:?}", e);
                // Don't fail the test as this might be expected in CI environments
            }
        }
    }

    /// Helper: build a runner with regression detection enabled/disabled.
    fn make_runner(enable_regression: bool) -> CrossModuleBenchmarkRunner {
        let config = CrossModuleBenchConfig {
            iterations: 2,
            warmup_iterations: 1,
            datasizes: vec![1024],
            ns: vec![1],
            memory_limits: vec![64 * 1024 * 1024],
            enable_profiling: false,
            enable_regression_detection: enable_regression,
            baseline_file: None,
            max_regression_percent: 5.0,
            timeout: Duration::from_secs(30),
        };
        CrossModuleBenchmarkRunner::new(config)
    }

    /// Helper: build a `PerformanceMeasurement` with a specific avg duration.
    fn make_measurement(name: &str, avg_nanos: u64) -> PerformanceMeasurement {
        let mut m = PerformanceMeasurement::new(name.to_string(), vec!["test-module".to_string()]);
        m.avg_duration = Duration::from_nanos(avg_nanos);
        m.datasize = 1024;
        m.operations_count = 10;
        m.memory_usage = 1024;
        m
    }

    /// Test 1: no-regression case — uniform measurements should produce no regressions.
    #[test]
    fn test_analyze_regressions_no_regression() {
        let runner = make_runner(true);

        // All benchmarks take the same time — no outliers, no regressions.
        let measurements = vec![
            make_measurement("bench_a", 1_000_000),
            make_measurement("bench_b", 1_000_000),
            make_measurement("bench_c", 1_000_000),
            make_measurement("bench_d", 1_000_000),
            make_measurement("bench_e", 1_000_000),
        ];

        let result = runner
            .analyze_regressions(&measurements)
            .expect("analyze_regressions should not fail");

        assert!(
            !result.regression_detected,
            "Uniform measurements must not trigger regression detection"
        );
        assert!(
            result.regressions.is_empty(),
            "No regression entries expected for uniform timings"
        );
        // overall_change_percent is relative to the intra-suite mean (itself), so ~0.
        assert!(
            result.overall_change_percent.abs() < 1.0,
            "Overall change should be near zero for uniform timings"
        );
    }

    /// Test 2: regression detected — one benchmark is dramatically slower (z > 2).
    #[test]
    fn test_analyze_regressions_outlier_detected() {
        let runner = make_runner(true);

        // Four benchmarks at ~1 ms; one at 100 ms — clear statistical outlier.
        let measurements = vec![
            make_measurement("bench_normal_1", 1_000_000),
            make_measurement("bench_normal_2", 1_000_000),
            make_measurement("bench_normal_3", 1_000_000),
            make_measurement("bench_normal_4", 1_000_000),
            make_measurement("bench_slow", 100_000_000), // 100 ms — huge outlier
        ];

        let result = runner
            .analyze_regressions(&measurements)
            .expect("analyze_regressions should not fail");

        assert!(
            result.regression_detected,
            "A 100x slower benchmark should trigger regression detection"
        );
        // Exactly one regression expected — the slow benchmark.
        // The 4 fast benchmarks are ~95% below mean: they appear in improvements, not regressions.
        assert_eq!(
            result.regressions.len(),
            1,
            "Exactly 1 regression expected (bench_slow); got {:?}",
            result
                .regressions
                .iter()
                .map(|r| &r.benchmark_name)
                .collect::<Vec<_>>()
        );
        assert_eq!(
            result.regressions[0].benchmark_name, "bench_slow",
            "The regressed benchmark should be bench_slow"
        );
        // The slow entry should be classified at least Minor or above.
        assert!(
            result.regressions[0].significance != RegressionSignificance::Negligible,
            "A 100x regression cannot be Negligible"
        );
        // The 4 fast entries should be in improvements (z-score < -2).
        assert_eq!(
            result.improvements.len(),
            4,
            "The 4 normal benchmarks should appear as improvements"
        );
    }

    /// Test 3: disabled regression detection gate — the conditional block in
    /// `run_benchmarks()` must produce `None` for `regression_analysis`.
    /// This test exercises the gate directly to ensure detect is skipped.
    #[test]
    fn test_regression_detection_disabled_gate() {
        let runner = make_runner(false);

        let measurements = vec![
            make_measurement("bench_normal", 1_000_000),
            make_measurement("bench_slow", 100_000_000),
        ];

        // Mirror the exact gate logic from run_benchmarks():
        let regression_analysis: Option<RegressionAnalysis> =
            if runner.config.enable_regression_detection {
                Some(
                    runner
                        .analyze_regressions(&measurements)
                        .expect("analyze_regressions failed"),
                )
            } else {
                None
            };

        assert!(
            regression_analysis.is_none(),
            "When enable_regression_detection is false, analysis must be None"
        );
    }

    /// Test 4: empty measurements — should return clean zeroed analysis.
    #[test]
    fn test_analyze_regressions_empty() {
        let runner = make_runner(true);
        let result = runner
            .analyze_regressions(&[])
            .expect("Empty input should not fail");
        assert!(!result.regression_detected);
        assert!(result.regressions.is_empty());
        assert!(result.improvements.is_empty());
        assert_eq!(result.overall_change_percent, 0.0);
    }

    /// Test 5: run_benchmarks with regression detection enabled produces Some analysis.
    #[test]
    fn test_run_benchmarks_regression_analysis_present() {
        let runner = make_runner(true);
        let suite = runner
            .run_benchmarks()
            .expect("run_benchmarks should succeed with detection enabled");
        assert!(
            suite.regression_analysis.is_some(),
            "regression_analysis should be Some when detection is enabled"
        );
    }

    /// Test 6: run_benchmarks with regression detection disabled produces None.
    #[test]
    fn test_run_benchmarks_regression_analysis_absent() {
        let runner = make_runner(false);
        let suite = runner
            .run_benchmarks()
            .expect("run_benchmarks should succeed with detection disabled");
        assert!(
            suite.regression_analysis.is_none(),
            "regression_analysis should be None when detection is disabled"
        );
    }
}