tenflowers-core 0.1.1

// Simplified Performance Benchmarks for TenfloweRS
// Validates performance optimizations with current codebase capabilities

use crate::{Device, Result, Tensor};
use std::collections::HashMap;
use std::time::Instant;

/// Simplified benchmark configuration
#[derive(Debug, Clone)]
pub struct SimpleBenchmarkConfig {
    pub warmup_iterations: usize,
    pub benchmark_iterations: usize,
    pub test_sizes: Vec<Vec<usize>>,
}

impl Default for SimpleBenchmarkConfig {
    fn default() -> Self {
        Self {
            warmup_iterations: 5,
            benchmark_iterations: 20,
            test_sizes: vec![
                vec![256, 256],   // Small tensors
                vec![512, 512],   // Medium tensors
                vec![1024, 1024], // Large tensors
            ],
        }
    }
}

/// Simple benchmark result
#[derive(Debug, Clone)]
pub struct SimpleBenchmarkResult {
    pub benchmark_name: String,
    pub execution_time_ms: f64,
    pub throughput_ops_per_sec: f64,
    pub memory_usage_mb: f64,
}

/// Simple benchmarking suite
pub struct SimpleBenchmarkSuite {
    config: SimpleBenchmarkConfig,
    results: HashMap<String, SimpleBenchmarkResult>,
}

impl SimpleBenchmarkSuite {
    /// Create new benchmark suite
    pub fn new(config: SimpleBenchmarkConfig) -> Self {
        Self {
            config,
            results: HashMap::new(),
        }
    }

    /// Run comprehensive benchmarks
    pub fn run_benchmarks(&mut self) -> Result<BenchmarkReport> {
        println!("🚀 Running TenfloweRS Performance Benchmarks...");

        // Test basic tensor operations
        self.benchmark_tensor_creation()?;
        self.benchmark_element_wise_operations()?;
        self.benchmark_matrix_operations()?;

        // Generate report
        Ok(BenchmarkReport {
            results: self.results.clone(),
            summary: self.generate_summary(),
        })
    }

    /// Benchmark tensor creation performance
    fn benchmark_tensor_creation(&mut self) -> Result<()> {
        println!("  📊 Benchmarking tensor creation...");

        for size in &self.config.test_sizes {
            if size.len() == 2 {
                let rows = size[0];
                let cols = size[1];

                // Warmup
                for _ in 0..self.config.warmup_iterations {
                    let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
                }

                // Benchmark
                let start = Instant::now();
                for _ in 0..self.config.benchmark_iterations {
                    let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
                }
                let elapsed = start.elapsed();

                let avg_time_ms =
                    elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
                let ops_per_sec = 1000.0 / avg_time_ms;
                let memory_mb = (rows * cols * 4) as f64 / (1024.0 * 1024.0); // Assuming f32

                let result = SimpleBenchmarkResult {
                    benchmark_name: format!("TensorCreation_{}x{}", rows, cols),
                    execution_time_ms: avg_time_ms,
                    throughput_ops_per_sec: ops_per_sec,
                    memory_usage_mb: memory_mb,
                };

                self.results.insert(result.benchmark_name.clone(), result);
            }
        }

        Ok(())
    }

    /// Benchmark element-wise operations
    fn benchmark_element_wise_operations(&mut self) -> Result<()> {
        println!("  ⚡ Benchmarking element-wise operations...");

        for size in &self.config.test_sizes {
            if size.len() == 2 {
                let rows = size[0];
                let cols = size[1];

                let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
                let b: Tensor<f32> = Tensor::ones(&[rows, cols]);

                // Warmup
                for _ in 0..self.config.warmup_iterations {
                    let _ = a.add(&b)?;
                }

                // Benchmark addition
                let start = Instant::now();
                for _ in 0..self.config.benchmark_iterations {
                    let _ = a.add(&b)?;
                }
                let elapsed = start.elapsed();

                let avg_time_ms =
                    elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
                let elements_per_sec = (rows * cols) as f64 / (avg_time_ms / 1000.0);
                let memory_mb = (rows * cols * 3 * 4) as f64 / (1024.0 * 1024.0); // 3 tensors * f32

                let result = SimpleBenchmarkResult {
                    benchmark_name: format!("ElementwiseAdd_{}x{}", rows, cols),
                    execution_time_ms: avg_time_ms,
                    throughput_ops_per_sec: elements_per_sec,
                    memory_usage_mb: memory_mb,
                };

                self.results.insert(result.benchmark_name.clone(), result);
            }
        }

        Ok(())
    }

    /// Benchmark matrix operations
    fn benchmark_matrix_operations(&mut self) -> Result<()> {
        println!("  🔢 Benchmarking matrix operations...");

        for size in &self.config.test_sizes {
            if size.len() == 2 {
                let rows = size[0];
                let cols = size[1];

                let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
                let b: Tensor<f32> = Tensor::ones(&[cols, rows]);

                // Warmup
                for _ in 0..self.config.warmup_iterations {
                    let _ = a.matmul(&b)?;
                }

                // Benchmark matrix multiplication
                let start = Instant::now();
                for _ in 0..self.config.benchmark_iterations {
                    let _ = a.matmul(&b)?;
                }
                let elapsed = start.elapsed();

                let avg_time_ms =
                    elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
                let flops = (rows * cols * cols * 2) as f64; // Multiply-accumulate operations
                let gflops_per_sec = flops / (avg_time_ms / 1000.0) / 1e9;
                let memory_mb = (rows * cols * 2 + rows * rows) as f64 * 4.0 / (1024.0 * 1024.0);

                let result = SimpleBenchmarkResult {
                    benchmark_name: format!("MatMul_{}x{}", rows, cols),
                    execution_time_ms: avg_time_ms,
                    throughput_ops_per_sec: gflops_per_sec,
                    memory_usage_mb: memory_mb,
                };

                self.results.insert(result.benchmark_name.clone(), result);
            }
        }

        Ok(())
    }

    /// Generate benchmark summary
    fn generate_summary(&self) -> BenchmarkSummary {
        let mut total_time = 0.0f64;
        let mut max_throughput = 0.0f64;
        let mut total_memory = 0.0f64;

        for result in self.results.values() {
            total_time += result.execution_time_ms;
            max_throughput = max_throughput.max(result.throughput_ops_per_sec);
            total_memory += result.memory_usage_mb;
        }

        BenchmarkSummary {
            total_benchmarks: self.results.len(),
            average_execution_time_ms: total_time / self.results.len() as f64,
            peak_throughput: max_throughput,
            total_memory_usage_mb: total_memory,
            performance_score: Self::calculate_performance_score(&self.results),
        }
    }

    /// Calculate overall performance score
    fn calculate_performance_score(results: &HashMap<String, SimpleBenchmarkResult>) -> f64 {
        let mut score = 0.0;
        let mut count = 0;

        for result in results.values() {
            // Simple scoring based on throughput (higher is better)
            let normalized_score = (result.throughput_ops_per_sec / 1e6).min(10.0); // Cap at 10
            score += normalized_score;
            count += 1;
        }

        if count > 0 {
            score / count as f64
        } else {
            0.0
        }
    }
}

/// Benchmark report
#[derive(Debug, Clone)]
pub struct BenchmarkReport {
    pub results: HashMap<String, SimpleBenchmarkResult>,
    pub summary: BenchmarkSummary,
}

/// Benchmark summary
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    pub total_benchmarks: usize,
    pub average_execution_time_ms: f64,
    pub peak_throughput: f64,
    pub total_memory_usage_mb: f64,
    pub performance_score: f64,
}

impl BenchmarkReport {
    /// Print comprehensive report
    pub fn print_report(&self) {
        println!("\n🎯 === TENFLOWERS PERFORMANCE BENCHMARK REPORT ===");

        println!("\n📊 Individual Benchmark Results:");
        for (name, result) in &self.results {
            println!(
                "  {} - {:.2}ms, {:.1} ops/s, {:.1} MB",
                name,
                result.execution_time_ms,
                result.throughput_ops_per_sec,
                result.memory_usage_mb
            );
        }

        println!("\n📈 Summary:");
        println!("  Total Benchmarks: {}", self.summary.total_benchmarks);
        println!(
            "  Average Execution Time: {:.2}ms",
            self.summary.average_execution_time_ms
        );
        println!(
            "  Peak Throughput: {:.1} ops/s",
            self.summary.peak_throughput
        );
        println!(
            "  Total Memory Usage: {:.1} MB",
            self.summary.total_memory_usage_mb
        );
        println!(
            "  Performance Score: {:.2}/10",
            self.summary.performance_score
        );

        // Performance evaluation
        match self.summary.performance_score {
            score if score >= 8.0 => println!("  ✅ Excellent Performance!"),
            score if score >= 6.0 => println!("  ✅ Good Performance"),
            score if score >= 4.0 => println!("  ⚠️  Moderate Performance"),
            _ => println!("  ❌ Performance Needs Improvement"),
        }
    }
}

/// Run simple benchmarks (convenience function)
pub fn run_simple_benchmarks() -> Result<BenchmarkReport> {
    let config = SimpleBenchmarkConfig::default();
    let mut suite = SimpleBenchmarkSuite::new(config);
    suite.run_benchmarks()
}

/// Validate optimization effectiveness
pub fn validate_optimizations() -> Result<()> {
    println!("🔍 === OPTIMIZATION VALIDATION ===");

    // Test CPU performance
    println!("⚡ Testing CPU Performance...");
    let _cpu_device = Device::Cpu;
    let a: Tensor<f32> = Tensor::ones(&[1000, 1000]);
    let b: Tensor<f32> = Tensor::ones(&[1000, 1000]);

    let start = Instant::now();
    let _result = a.matmul(&b)?;
    let cpu_time = start.elapsed();

    println!(
        "  CPU MatMul (1000x1000): {:.2}ms",
        cpu_time.as_secs_f64() * 1000.0
    );

    // Test SIMD effectiveness (simplified)
    println!("📊 Testing SIMD Effectiveness...");
    let large_tensor: Tensor<f32> = Tensor::ones(&[10000]);
    let another_tensor: Tensor<f32> = Tensor::ones(&[10000]);

    let start = Instant::now();
    let _result = large_tensor.add(&another_tensor)?;
    let simd_time = start.elapsed();

    println!(
        "  Element-wise Add (10k elements): {:.2}ms",
        simd_time.as_secs_f64() * 1000.0
    );

    // Memory efficiency validation
    println!("💾 Testing Memory Efficiency...");
    let memory_test_size = 5000;
    let start = Instant::now();
    let _large_matrix: Tensor<f32> = Tensor::zeros(&[memory_test_size, memory_test_size]);
    let memory_time = start.elapsed();

    let memory_mb = (memory_test_size * memory_test_size * 4) as f64 / (1024.0 * 1024.0);
    let allocation_rate = memory_mb / memory_time.as_secs_f64();

    println!(
        "  Memory Allocation ({}MB): {:.2}ms, {:.1} MB/s",
        memory_mb,
        memory_time.as_secs_f64() * 1000.0,
        allocation_rate
    );

    println!("✅ Optimization validation complete!");

    Ok(())
}