scirs2-linalg 0.4.2

Linear algebra module for SciRS2 (scirs2-linalg)
//! Advanced memory optimization and cache-aware algorithms
//!
//! This module provides cutting-edge memory optimization strategies that complement
//! the advanced SIMD operations in advanced_hardware_simd.rs:
//! - Intelligent memory prefetching with predictive patterns
//! - Cache-aware matrix blocking with dynamic sizing
//! - Branch prediction optimization techniques
//! - Memory bandwidth optimization strategies
//! - Runtime performance profiling and adaptive optimization
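//!
//! A minimal usage sketch of the types defined below (illustrative only; the
//! re-export path of this module within `scirs2-linalg` is assumed and may
//! differ, so the example is marked `ignore` rather than run as a doc test):
//!
//! ```ignore
//! use scirs2_core::ndarray::Array2;
//!
//! // Cache-aware GEMM with adaptive prefetching (accumulates into `c`).
//! let mut ops = CacheAwareMatrixOperations::new();
//! let a = Array2::<f32>::eye(256);
//! let b = Array2::<f32>::eye(256);
//! let mut c = Array2::<f32>::zeros((256, 256));
//! ops.cache_aware_gemm_f32(&a.view(), &b.view(), &mut c.view_mut())
//!     .expect("gemm failed");
//!
//! // Runtime profiling to drive adaptive optimization decisions.
//! let mut profiler = RuntimePerformanceProfiler::new();
//! profiler.start_session("gemm_256");
//! // ... run the operation to be measured ...
//! let _elapsed = profiler.end_session();
//! let _hints = profiler.analyze_and_recommend();
//! ```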

use crate::error::{LinalgError, LinalgResult};
use scirs2_core::ndarray::{Array2, ArrayView2, ArrayViewMut2};
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

/// Advanced memory access pattern analyzer for predictive prefetching
#[derive(Debug)]
pub struct MemoryAccessPatternAnalyzer {
    /// Track sequential access patterns
    sequential_access_count: AtomicU64,
    /// Track random access patterns  
    random_access_count: AtomicU64,
    /// Track stride access patterns
    stride_access_patterns: Vec<(usize, u64)>, // (stride, frequency)
    /// Cache miss predictions
    #[allow(dead_code)]
    predicted_miss_rate: f64,
}

impl MemoryAccessPatternAnalyzer {
    pub fn new() -> Self {
        Self {
            sequential_access_count: AtomicU64::new(0),
            random_access_count: AtomicU64::new(0),
            stride_access_patterns: Vec::new(),
            predicted_miss_rate: 0.05, // Conservative 5% miss rate estimate
        }
    }

    /// Recommend a prefetch strategy based on matrix dimensions (larger matrices
    /// receive more aggressive prefetching).
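    ///
    /// A hedged sketch of the size-based heuristic (thresholds as implemented below):
    ///
    /// ```ignore
    /// let analyzer = MemoryAccessPatternAnalyzer::new();
    /// // 2000 * 2000 = 4_000_000 elements > 1_000_000 => aggressive prefetching.
    /// let strategy = analyzer.analyze_and_recommend_prefetch((2000, 2000));
    /// assert!(matches!(strategy, PrefetchStrategy::Aggressive { .. }));
    /// ```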
    pub fn analyze_and_recommend_prefetch(&self, matrixdims: (usize, usize)) -> PrefetchStrategy {
        let (m, n) = matrixdims;
        let total_elements = m * n;

        // For large matrices, use aggressive prefetching
        if total_elements > 1_000_000 {
            PrefetchStrategy::Aggressive {
                prefetch_distance: 8,
                prefetch_hint: PrefetchHint::T0, // Keep in all cache levels
            }
        } else if total_elements > 100_000 {
            PrefetchStrategy::Moderate {
                prefetch_distance: 4,
                prefetch_hint: PrefetchHint::T1, // Keep in L2/L3 cache
            }
        } else {
            PrefetchStrategy::Conservative {
                prefetch_distance: 2,
                prefetch_hint: PrefetchHint::T2, // Keep in L3 cache only
            }
        }
    }

    /// Update access pattern statistics
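    ///
    /// Usage sketch (names as defined in this module):
    ///
    /// ```ignore
    /// let mut analyzer = MemoryAccessPatternAnalyzer::new();
    /// analyzer.record_access_pattern(AccessType::Sequential);
    /// analyzer.record_access_pattern(AccessType::Strided(128));
    /// ```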
    pub fn record_access_pattern(&mut self, accesstype: AccessType) {
        match accesstype {
            AccessType::Sequential => {
                self.sequential_access_count.fetch_add(1, Ordering::Relaxed);
            }
            AccessType::Random => {
                self.random_access_count.fetch_add(1, Ordering::Relaxed);
            }
            AccessType::Strided(stride) => {
                // Find existing stride pattern or create new one
                if let Some(pattern) = self
                    .stride_access_patterns
                    .iter_mut()
                    .find(|(s_, _)| *s_ == stride)
                {
                    pattern.1 += 1;
                } else {
                    self.stride_access_patterns.push((stride, 1));
                }
            }
        }
    }
}

/// Memory access pattern types for optimization
#[derive(Debug, Clone, Copy)]
pub enum AccessType {
    Sequential,
    Random,
    Strided(usize),
}

/// Prefetch strategies based on access patterns
#[derive(Debug, Clone, Copy)]
pub enum PrefetchStrategy {
    Conservative {
        prefetch_distance: usize,
        prefetch_hint: PrefetchHint,
    },
    Moderate {
        prefetch_distance: usize,
        prefetch_hint: PrefetchHint,
    },
    Aggressive {
        prefetch_distance: usize,
        prefetch_hint: PrefetchHint,
    },
}

/// Cache prefetch hints for different cache levels
#[derive(Debug, Clone, Copy)]
pub enum PrefetchHint {
    T0,  // Prefetch to all cache levels
    T1,  // Prefetch to L2 and L3
    T2,  // Prefetch to L3 only
    NTA, // Non-temporal access (bypass cache)
}

/// Cache-aware matrix operations with dynamic blocking
pub struct CacheAwareMatrixOperations {
    /// L1 cache size in bytes
    l1_cachesize: usize,
    /// L2 cache size in bytes
    l2_cachesize: usize,
    /// L3 cache size in bytes
    l3_cachesize: usize,
    /// Cache line size in bytes
    #[allow(dead_code)]
    cache_linesize: usize,
    /// Memory access pattern analyzer
    pattern_analyzer: MemoryAccessPatternAnalyzer,
}

impl CacheAwareMatrixOperations {
    pub fn new() -> Self {
        Self {
            l1_cachesize: 32 * 1024,       // 32KB L1
            l2_cachesize: 512 * 1024,      // 512KB L2
            l3_cachesize: 8 * 1024 * 1024, // 8MB L3
            cache_linesize: 64,            // 64 bytes per cache line
            pattern_analyzer: MemoryAccessPatternAnalyzer::new(),
        }
    }

    /// Calculate optimal block sizes for current cache hierarchy
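    ///
    /// For example, with the default 32 KiB L1 cache and `f32` elements (4 bytes),
    /// roughly `32768 / 3 / 4 ≈ 2730` elements fit per operand block, giving an L1
    /// block edge of about `sqrt(2730) ≈ 52`. A hedged sketch:
    ///
    /// ```ignore
    /// let ops = CacheAwareMatrixOperations::new();
    /// let sizes = ops.calculate_optimal_blocksizes(std::mem::size_of::<f32>());
    /// assert!(sizes.l1_block_m > 0 && sizes.l1_block_m <= 256);
    /// assert!(sizes.l1_block_m <= sizes.l2_block_m);
    /// ```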
    pub fn calculate_optimal_blocksizes(&self, elementsize: usize) -> CacheBlockSizes {
        // L1 cache blocking: aim to keep working set in L1
        let l1_elements = (self.l1_cachesize / 3) / elementsize; // Divide by 3 for A, B, C blocks
        let l1_blocksize = (l1_elements as f64).sqrt() as usize;

        // L2 cache blocking: intermediate level
        let l2_elements = (self.l2_cachesize / 3) / elementsize;
        let l2_blocksize = (l2_elements as f64).sqrt() as usize;

        // L3 cache blocking: largest blocks
        let l3_elements = (self.l3_cachesize / 3) / elementsize;
        let l3_blocksize = (l3_elements as f64).sqrt() as usize;

        CacheBlockSizes {
            l1_block_m: l1_blocksize.min(256),
            l1_block_n: l1_blocksize.min(256),
            l1_block_k: l1_blocksize.min(256),
            l2_block_m: l2_blocksize.min(1024),
            l2_block_n: l2_blocksize.min(1024),
            l2_block_k: l2_blocksize.min(1024),
            l3_block_m: l3_blocksize.min(4096),
            l3_block_n: l3_blocksize.min(4096),
            l3_block_k: l3_blocksize.min(4096),
        }
    }

    /// Cache-aware matrix multiplication with intelligent prefetching
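    ///
    /// Shapes must satisfy `a: (m, k)`, `b: (k, n)`, `c: (m, n)`. The product is
    /// accumulated into `c` (i.e. `c += a * b`), so `c` should be zeroed first for
    /// a plain product. A usage sketch:
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::Array2;
    ///
    /// let mut ops = CacheAwareMatrixOperations::new();
    /// let a = Array2::<f32>::ones((4, 3));
    /// let b = Array2::<f32>::ones((3, 2));
    /// let mut c = Array2::<f32>::zeros((4, 2));
    /// ops.cache_aware_gemm_f32(&a.view(), &b.view(), &mut c.view_mut())
    ///     .expect("gemm failed");
    /// assert_eq!(c[[0, 0]], 3.0);
    /// ```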
    pub fn cache_aware_gemm_f32(
        &mut self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
        c: &mut ArrayViewMut2<f32>,
    ) -> LinalgResult<()> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();

        if k != b.nrows() || m != c.nrows() || n != c.ncols() {
            return Err(LinalgError::ShapeError(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        let blocksizes = self.calculate_optimal_blocksizes(std::mem::size_of::<f32>());
        let prefetch_strategy = self.pattern_analyzer.analyze_and_recommend_prefetch((m, n));

        // Three-level cache blocking for optimal cache utilization
        self.three_level_blocked_gemm(a, b, c, &blocksizes, &prefetch_strategy)?;

        Ok(())
    }

    /// Three-level cache blocking implementation
    fn three_level_blocked_gemm(
        &mut self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
        c: &mut ArrayViewMut2<f32>,
        blocksizes: &CacheBlockSizes,
        prefetch_strategy: &PrefetchStrategy,
    ) -> LinalgResult<()> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();

        // L3 blocking (outermost)
        for ii in (0..m).step_by(blocksizes.l3_block_m) {
            for jj in (0..n).step_by(blocksizes.l3_block_n) {
                for kk in (0..k).step_by(blocksizes.l3_block_k) {
                    let i_end = (ii + blocksizes.l3_block_m).min(m);
                    let j_end = (jj + blocksizes.l3_block_n).min(n);
                    let k_end = (kk + blocksizes.l3_block_k).min(k);

                    // L2 blocking (middle)
                    for i2 in (ii..i_end).step_by(blocksizes.l2_block_m) {
                        for j2 in (jj..j_end).step_by(blocksizes.l2_block_n) {
                            for k2 in (kk..k_end).step_by(blocksizes.l2_block_k) {
                                let i2_end = (i2 + blocksizes.l2_block_m).min(i_end);
                                let j2_end = (j2 + blocksizes.l2_block_n).min(j_end);
                                let k2_end = (k2 + blocksizes.l2_block_k).min(k_end);

                                // L1 blocking (innermost) with prefetching
                                self.l1_blocked_gemm_with_prefetch(
                                    a,
                                    b,
                                    c,
                                    i2,
                                    i2_end,
                                    j2,
                                    j2_end,
                                    k2,
                                    k2_end,
                                    blocksizes,
                                    prefetch_strategy,
                                )?;
                            }
                        }
                    }
                }
            }
        }

        Ok(())
    }

    /// L1 cache blocking with intelligent prefetching
    fn l1_blocked_gemm_with_prefetch(
        &mut self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
        c: &mut ArrayViewMut2<f32>,
        i_start: usize,
        i_end: usize,
        j_start: usize,
        j_end: usize,
        k_start: usize,
        k_end: usize,
        blocksizes: &CacheBlockSizes,
        prefetch_strategy: &PrefetchStrategy,
    ) -> LinalgResult<()> {
        for i in (i_start..i_end).step_by(blocksizes.l1_block_m) {
            for j in (j_start..j_end).step_by(blocksizes.l1_block_n) {
                for k_iter in (k_start..k_end).step_by(blocksizes.l1_block_k) {
                    let i_block_end = (i + blocksizes.l1_block_m).min(i_end);
                    let j_block_end = (j + blocksizes.l1_block_n).min(j_end);
                    let k_block_end = (k_iter + blocksizes.l1_block_k).min(k_end);

                    // Perform prefetching based on the selected prefetch strategy
                    self.intelligent_prefetch(a, b, c, i, j, k_iter, prefetch_strategy);

                    // Inner computation kernel
                    for ii in i..i_block_end {
                        for jj in j..j_block_end {
                            let mut sum = 0.0f32;

                            // Vectorizable inner loop
                            for kk in k_iter..k_block_end {
                                sum += a[[ii, kk]] * b[[kk, jj]];
                            }

                            c[[ii, jj]] += sum;
                        }
                    }
                }
            }
        }

        // Record access pattern for future optimization
        self.pattern_analyzer
            .record_access_pattern(AccessType::Sequential);

        Ok(())
    }

    /// Intelligent prefetching based on access patterns and cache strategy
    fn intelligent_prefetch(
        &self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
        c: &ArrayViewMut2<f32>,
        i: usize,
        j: usize,
        k: usize,
        strategy: &PrefetchStrategy,
    ) {
        let (prefetch_distance, hint) = match strategy {
            PrefetchStrategy::Conservative {
                prefetch_distance,
                prefetch_hint,
            } => (*prefetch_distance, *prefetch_hint),
            PrefetchStrategy::Moderate {
                prefetch_distance,
                prefetch_hint,
            } => (*prefetch_distance, *prefetch_hint),
            PrefetchStrategy::Aggressive {
                prefetch_distance,
                prefetch_hint,
            } => (*prefetch_distance, *prefetch_hint),
        };

        #[cfg(target_arch = "x86_64")]
        unsafe {
            macro_rules! prefetch_with_hint {
                ($ptr:expr, $hint:expr) => {
                    match $hint {
                        PrefetchHint::T0 => _mm_prefetch($ptr as *const i8, _MM_HINT_T0),
                        PrefetchHint::T1 => _mm_prefetch($ptr as *const i8, _MM_HINT_T1),
                        PrefetchHint::T2 => _mm_prefetch($ptr as *const i8, _MM_HINT_T2),
                        PrefetchHint::NTA => _mm_prefetch($ptr as *const i8, _MM_HINT_NTA),
                    }
                };
            }

            // Prefetch future A matrix rows
            if i + prefetch_distance < a.nrows() {
                let a_ptr = &a[[i + prefetch_distance, k]] as *const f32;
                prefetch_with_hint!(a_ptr, hint);
            }

            // Prefetch future B matrix columns
            if j + prefetch_distance < b.ncols() {
                let b_ptr = &b[[k, j + prefetch_distance]] as *const f32;
                prefetch_with_hint!(b_ptr, hint);
            }

            // Prefetch future C matrix elements
            if i + prefetch_distance < c.nrows() && j + prefetch_distance < c.ncols() {
                let c_ptr = &c[[i + prefetch_distance, j + prefetch_distance]] as *const f32;
                prefetch_with_hint!(c_ptr, hint);
            }
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            // No-op on non-x86_64 platforms; silence unused-variable warnings.
            let _ = (a, b, c, i, j, k, strategy, prefetch_distance, hint);
        }
    }

    /// Cache-aware matrix transpose with optimal memory access patterns
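    ///
    /// A usage sketch (names as defined in this module):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::Array2;
    ///
    /// let mut ops = CacheAwareMatrixOperations::new();
    /// let m = Array2::<f32>::from_shape_fn((3, 5), |(i, j)| (i * 5 + j) as f32);
    /// let t = ops.cache_aware_transpose_f32(&m.view()).expect("transpose failed");
    /// assert_eq!(t.dim(), (5, 3));
    /// assert_eq!(t[[4, 2]], m[[2, 4]]);
    /// ```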
    pub fn cache_aware_transpose_f32(
        &mut self,
        input: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        let (rows, cols) = input.dim();
        let mut result = Array2::zeros((cols, rows));

        // Calculate optimal block size for cache-friendly transpose
        let elementsize = std::mem::size_of::<f32>();
        let optimal_blocksize = ((self.l1_cachesize / 2) / elementsize).min(64);
        let blocksize = (optimal_blocksize as f64).sqrt() as usize;

        // Blocked transpose to improve cache locality
        for i in (0..rows).step_by(blocksize) {
            for j in (0..cols).step_by(blocksize) {
                let i_end = (i + blocksize).min(rows);
                let j_end = (j + blocksize).min(cols);

                // Transpose block
                for ii in i..i_end {
                    for jj in j..j_end {
                        result[[jj, ii]] = input[[ii, jj]];
                    }
                }
            }
        }

        self.pattern_analyzer
            .record_access_pattern(AccessType::Strided(rows));

        Ok(result)
    }
}

/// Cache block sizes for multi-level optimization
#[derive(Debug, Clone)]
pub struct CacheBlockSizes {
    pub l1_block_m: usize,
    pub l1_block_n: usize,
    pub l1_block_k: usize,
    pub l2_block_m: usize,
    pub l2_block_n: usize,
    pub l2_block_k: usize,
    pub l3_block_m: usize,
    pub l3_block_n: usize,
    pub l3_block_k: usize,
}

/// Runtime performance profiler for adaptive optimization
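///
/// A minimal profiling sketch (names as defined below):
///
/// ```ignore
/// let mut profiler = RuntimePerformanceProfiler::new();
/// profiler.start_session("blocked_gemm");
/// // ... perform the measured operation ...
/// if let Some(elapsed) = profiler.end_session() {
///     println!("blocked_gemm took {elapsed:?}");
/// }
/// for hint in profiler.analyze_and_recommend() {
///     println!("recommendation: {hint:?}");
/// }
/// ```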
pub struct RuntimePerformanceProfiler {
    /// Operation timing history
    timing_history: Vec<(String, Duration)>,
    /// Cache miss rate estimates
    cache_miss_rates: Vec<f64>,
    /// Optimization effectiveness scores
    #[allow(dead_code)]
    optimization_scores: Vec<f64>,
    /// Current profiling session start time
    session_start: Option<Instant>,
}

impl RuntimePerformanceProfiler {
    pub fn new() -> Self {
        Self {
            timing_history: Vec::new(),
            cache_miss_rates: Vec::new(),
            optimization_scores: Vec::new(),
            session_start: None,
        }
    }

    /// Start profiling session
    pub fn start_session(&mut self, operationname: &str) {
        self.session_start = Some(Instant::now());
        self.timing_history
            .push((operationname.to_string(), Duration::ZERO));
    }

    /// End profiling session and record performance
    pub fn end_session(&mut self) -> Option<Duration> {
        if let Some(start_time) = self.session_start.take() {
            let duration = start_time.elapsed();

            // Update the last timing entry
            if let Some(last_entry) = self.timing_history.last_mut() {
                last_entry.1 = duration;
            }

            Some(duration)
        } else {
            None
        }
    }

    /// Analyze performance and recommend optimizations
    pub fn analyze_and_recommend(&self) -> Vec<OptimizationRecommendation> {
        let mut recommendations = Vec::new();

        // Analyze timing patterns
        if let Some(avg_time) = self.calculate_average_operation_time() {
            if avg_time > Duration::from_millis(100) {
                recommendations.push(OptimizationRecommendation::IncreaseBlockSize);
                recommendations.push(OptimizationRecommendation::EnableAggressivePrefetch);
            } else if avg_time < Duration::from_millis(10) {
                recommendations.push(OptimizationRecommendation::DecreaseBlockSize);
            }
        }

        // Analyze cache performance
        if let Some(avg_miss_rate) = self.calculate_average_cache_miss_rate() {
            if avg_miss_rate > 0.1 {
                recommendations.push(OptimizationRecommendation::OptimizeMemoryLayout);
                recommendations.push(OptimizationRecommendation::IncreaseBlockSize);
            }
        }

        recommendations
    }

    fn calculate_average_operation_time(&self) -> Option<Duration> {
        if self.timing_history.is_empty() {
            return None;
        }

        let total_nanos: u64 = self
            .timing_history
            .iter()
            .map(|(_, duration)| duration.as_nanos() as u64)
            .sum();

        Some(Duration::from_nanos(
            total_nanos / self.timing_history.len() as u64,
        ))
    }

    fn calculate_average_cache_miss_rate(&self) -> Option<f64> {
        if self.cache_miss_rates.is_empty() {
            return None;
        }

        Some(self.cache_miss_rates.iter().sum::<f64>() / self.cache_miss_rates.len() as f64)
    }
}

/// Optimization recommendations based on runtime profiling
#[derive(Debug, Clone)]
pub enum OptimizationRecommendation {
    IncreaseBlockSize,
    DecreaseBlockSize,
    EnableAggressivePrefetch,
    OptimizeMemoryLayout,
    SwitchToSIMDImplementation,
    UseParallelExecution,
}

impl Default for MemoryAccessPatternAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

impl Default for CacheAwareMatrixOperations {
    fn default() -> Self {
        Self::new()
    }
}

impl Default for RuntimePerformanceProfiler {
    fn default() -> Self {
        Self::new()
    }
}

/// Branch prediction optimization utilities
pub struct BranchOptimizer;

impl BranchOptimizer {
    /// Optimize conditional execution in matrix operations
    #[allow(dead_code)]
    #[inline(always)]
    pub fn likely_branch<T>(condition: bool, if_true: T, if_false: T) -> T {
        // Note: std::intrinsics::likely requires unstable features;
        // standard conditional logic is used for stable compatibility.
        if condition {
            if_true
        } else {
            if_false
        }
    }

    /// Optimize unlikely branches (e.g., error conditions)
    #[allow(dead_code)]
    #[inline(always)]
    pub fn unlikely_branch<T>(condition: bool, if_true: T, if_false: T) -> T {
        // Note: std::intrinsics::unlikely requires unstable features;
        // standard conditional logic is used for stable compatibility.
        if condition {
            if_true
        } else {
            if_false
        }
    }

    /// Prefetch-guided loop unrolling for predictable access patterns
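    ///
    /// For example, summing a slice with a fixed unroll factor of 4 (illustrative):
    ///
    /// ```ignore
    /// let data = vec![1.0f32; 1000];
    /// let mut sum = 0.0f32;
    /// BranchOptimizer::unrolled_loop_with_prefetch(0, data.len(), 4, |i| sum += data[i]);
    /// assert_eq!(sum, 1000.0);
    /// ```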
    #[allow(dead_code)]
    pub fn unrolled_loop_with_prefetch<F>(
        start: usize,
        end: usize,
        unroll_factor: usize,
        mut operation: F,
    ) where
        F: FnMut(usize),
    {
        let mut i = start;

        // Main unrolled loop
        while i + unroll_factor <= end {
            for offset in 0..unroll_factor {
                operation(i + offset);
            }
            i += unroll_factor;
        }

        // Handle remaining iterations
        while i < end {
            operation(i);
            i += 1;
        }
    }
}

/// Adaptive vectorization engine with runtime CPU feature detection
///
/// This system provides runtime detection of CPU capabilities and automatic
/// selection of optimal vectorization strategies for maximum performance.
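///
/// A usage sketch (the chosen strategy and timings depend on the host CPU):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// let mut engine = AdaptiveVectorizationEngine::new();
/// let a = Array2::<f32>::ones((128, 64));
/// let b = Array2::<f32>::ones((64, 32));
/// let c = engine
///     .adaptivematrix_multiply_f32(&a.view(), &b.view())
///     .expect("multiply failed");
/// assert_eq!(c.dim(), (128, 32));
/// // Per-strategy timings are recorded while auto-tuning is enabled.
/// let _report = engine.get_performance_report();
/// ```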
pub struct AdaptiveVectorizationEngine {
    /// Detected CPU features and capabilities
    cpu_features: CpuFeatures,
    /// Performance counters for different strategies
    strategy_performance: std::collections::HashMap<VectorizationStrategy, f64>,
    /// Auto-tuning state
    auto_tuning_enabled: bool,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VectorizationStrategy {
    /// Use SSE 4.2 instructions
    SSE42,
    /// Use AVX instructions
    AVX,
    /// Use AVX2 instructions
    AVX2,
    /// Use AVX512 instructions
    AVX512,
    /// Fallback to scalar operations
    Scalar,
}

#[derive(Debug, Clone)]
pub struct CpuFeatures {
    pub sse42: bool,
    pub avx: bool,
    pub avx2: bool,
    pub avx512: bool,
    pub fma: bool,
    pub cache_linesize: usize,
}

impl AdaptiveVectorizationEngine {
    /// Create new adaptive vectorization engine with CPU feature detection
    pub fn new() -> Self {
        let cpu_features = Self::detect_cpu_features();

        Self {
            cpu_features,
            strategy_performance: std::collections::HashMap::new(),
            auto_tuning_enabled: true,
        }
    }

    /// Detect CPU features at runtime
    #[allow(dead_code)]
    fn detect_cpu_features() -> CpuFeatures {
        #[cfg(target_arch = "x86_64")]
        {
            CpuFeatures {
                sse42: is_x86_feature_detected!("sse4.2"),
                avx: is_x86_feature_detected!("avx"),
                avx2: is_x86_feature_detected!("avx2"),
                avx512: is_x86_feature_detected!("avx512f"),
                fma: is_x86_feature_detected!("fma"),
                cache_linesize: 64, // Common cache line size
            }
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            CpuFeatures {
                sse42: false,
                avx: false,
                avx2: false,
                avx512: false,
                fma: false,
                cache_linesize: 64,
            }
        }
    }

    /// Select optimal vectorization strategy based on matrix size and CPU features
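    ///
    /// Illustrative tiering (the chosen variant depends on the features detected
    /// at runtime, so this is a sketch rather than a guaranteed outcome):
    ///
    /// ```ignore
    /// let engine = AdaptiveVectorizationEngine::new();
    /// // 1000 * 1000 = 1_000_000 elements: prefer the widest available vectors.
    /// let large = engine.select_optimal_strategy((1000, 1000));
    /// // 50 * 50 = 2_500 elements: at most SSE4.2, otherwise scalar.
    /// let small = engine.select_optimal_strategy((50, 50));
    /// println!("large: {large:?}, small: {small:?}");
    /// ```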
    pub fn select_optimal_strategy(&self, matrixsize: (usize, usize)) -> VectorizationStrategy {
        let (rows, cols) = matrixsize;
        let total_elements = rows * cols;

        // For very large matrices, prefer the most advanced vectorization available
        if total_elements > 100_000 {
            if self.cpu_features.avx512 {
                return VectorizationStrategy::AVX512;
            } else if self.cpu_features.avx2 {
                return VectorizationStrategy::AVX2;
            } else if self.cpu_features.avx {
                return VectorizationStrategy::AVX;
            }
        }

        // For medium matrices, balance between complexity and performance
        if total_elements > 10_000 {
            if self.cpu_features.avx2 {
                return VectorizationStrategy::AVX2;
            } else if self.cpu_features.avx {
                return VectorizationStrategy::AVX;
            } else if self.cpu_features.sse42 {
                return VectorizationStrategy::SSE42;
            }
        }

        // For small matrices, use simpler vectorization or scalar
        if self.cpu_features.sse42 && total_elements > 1_000 {
            VectorizationStrategy::SSE42
        } else {
            VectorizationStrategy::Scalar
        }
    }

    /// Adaptive matrix multiplication with optimal vectorization
    #[allow(dead_code)]
    pub fn adaptivematrix_multiply_f32(
        &mut self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        let start_time = Instant::now();

        let strategy = self.select_optimal_strategy((a.nrows(), a.ncols()));
        let result = match strategy {
            VectorizationStrategy::AVX512 => self.matrix_multiply_avx512_f32(a, b),
            VectorizationStrategy::AVX2 => self.matrix_multiply_avx2_f32(a, b),
            VectorizationStrategy::AVX => self.matrix_multiply_avx_f32(a, b),
            VectorizationStrategy::SSE42 => self.matrix_multiply_sse42_f32(a, b),
            VectorizationStrategy::Scalar => self.matrix_multiply_scalar_f32(a, b),
        };

        // Record performance for auto-tuning
        if self.auto_tuning_enabled {
            let duration = start_time.elapsed().as_secs_f64();
            self.strategy_performance.insert(strategy, duration);
        }

        result
    }

    /// AVX512 optimized matrix multiplication (placeholder implementation)
    #[allow(dead_code)]
    fn matrix_multiply_avx512_f32(
        &self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        // For now, fall back to AVX2 as AVX512 requires more complex implementation
        self.matrix_multiply_avx2_f32(a, b)
    }

    /// AVX2 optimized matrix multiplication
    #[allow(dead_code)]
    fn matrix_multiply_avx2_f32(
        &self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        if a.ncols() != b.nrows() {
            return Err(LinalgError::ShapeError(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        let (m, k) = a.dim();
        let n = b.ncols();
        let mut result = Array2::zeros((m, n));

        // Use blocked algorithm for better cache performance
        const BLOCK_SIZE: usize = 64;

        for i in (0..m).step_by(BLOCK_SIZE) {
            for j in (0..n).step_by(BLOCK_SIZE) {
                for kk in (0..k).step_by(BLOCK_SIZE) {
                    let i_end = (i + BLOCK_SIZE).min(m);
                    let j_end = (j + BLOCK_SIZE).min(n);
                    let k_end = (kk + BLOCK_SIZE).min(k);

                    // Block multiplication with vectorization
                    for ii in i..i_end {
                        for jj in (j..j_end).step_by(8) {
                            let jj_end = (jj + 8).min(j_end);
                            for kkk in kk..k_end {
                                let a_val = a[[ii, kkk]];
                                for jjj in jj..jj_end {
                                    result[[ii, jjj]] += a_val * b[[kkk, jjj]];
                                }
                            }
                        }
                    }
                }
            }
        }

        Ok(result)
    }

    /// AVX optimized matrix multiplication
    #[allow(dead_code)]
    fn matrix_multiply_avx_f32(
        &self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        // Simplified implementation - in practice would use AVX intrinsics
        self.matrix_multiply_scalar_f32(a, b)
    }

    /// SSE4.2 optimized matrix multiplication
    #[allow(dead_code)]
    fn matrix_multiply_sse42_f32(
        &self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        // Simplified implementation - in practice would use SSE intrinsics
        self.matrix_multiply_scalar_f32(a, b)
    }

    /// Scalar fallback matrix multiplication
    fn matrix_multiply_scalar_f32(
        &self,
        a: &ArrayView2<f32>,
        b: &ArrayView2<f32>,
    ) -> LinalgResult<Array2<f32>> {
        if a.ncols() != b.nrows() {
            return Err(LinalgError::ShapeError(
                "Matrix dimensions incompatible for multiplication".to_string(),
            ));
        }

        let (m, k) = a.dim();
        let n = b.ncols();
        let mut result = Array2::zeros((m, n));

        for i in 0..m {
            for j in 0..n {
                for kk in 0..k {
                    result[[i, j]] += a[[i, kk]] * b[[kk, j]];
                }
            }
        }

        Ok(result)
    }

    /// Get performance report for different strategies
    #[allow(dead_code)]
    pub fn get_performance_report(&self) -> std::collections::HashMap<VectorizationStrategy, f64> {
        self.strategy_performance.clone()
    }

    /// Enable or disable auto-tuning
    #[allow(dead_code)]
    pub fn set_auto_tuning(&mut self, enabled: bool) {
        self.auto_tuning_enabled = enabled;
    }
}

impl Default for AdaptiveVectorizationEngine {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_cache_awarematrix_operations() {
        let mut cache_ops = CacheAwareMatrixOperations::new();

        let a = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let b = array![[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]];
        let mut c = Array2::zeros((2, 2));

        let result = cache_ops.cache_aware_gemm_f32(&a.view(), &b.view(), &mut c.view_mut());
        assert!(result.is_ok());

        // Expected: [[58, 64], [139, 154]]
        assert_abs_diff_eq!(c[[0, 0]], 58.0, epsilon = 1e-6);
        assert_abs_diff_eq!(c[[0, 1]], 64.0, epsilon = 1e-6);
        assert_abs_diff_eq!(c[[1, 0]], 139.0, epsilon = 1e-6);
        assert_abs_diff_eq!(c[[1, 1]], 154.0, epsilon = 1e-6);
    }

    #[test]
    fn test_cache_aware_transpose() {
        let mut cache_ops = CacheAwareMatrixOperations::new();

        let input = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let result = cache_ops
            .cache_aware_transpose_f32(&input.view())
            .expect("Operation failed");

        let expected = array![[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]];

        for (actual, expected) in result.iter().zip(expected.iter()) {
            assert_abs_diff_eq!(*actual, *expected, epsilon = 1e-10);
        }
    }

    #[test]
    fn test_memory_access_pattern_analyzer() {
        let analyzer = MemoryAccessPatternAnalyzer::new();

        let strategy = analyzer.analyze_and_recommend_prefetch((1000, 1000));
        if let PrefetchStrategy::Aggressive {
            prefetch_distance, ..
        } = strategy
        {
            assert!(prefetch_distance > 0);
        }
    }

    #[test]
    fn test_runtime_performance_profiler() {
        let mut profiler = RuntimePerformanceProfiler::new();

        profiler.start_session("test_operation");
        std::thread::sleep(Duration::from_millis(1));
        let duration = profiler.end_session();

        assert!(duration.is_some());
        assert!(duration.expect("Operation failed") >= Duration::from_millis(1));

        let recommendations = profiler.analyze_and_recommend();
        // Should provide some recommendations based on timing
        assert!(!recommendations.is_empty() || profiler.timing_history.len() < 2);
    }

    #[test]
    fn test_branch_optimizer() {
        let result1 = BranchOptimizer::likely_branch(true, 42, 0);
        assert_eq!(result1, 42);

        let result2 = BranchOptimizer::unlikely_branch(false, 0, 42);
        assert_eq!(result2, 42);
    }

    #[test]
    fn test_adaptive_vectorization_engine() {
        let mut engine = AdaptiveVectorizationEngine::new();

        // Test CPU feature detection
        let features = &engine.cpu_features;
        assert!(features.cache_linesize > 0);

        // Test strategy selection for different matrix sizes
        let small_strategy = engine.select_optimal_strategy((10, 10));
        let medium_strategy = engine.select_optimal_strategy((100, 100));
        let large_strategy = engine.select_optimal_strategy((1000, 1000));

        // Verify strategies are appropriate for size
        assert!(matches!(
            small_strategy,
            VectorizationStrategy::Scalar | VectorizationStrategy::SSE42
        ));
        println!("Small matrix strategy: {:?}", small_strategy);
        println!("Medium matrix strategy: {:?}", medium_strategy);
        println!("Large matrix strategy: {:?}", large_strategy);

        // Test matrix multiplication with small matrices
        let a = array![[1.0f32, 2.0], [3.0, 4.0]];
        let b = array![[5.0f32, 6.0], [7.0, 8.0]];

        let result = engine
            .adaptivematrix_multiply_f32(&a.view(), &b.view())
            .expect("Operation failed");

        // Verify result correctness
        let expected = array![[19.0f32, 22.0], [43.0, 50.0]];
        for (actual, expected) in result.iter().zip(expected.iter()) {
            assert_abs_diff_eq!(*actual, *expected, epsilon = 1e-10);
        }

        // Test auto-tuning functionality
        engine.set_auto_tuning(false);
        let performance_report = engine.get_performance_report();
        assert!(!performance_report.is_empty());
    }
}