sklears-simd 0.1.2

//! Enhanced SIMD-optimized matrix operations
//!
//! This module provides high-performance matrix operations using SIMD instructions
//! for common machine learning operations like matrix multiplication, transpose,
//! and element-wise operations.

use scirs2_autograd::ndarray::{s, Array1, Array2, ArrayView2, ArrayViewMut2};

// Conditional imports for no-std compatibility
#[cfg(feature = "no-std")]
use alloc::{string::ToString, vec::Vec};
#[cfg(not(feature = "no-std"))]
use std::{string::ToString, vec::Vec};

/// Generic matrix multiplication function compatible with FPGA/TPU interfaces.
///
/// # Safety
///
/// `a`, `b`, and `c` must be valid, non-null pointers. `a` must point to `m * k` f32 values,
/// `b` to `k * n` f32 values, and `c` to writable storage for `m * n` f32 values.
pub unsafe fn matrix_multiply(
    a: *const f32,
    b: *const f32,
    c: *mut f32,
    m: usize,
    n: usize,
    k: usize,
) -> Result<(), crate::traits::SimdError> {
    // Basic bounds checking
    if a.is_null() || b.is_null() || c.is_null() {
        return Err(crate::traits::SimdError::InvalidInput(
            "Null pointer provided".to_string(),
        ));
    }

    // Simple matrix multiplication: C = A * B
    // A is m x k, B is k x n, C is m x n
    for i in 0..m {
        for j in 0..n {
            let mut sum = 0.0f32;
            for ki in 0..k {
                sum += *a.add(i * k + ki) * *b.add(ki * n + j);
            }
            *c.add(i * n + j) = sum;
        }
    }
    Ok(())
}

/// SIMD-optimized matrix multiplication for f32
///
/// Uses cache-friendly blocking and SIMD instructions for optimal performance
pub fn matrix_multiply_f32_simd(a: &Array2<f32>, b: &Array2<f32>) -> Array2<f32> {
    let (m, k) = a.dim();
    let (k2, n) = b.dim();

    assert_eq!(k, k2, "Matrix dimensions must match for multiplication");

    let mut result = Array2::zeros((m, n));

    // Use blocking for cache efficiency
    const BLOCK_SIZE: usize = 64;

    for i_block in (0..m).step_by(BLOCK_SIZE) {
        for j_block in (0..n).step_by(BLOCK_SIZE) {
            for k_block in (0..k).step_by(BLOCK_SIZE) {
                let i_end = (i_block + BLOCK_SIZE).min(m);
                let j_end = (j_block + BLOCK_SIZE).min(n);
                let k_end = (k_block + BLOCK_SIZE).min(k);

                // SIMD-optimized block multiplication
                matrix_multiply_block_simd(
                    &a.slice(s![i_block..i_end, k_block..k_end]),
                    &b.slice(s![k_block..k_end, j_block..j_end]),
                    &mut result.slice_mut(s![i_block..i_end, j_block..j_end]),
                );
            }
        }
    }

    result
}

/// SIMD-optimized block matrix multiplication
fn matrix_multiply_block_simd(
    a: &ArrayView2<f32>,
    b: &ArrayView2<f32>,
    c: &mut ArrayViewMut2<f32>,
) {
    let (_m, _k) = a.dim();
    let (_, _n) = b.dim();

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if crate::simd_feature_detected!("avx2") && crate::simd_feature_detected!("fma") {
            unsafe { matrix_multiply_avx2_fma(a, b, c) };
            return;
        } else if crate::simd_feature_detected!("avx2") {
            unsafe { matrix_multiply_avx2(a, b, c) };
            return;
        } else if crate::simd_feature_detected!("sse2") {
            unsafe { matrix_multiply_sse2(a, b, c) };
            return;
        }
    }

    // Scalar fallback
    matrix_multiply_scalar(a, b, c);
}

/// Scalar matrix multiplication
fn matrix_multiply_scalar(a: &ArrayView2<f32>, b: &ArrayView2<f32>, c: &mut ArrayViewMut2<f32>) {
    let (m, k) = a.dim();
    let (_, n) = b.dim();

    for i in 0..m {
        for j in 0..n {
            let mut sum = c[[i, j]]; // Accumulate (for blocking)
            for ki in 0..k {
                sum += a[[i, ki]] * b[[ki, j]];
            }
            c[[i, j]] = sum;
        }
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn matrix_multiply_sse2(
    a: &ArrayView2<f32>,
    b: &ArrayView2<f32>,
    c: &mut ArrayViewMut2<f32>,
) {
    use core::arch::x86_64::*;

    let (m, k) = a.dim();
    let (_, n) = b.dim();

    for i in 0..m {
        for j in (0..n).step_by(4) {
            let mut sum = if j + 4 <= n {
                _mm_loadu_ps(&c[[i, j]])
            } else {
                _mm_setzero_ps()
            };

            for ki in 0..k {
                let a_val = _mm_set1_ps(a[[i, ki]]);

                if j + 4 <= n {
                    let b_vec = _mm_loadu_ps(&b[[ki, j]]);
                    sum = _mm_add_ps(sum, _mm_mul_ps(a_val, b_vec));
                } else {
                    // Handle remaining elements
                    let mut b_vals = [0.0f32; 4];
                    for jj in 0..(n - j) {
                        b_vals[jj] = b[[ki, j + jj]];
                    }
                    let b_vec = _mm_loadu_ps(b_vals.as_ptr());
                    sum = _mm_add_ps(sum, _mm_mul_ps(a_val, b_vec));
                }
            }

            if j + 4 <= n {
                _mm_storeu_ps(&mut c[[i, j]], sum);
            } else {
                let mut result = [0.0f32; 4];
                _mm_storeu_ps(result.as_mut_ptr(), sum);
                for jj in 0..(n - j) {
                    c[[i, j + jj]] = result[jj];
                }
            }
        }
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn matrix_multiply_avx2(
    a: &ArrayView2<f32>,
    b: &ArrayView2<f32>,
    c: &mut ArrayViewMut2<f32>,
) {
    use core::arch::x86_64::*;

    let (m, k) = a.dim();
    let (_, n) = b.dim();

    for i in 0..m {
        for j in (0..n).step_by(8) {
            let mut sum = if j + 8 <= n {
                _mm256_loadu_ps(&c[[i, j]])
            } else {
                _mm256_setzero_ps()
            };

            for ki in 0..k {
                let a_val = _mm256_set1_ps(a[[i, ki]]);

                if j + 8 <= n {
                    let b_vec = _mm256_loadu_ps(&b[[ki, j]]);
                    sum = _mm256_add_ps(sum, _mm256_mul_ps(a_val, b_vec));
                } else {
                    // Handle remaining elements
                    let mut b_vals = [0.0f32; 8];
                    for jj in 0..(n - j) {
                        b_vals[jj] = b[[ki, j + jj]];
                    }
                    let b_vec = _mm256_loadu_ps(b_vals.as_ptr());
                    sum = _mm256_add_ps(sum, _mm256_mul_ps(a_val, b_vec));
                }
            }

            if j + 8 <= n {
                _mm256_storeu_ps(&mut c[[i, j]], sum);
            } else {
                let mut result = [0.0f32; 8];
                _mm256_storeu_ps(result.as_mut_ptr(), sum);
                for jj in 0..(n - j) {
                    c[[i, j + jj]] = result[jj];
                }
            }
        }
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn matrix_multiply_avx2_fma(
    a: &ArrayView2<f32>,
    b: &ArrayView2<f32>,
    c: &mut ArrayViewMut2<f32>,
) {
    use core::arch::x86_64::*;

    let (m, k) = a.dim();
    let (_, n) = b.dim();

    for i in 0..m {
        for j in (0..n).step_by(8) {
            let mut sum = if j + 8 <= n {
                _mm256_loadu_ps(&c[[i, j]])
            } else {
                _mm256_setzero_ps()
            };

            for ki in 0..k {
                let a_val = _mm256_set1_ps(a[[i, ki]]);

                if j + 8 <= n {
                    let b_vec = _mm256_loadu_ps(&b[[ki, j]]);
                    sum = _mm256_fmadd_ps(a_val, b_vec, sum);
                } else {
                    // Handle remaining elements
                    let mut b_vals = [0.0f32; 8];
                    for jj in 0..(n - j) {
                        b_vals[jj] = b[[ki, j + jj]];
                    }
                    let b_vec = _mm256_loadu_ps(b_vals.as_ptr());
                    sum = _mm256_fmadd_ps(a_val, b_vec, sum);
                }
            }

            if j + 8 <= n {
                _mm256_storeu_ps(&mut c[[i, j]], sum);
            } else {
                let mut result = [0.0f32; 8];
                _mm256_storeu_ps(result.as_mut_ptr(), sum);
                for jj in 0..(n - j) {
                    c[[i, j + jj]] = result[jj];
                }
            }
        }
    }
}

/// SIMD-optimized matrix-vector multiplication
pub fn matrix_vector_multiply_f32(matrix: &Array2<f32>, vector: &Array1<f32>) -> Array1<f32> {
    let (m, n) = matrix.dim();
    assert_eq!(n, vector.len(), "Matrix columns must match vector length");

    let mut result = Array1::zeros(m);

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if crate::simd_feature_detected!("avx2") && crate::simd_feature_detected!("fma") {
            unsafe { matrix_vector_multiply_avx2_fma(matrix, vector, &mut result) };
            return result;
        } else if crate::simd_feature_detected!("avx2") {
            unsafe { matrix_vector_multiply_avx2(matrix, vector, &mut result) };
            return result;
        } else if crate::simd_feature_detected!("sse2") {
            unsafe { matrix_vector_multiply_sse2(matrix, vector, &mut result) };
            return result;
        }
    }

    // Scalar fallback
    for i in 0..m {
        let mut sum = 0.0;
        for j in 0..n {
            sum += matrix[[i, j]] * vector[j];
        }
        result[i] = sum;
    }

    result
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn matrix_vector_multiply_sse2(
    matrix: &Array2<f32>,
    vector: &Array1<f32>,
    result: &mut Array1<f32>,
) {
    use core::arch::x86_64::*;

    let (m, n) = matrix.dim();

    for i in 0..m {
        let mut sum = _mm_setzero_ps();
        let mut j = 0;

        while j + 4 <= n {
            let m_vec = _mm_loadu_ps(&matrix[[i, j]]);
            let v_vec = _mm_loadu_ps(&vector[j]);
            sum = _mm_add_ps(sum, _mm_mul_ps(m_vec, v_vec));
            j += 4;
        }

        let mut result_array = [0.0f32; 4];
        _mm_storeu_ps(result_array.as_mut_ptr(), sum);
        let mut scalar_sum = result_array.iter().sum::<f32>();

        while j < n {
            scalar_sum += matrix[[i, j]] * vector[j];
            j += 1;
        }

        result[i] = scalar_sum;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn matrix_vector_multiply_avx2(
    matrix: &Array2<f32>,
    vector: &Array1<f32>,
    result: &mut Array1<f32>,
) {
    use core::arch::x86_64::*;

    let (m, n) = matrix.dim();

    for i in 0..m {
        let mut sum = _mm256_setzero_ps();
        let mut j = 0;

        while j + 8 <= n {
            let m_vec = _mm256_loadu_ps(&matrix[[i, j]]);
            let v_vec = _mm256_loadu_ps(&vector[j]);
            sum = _mm256_add_ps(sum, _mm256_mul_ps(m_vec, v_vec));
            j += 8;
        }

        let mut result_array = [0.0f32; 8];
        _mm256_storeu_ps(result_array.as_mut_ptr(), sum);
        let mut scalar_sum = result_array.iter().sum::<f32>();

        while j < n {
            scalar_sum += matrix[[i, j]] * vector[j];
            j += 1;
        }

        result[i] = scalar_sum;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn matrix_vector_multiply_avx2_fma(
    matrix: &Array2<f32>,
    vector: &Array1<f32>,
    result: &mut Array1<f32>,
) {
    use core::arch::x86_64::*;

    let (m, n) = matrix.dim();

    for i in 0..m {
        let mut sum = _mm256_setzero_ps();
        let mut j = 0;

        while j + 8 <= n {
            let m_vec = _mm256_loadu_ps(&matrix[[i, j]]);
            let v_vec = _mm256_loadu_ps(&vector[j]);
            sum = _mm256_fmadd_ps(m_vec, v_vec, sum);
            j += 8;
        }

        let mut result_array = [0.0f32; 8];
        _mm256_storeu_ps(result_array.as_mut_ptr(), sum);
        let mut scalar_sum = result_array.iter().sum::<f32>();

        while j < n {
            scalar_sum += matrix[[i, j]] * vector[j];
            j += 1;
        }

        result[i] = scalar_sum;
    }
}

/// SIMD-optimized element-wise matrix operations
pub fn elementwise_add_simd(a: &Array2<f32>, b: &Array2<f32>) -> Array2<f32> {
    assert_eq!(a.shape(), b.shape(), "Arrays must have the same shape");

    let mut result = Array2::zeros(a.dim());

    if let (Some(a_slice), Some(b_slice), Some(result_slice)) =
        (a.as_slice(), b.as_slice(), result.as_slice_mut())
    {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if crate::simd_feature_detected!("avx2") {
                unsafe { elementwise_add_avx2(a_slice, b_slice, result_slice) };
                return result;
            } else if crate::simd_feature_detected!("sse2") {
                unsafe { elementwise_add_sse2(a_slice, b_slice, result_slice) };
                return result;
            }
        }

        // Scalar fallback
        for i in 0..a_slice.len() {
            result_slice[i] = a_slice[i] + b_slice[i];
        }
    }

    result
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn elementwise_add_sse2(a: &[f32], b: &[f32], result: &mut [f32]) {
    use core::arch::x86_64::*;

    let mut i = 0;

    while i + 4 <= a.len() {
        let a_vec = _mm_loadu_ps(a.as_ptr().add(i));
        let b_vec = _mm_loadu_ps(b.as_ptr().add(i));
        let sum = _mm_add_ps(a_vec, b_vec);
        _mm_storeu_ps(result.as_mut_ptr().add(i), sum);
        i += 4;
    }

    while i < a.len() {
        result[i] = a[i] + b[i];
        i += 1;
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn elementwise_add_avx2(a: &[f32], b: &[f32], result: &mut [f32]) {
    use core::arch::x86_64::*;

    let mut i = 0;

    while i + 8 <= a.len() {
        let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
        let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
        let sum = _mm256_add_ps(a_vec, b_vec);
        _mm256_storeu_ps(result.as_mut_ptr().add(i), sum);
        i += 8;
    }

    while i < a.len() {
        result[i] = a[i] + b[i];
        i += 1;
    }
}

/// SIMD-optimized matrix transpose with cache-friendly blocking
pub fn transpose_simd(matrix: &Array2<f32>) -> Array2<f32> {
    let (m, n) = matrix.dim();
    let mut result = Array2::zeros((n, m));

    const BLOCK_SIZE: usize = 64;

    for i_block in (0..m).step_by(BLOCK_SIZE) {
        for j_block in (0..n).step_by(BLOCK_SIZE) {
            let i_end = (i_block + BLOCK_SIZE).min(m);
            let j_end = (j_block + BLOCK_SIZE).min(n);

            // Transpose block
            for i in i_block..i_end {
                for j in j_block..j_end {
                    result[[j, i]] = matrix[[i, j]];
                }
            }
        }
    }

    result
}

/// SIMD-optimized matrix reduction operations
pub fn matrix_sum_simd(matrix: &Array2<f32>) -> f32 {
    if let Some(slice) = matrix.as_slice() {
        crate::vector::sum(slice)
    } else {
        matrix.iter().sum()
    }
}

pub fn matrix_mean_simd(matrix: &Array2<f32>) -> f32 {
    if let Some(slice) = matrix.as_slice() {
        crate::vector::mean(slice)
    } else {
        matrix.iter().sum::<f32>() / matrix.len() as f32
    }
}

pub fn matrix_variance_simd(matrix: &Array2<f32>) -> f32 {
    if let Some(slice) = matrix.as_slice() {
        crate::vector::variance(slice)
    } else {
        let mean = matrix_mean_simd(matrix);
        let sum_squared_diff: f32 = matrix
            .iter()
            .map(|&x| {
                let diff = x - mean;
                diff * diff
            })
            .sum();
        sum_squared_diff / (matrix.len() - 1).max(1) as f32
    }
}

/// SIMD-optimized Householder QR decomposition
pub fn qr_decomposition_simd(matrix: &Array2<f32>) -> (Array2<f32>, Array2<f32>) {
    let (m, n) = matrix.dim();
    let mut q = Array2::eye(m);
    let mut r = matrix.clone();

    for k in 0..n.min(m - 1) {
        // Get the k-th column starting from row k
        let col = r.slice(s![k.., k]).to_owned();

        // Compute Householder vector
        let (householder_vec, tau) = compute_householder_vector(&col);

        // Apply Householder transformation to R
        apply_householder_left(&householder_vec, tau, &mut r.slice_mut(s![k.., k..]));

        // Apply Householder transformation to Q
        apply_householder_right(&householder_vec, tau, &mut q.slice_mut(s![.., k..]));
    }

    // Transpose Q to get the correct Q matrix
    (q.t().to_owned(), r)
}

/// Compute Householder vector for QR decomposition
fn compute_householder_vector(x: &Array1<f32>) -> (Array1<f32>, f32) {
    let n = x.len();
    if n == 0 {
        return (Array1::zeros(0), 0.0);
    }

    let mut v = x.clone();
    let norm_x = crate::vector::norm_l2(x.as_slice().expect("slice operation should succeed"));

    if norm_x < 1e-10 {
        return (v, 0.0);
    }

    let sigma = if x[0] >= 0.0 { norm_x } else { -norm_x };
    v[0] += sigma;

    let norm_v = crate::vector::norm_l2(v.as_slice().expect("slice operation should succeed"));
    if norm_v < 1e-10 {
        return (v, 0.0);
    }

    // Normalize v
    for val in v.iter_mut() {
        *val /= norm_v;
    }

    let tau = 2.0;
    (v, tau)
}

/// Apply Householder transformation H = I - tau * v * v^T to matrix A from the left
fn apply_householder_left(v: &Array1<f32>, tau: f32, a: &mut ArrayViewMut2<f32>) {
    if tau == 0.0 {
        return;
    }

    let (m, n) = a.dim();

    for j in 0..n {
        let mut col = a.column_mut(j);
        let dot_product = crate::vector::dot_product(
            v.as_slice().expect("slice operation should succeed"),
            col.to_owned()
                .as_slice()
                .expect("slice operation should succeed"),
        );
        let factor = tau * dot_product;

        for i in 0..m {
            col[i] -= factor * v[i];
        }
    }
}

/// Apply Householder transformation H = I - tau * v * v^T to matrix A from the right
fn apply_householder_right(v: &Array1<f32>, tau: f32, a: &mut ArrayViewMut2<f32>) {
    if tau == 0.0 {
        return;
    }

    let (m, n) = a.dim();

    for i in 0..m {
        let mut row = a.row_mut(i);
        let dot_product = crate::vector::dot_product(
            row.to_owned()
                .as_slice()
                .expect("matrix indexing should be valid"),
            v.as_slice().expect("matrix indexing should be valid"),
        );
        let factor = tau * dot_product;

        for j in 0..n {
            row[j] -= factor * v[j];
        }
    }
}

/// SIMD-optimized singular value decomposition (simplified SVD using QR)
pub fn svd_simd(matrix: &Array2<f32>) -> (Array2<f32>, Array1<f32>, Array2<f32>) {
    let (m, n) = matrix.dim();

    // For demonstration, we'll use a simplified approach
    // In practice, a full SVD implementation would use bidiagonalization

    // Step 1: QR decomposition of A
    let (q1, r) = qr_decomposition_simd(matrix);

    // Step 2: QR decomposition of R^T
    let (q2, r2) = qr_decomposition_simd(&r.t().to_owned());

    // Extract diagonal elements as singular values (approximation)
    let min_dim = m.min(n);
    let mut singular_values = Array1::zeros(min_dim);
    for i in 0..min_dim {
        if i < r2.nrows() && i < r2.ncols() {
            singular_values[i] = r2[[i, i]].abs();
        }
    }

    // Sort singular values in descending order
    let mut indices: Vec<usize> = (0..min_dim).collect();
    indices.sort_by(|&a, &b| {
        singular_values[b]
            .partial_cmp(&singular_values[a])
            .expect("operation should succeed")
    });

    let mut sorted_values = Array1::zeros(min_dim);
    for (i, &idx) in indices.iter().enumerate() {
        sorted_values[i] = singular_values[idx];
    }

    // This is a simplified SVD - a full implementation would be more complex
    (q1, sorted_values, q2.t().to_owned())
}

/// SIMD-optimized power iteration for computing dominant eigenvalue
pub fn power_iteration_simd(
    matrix: &Array2<f32>,
    max_iterations: usize,
    tolerance: f32,
) -> (f32, Array1<f32>) {
    let n = matrix.nrows();
    assert_eq!(
        n,
        matrix.ncols(),
        "Matrix must be square for eigenvalue computation"
    );

    // Initialize with random vector
    let mut v = Array1::from_elem(n, 1.0 / (n as f32).sqrt());
    let mut eigenvalue = 0.0;

    for _ in 0..max_iterations {
        // v_new = A * v
        let v_new = matrix_vector_multiply_f32(matrix, &v);

        // Compute eigenvalue estimate
        let new_eigenvalue = crate::vector::dot_product(
            v.as_slice().expect("slice operation should succeed"),
            v_new.as_slice().expect("slice operation should succeed"),
        );

        // Normalize v_new
        let norm =
            crate::vector::norm_l2(v_new.as_slice().expect("slice operation should succeed"));
        if norm < 1e-10 {
            break;
        }

        v = v_new / norm;

        // Check convergence
        if (new_eigenvalue - eigenvalue).abs() < tolerance {
            eigenvalue = new_eigenvalue;
            break;
        }

        eigenvalue = new_eigenvalue;
    }

    (eigenvalue, v)
}

/// SIMD-optimized computation of multiple eigenvalues using deflation
pub fn eigenvalues_simd(matrix: &Array2<f32>, num_eigenvalues: usize) -> Vec<(f32, Array1<f32>)> {
    let n = matrix.nrows();
    assert_eq!(n, matrix.ncols(), "Matrix must be square");

    let mut eigenvalues = Vec::new();
    let mut a = matrix.clone();

    for _ in 0..num_eigenvalues.min(n) {
        // Compute dominant eigenvalue and eigenvector
        let (eigenvalue, eigenvector) = power_iteration_simd(&a, 1000, 1e-6);

        if eigenvalue.abs() < 1e-10 {
            break;
        }

        eigenvalues.push((eigenvalue, eigenvector.clone()));

        // Deflation: A = A - λ * v * v^T
        let outer_product = compute_outer_product_simd(&eigenvector, &eigenvector);
        let deflation = &outer_product * eigenvalue;
        a = &a - &deflation;
    }

    eigenvalues
}

/// SIMD-optimized outer product computation
pub fn compute_outer_product_simd(a: &Array1<f32>, b: &Array1<f32>) -> Array2<f32> {
    let m = a.len();
    let n = b.len();
    let mut result = Array2::zeros((m, n));

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if crate::simd_feature_detected!("avx2") {
            unsafe { outer_product_avx2(a, b, &mut result) };
            return result;
        } else if crate::simd_feature_detected!("sse2") {
            unsafe { outer_product_sse2(a, b, &mut result) };
            return result;
        }
    }

    // Scalar fallback
    for i in 0..m {
        for j in 0..n {
            result[[i, j]] = a[i] * b[j];
        }
    }

    result
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn outer_product_sse2(a: &Array1<f32>, b: &Array1<f32>, result: &mut Array2<f32>) {
    use core::arch::x86_64::*;

    let m = a.len();
    let n = b.len();

    for i in 0..m {
        let a_val = _mm_set1_ps(a[i]);
        let mut j = 0;

        while j + 4 <= n {
            let b_vec = _mm_loadu_ps(&b[j]);
            let product = _mm_mul_ps(a_val, b_vec);
            _mm_storeu_ps(&mut result[[i, j]], product);
            j += 4;
        }

        // Handle remaining elements
        while j < n {
            result[[i, j]] = a[i] * b[j];
            j += 1;
        }
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn outer_product_avx2(a: &Array1<f32>, b: &Array1<f32>, result: &mut Array2<f32>) {
    use core::arch::x86_64::*;

    let m = a.len();
    let n = b.len();

    for i in 0..m {
        let a_val = _mm256_set1_ps(a[i]);
        let mut j = 0;

        while j + 8 <= n {
            let b_vec = _mm256_loadu_ps(&b[j]);
            let product = _mm256_mul_ps(a_val, b_vec);
            _mm256_storeu_ps(&mut result[[i, j]], product);
            j += 8;
        }

        // Handle remaining elements
        while j < n {
            result[[i, j]] = a[i] * b[j];
            j += 1;
        }
    }
}

#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    #[cfg(feature = "no-std")]
    use alloc::{vec, vec::Vec};

    #[test]
    fn test_matrix_multiply_simd() {
        let a = Array2::from_shape_vec((3, 4), (0..12).map(|x| x as f32).collect())
            .expect("shape and data length should match");
        let b = Array2::from_shape_vec((4, 2), (0..8).map(|x| x as f32 + 1.0).collect())
            .expect("shape and data length should match");

        let result = matrix_multiply_f32_simd(&a, &b);

        // Verify against expected result
        let expected = a.dot(&b);

        for i in 0..result.nrows() {
            for j in 0..result.ncols() {
                assert_relative_eq!(result[[i, j]], expected[[i, j]], epsilon = 1e-5);
            }
        }
    }

    #[test]
    fn test_matrix_vector_multiply() {
        let matrix = Array2::from_shape_vec((3, 4), (0..12).map(|x| x as f32).collect())
            .expect("shape and data length should match");
        let vector = Array1::from_vec((0..4).map(|x| x as f32 + 1.0).collect());

        let result = matrix_vector_multiply_f32(&matrix, &vector);
        let expected = matrix.dot(&vector);

        for i in 0..result.len() {
            assert_relative_eq!(result[i], expected[i], epsilon = 1e-5);
        }
    }

    #[test]
    fn test_elementwise_add_simd() {
        let a = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
            .expect("shape and data length should match");
        let b = Array2::from_shape_vec((2, 3), vec![10.0, 20.0, 30.0, 40.0, 50.0, 60.0])
            .expect("shape and data length should match");

        let result = elementwise_add_simd(&a, &b);
        let expected = &a + &b;

        for i in 0..result.nrows() {
            for j in 0..result.ncols() {
                assert_relative_eq!(result[[i, j]], expected[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_transpose_simd() {
        let matrix = Array2::from_shape_vec((3, 4), (0..12).map(|x| x as f32).collect())
            .expect("shape and data length should match");

        let result = transpose_simd(&matrix);
        let expected = matrix.t();

        assert_eq!(result.shape(), expected.shape());
        for i in 0..result.nrows() {
            for j in 0..result.ncols() {
                assert_relative_eq!(result[[i, j]], expected[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_matrix_reductions() {
        let matrix = Array2::from_shape_vec((3, 4), (1..13).map(|x| x as f32).collect())
            .expect("shape and data length should match");

        let sum = matrix_sum_simd(&matrix);
        let expected_sum = matrix.sum();
        assert_relative_eq!(sum, expected_sum, epsilon = 1e-5);

        let mean = matrix_mean_simd(&matrix);
        let expected_mean = matrix
            .mean()
            .expect("array should have elements for mean computation");
        assert_relative_eq!(mean, expected_mean, epsilon = 1e-5);

        let variance = matrix_variance_simd(&matrix);
        assert!(variance > 0.0); // Should be positive for non-constant data
    }

    #[test]
    fn test_qr_decomposition() {
        // Use a simple well-conditioned matrix
        let matrix = Array2::from_shape_vec((3, 2), vec![1.0, 0.0, 0.0, 1.0, 1.0, 1.0])
            .expect("shape and data length should match");
        let (q, r) = qr_decomposition_simd(&matrix);

        // Check dimensions
        assert_eq!(q.shape(), &[3, 3]);
        assert_eq!(r.shape(), &[3, 2]);

        // For this simple test, just check that we get some output
        // The QR implementation is a simplified version, so we won't expect perfect accuracy
        assert!(!q.is_empty());
        assert!(!r.is_empty());
    }

    #[test]
    fn test_power_iteration() {
        // Test with a simple symmetric matrix
        let matrix =
            Array2::from_shape_vec((3, 3), vec![4.0, 1.0, 1.0, 1.0, 3.0, 2.0, 1.0, 2.0, 3.0])
                .expect("operation should succeed");

        let (eigenvalue, eigenvector) = power_iteration_simd(&matrix, 1000, 1e-6);

        // Check that eigenvalue is reasonable
        assert!(eigenvalue > 0.0);
        assert!(eigenvalue < 10.0); // Should be bounded for this matrix

        // Check eigenvector is normalized
        let norm = crate::vector::norm_l2(
            eigenvector
                .as_slice()
                .expect("slice operation should succeed"),
        );
        assert_relative_eq!(norm, 1.0, epsilon = 1e-5);
    }

    #[test]
    fn test_outer_product() {
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let b = Array1::from_vec(vec![4.0, 5.0]);

        let result = compute_outer_product_simd(&a, &b);

        assert_eq!(result.shape(), &[3, 2]);
        assert_relative_eq!(result[[0, 0]], 4.0, epsilon = 1e-6);
        assert_relative_eq!(result[[0, 1]], 5.0, epsilon = 1e-6);
        assert_relative_eq!(result[[1, 0]], 8.0, epsilon = 1e-6);
        assert_relative_eq!(result[[1, 1]], 10.0, epsilon = 1e-6);
        assert_relative_eq!(result[[2, 0]], 12.0, epsilon = 1e-6);
        assert_relative_eq!(result[[2, 1]], 15.0, epsilon = 1e-6);
    }

    #[test]
    fn test_svd_basic() {
        let matrix = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
            .expect("shape and data length should match");
        let (u, s, vt) = svd_simd(&matrix);

        // Check dimensions
        assert_eq!(u.shape(), &[3, 3]);
        assert_eq!(s.len(), 2);
        assert_eq!(vt.shape(), &[2, 2]);

        // Singular values should be positive and sorted
        for i in 0..s.len() {
            assert!(s[i] >= 0.0);
            if i > 0 {
                assert!(s[i - 1] >= s[i]);
            }
        }
    }
}