scirs2-core 0.4.2

//! Raw slice API for SIMD-accelerated matrix operations
//!
//! This module provides convenient slice-based APIs for matrix multiplication
//! and related operations, built on top of the high-performance blocked GEMM
//! implementation.
//!
//! # Features
//!
//! - **Simple API**: Work directly with slices instead of raw pointers
//! - **High Performance**: 10-50x speedup over naive implementations
//! - **Safe wrapper**: Handles pointer arithmetic and bounds checking internally
//! - **Dot products**: Optimized implementations for vector inner products
//! - **Full GEMM**: C = beta*C + alpha*A*B with all BLAS features
//!
//! # Example
//!
//! ```rust
//! use scirs2_core::simd_ops::matmul::{simd_matrix_multiply_f32, simd_dot_product_f32};
//!
//! // Matrix multiplication: C = A * B
//! let m = 256;
//! let k = 256;
//! let n = 256;
//!
//! let a = vec![1.0f32; m * k];
//! let b = vec![1.0f32; k * n];
//! let mut c = vec![0.0f32; m * n];
//!
//! simd_matrix_multiply_f32(m, k, n, 1.0, &a, &b, 0.0, &mut c);
//!
//! // Dot product
//! let x = vec![1.0, 2.0, 3.0, 4.0];
//! let y = vec![5.0, 6.0, 7.0, 8.0];
//! let result = simd_dot_product_f32(&x, &y); // 1*5 + 2*6 + 3*7 + 4*8 = 70
//! ```

#[cfg(feature = "simd")]
use crate::simd::dot::simd_dot_f32;
#[cfg(feature = "simd")]
use crate::simd::gemm::{blocked_gemm_f32, blocked_gemm_f64, should_use_blocked, MatMulConfig};

/// Compute dot product of two f32 slices using SIMD operations
///
/// # Arguments
///
/// * `a` - First input slice
/// * `b` - Second input slice
///
/// # Returns
///
/// Dot product: sum(a\[i\] * b\[i\])
///
/// # Panics
///
/// Panics if slices have different lengths
///
/// # Example
///
/// ```rust
/// use scirs2_core::simd_ops::matmul::simd_dot_product_f32;
///
/// let a = vec![1.0, 2.0, 3.0];
/// let b = vec![4.0, 5.0, 6.0];
/// let result = simd_dot_product_f32(&a, &b); // 1*4 + 2*5 + 3*6 = 32
/// assert!((result - 32.0).abs() < 1e-5);
/// ```
pub fn simd_dot_product_f32(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(
        a.len(),
        b.len(),
        "Dot product requires equal-length vectors"
    );

    #[cfg(feature = "simd")]
    {
        // Use the optimized SIMD dot product from the simd module
        use crate::ndarray::ArrayView1;

        let a_view = ArrayView1::from(a);
        let b_view = ArrayView1::from(b);

        simd_dot_f32(&a_view, &b_view)
    }

    #[cfg(not(feature = "simd"))]
    {
        // Fallback to scalar implementation
        a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
    }
}

/// Compute dot product of two f64 slices using SIMD operations
///
/// # Arguments
///
/// * `a` - First input slice
/// * `b` - Second input slice
///
/// # Returns
///
/// Dot product: sum(a\[i\] * b\[i\])
///
/// # Panics
///
/// Panics if slices have different lengths
pub fn simd_dot_product_f64(a: &[f64], b: &[f64]) -> f64 {
    assert_eq!(
        a.len(),
        b.len(),
        "Dot product requires equal-length vectors"
    );

    #[cfg(feature = "simd")]
    {
        use crate::ndarray::ArrayView1;
        use crate::simd::dot::simd_dot_f64;

        let a_view = ArrayView1::from(a);
        let b_view = ArrayView1::from(b);

        simd_dot_f64(&a_view, &b_view)
    }

    #[cfg(not(feature = "simd"))]
    {
        // Fallback to scalar implementation
        a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
    }
}

/// High-performance matrix multiplication for f32: C = beta*C + alpha*A*B
///
/// Computes C = beta*C + alpha*A*B where:
/// - A is m x k (row-major)
/// - B is k x n (row-major)
/// - C is m x n (row-major)
///
/// # Arguments
///
/// * `m` - Number of rows of A and C
/// * `k` - Number of columns of A and rows of B
/// * `n` - Number of columns of B and C
/// * `alpha` - Scalar multiplier for A*B
/// * `a` - Input matrix A (m*k elements, row-major)
/// * `b` - Input matrix B (k*n elements, row-major)
/// * `beta` - Scalar multiplier for existing C values
/// * `c` - Output matrix C (m*n elements, row-major)
///
/// # Panics
///
/// Panics if slice lengths don't match expected dimensions
///
/// # Performance
///
/// - Small matrices (< 64x64): 2-5x speedup over naive
/// - Medium matrices (64-256): 5-15x speedup
/// - Large matrices (256+): 10-30x speedup
///
/// # Example
///
/// ```rust
/// use scirs2_core::simd_ops::matmul::simd_matrix_multiply_f32;
///
/// let m = 4;
/// let k = 4;
/// let n = 4;
///
/// // A = identity matrix
/// let mut a = vec![0.0f32; m * k];
/// for i in 0..m {
///     a[i * k + i] = 1.0;
/// }
///
/// // B = all ones
/// let b = vec![1.0f32; k * n];
///
/// // C = zeros
/// let mut c = vec![0.0f32; m * n];
///
/// // C = 1.0 * A * B + 0.0 * C
/// simd_matrix_multiply_f32(m, k, n, 1.0, &a, &b, 0.0, &mut c);
///
/// // Result: each row of C should be all ones
/// for i in 0..m {
///     for j in 0..n {
///         assert!((c[i * n + j] - 1.0).abs() < 1e-5);
///     }
/// }
/// ```
pub fn simd_matrix_multiply_f32(
    m: usize,
    k: usize,
    n: usize,
    alpha: f32,
    a: &[f32],
    b: &[f32],
    beta: f32,
    c: &mut [f32],
) {
    // Validate dimensions
    assert_eq!(
        a.len(),
        m * k,
        "Matrix A must have m*k = {}*{} = {} elements, got {}",
        m,
        k,
        m * k,
        a.len()
    );
    assert_eq!(
        b.len(),
        k * n,
        "Matrix B must have k*n = {}*{} = {} elements, got {}",
        k,
        n,
        k * n,
        b.len()
    );
    assert_eq!(
        c.len(),
        m * n,
        "Matrix C must have m*n = {}*{} = {} elements, got {}",
        m,
        n,
        m * n,
        c.len()
    );

    #[cfg(feature = "simd")]
    {
        let config = MatMulConfig::for_f32();

        unsafe {
            blocked_gemm_f32(
                m,
                k,
                n,
                alpha,
                a.as_ptr(),
                k, // lda: leading dimension of A
                b.as_ptr(),
                n, // ldb: leading dimension of B
                beta,
                c.as_mut_ptr(),
                n, // ldc: leading dimension of C
                &config,
            );
        }
    }

    #[cfg(not(feature = "simd"))]
    {
        // Fallback to simple implementation
        gemm_simple_f32(m, k, n, alpha, a, k, b, n, beta, c, n);
    }
}

/// High-performance matrix multiplication for f64: C = beta*C + alpha*A*B
///
/// Same as [`simd_matrix_multiply_f32`] but for double-precision data.
///
/// # Panics
///
/// Panics if slice lengths don't match expected dimensions
pub fn simd_matrix_multiply_f64(
    m: usize,
    k: usize,
    n: usize,
    alpha: f64,
    a: &[f64],
    b: &[f64],
    beta: f64,
    c: &mut [f64],
) {
    // Validate dimensions
    assert_eq!(
        a.len(),
        m * k,
        "Matrix A must have m*k = {}*{} = {} elements, got {}",
        m,
        k,
        m * k,
        a.len()
    );
    assert_eq!(
        b.len(),
        k * n,
        "Matrix B must have k*n = {}*{} = {} elements, got {}",
        k,
        n,
        k * n,
        b.len()
    );
    assert_eq!(
        c.len(),
        m * n,
        "Matrix C must have m*n = {}*{} = {} elements, got {}",
        m,
        n,
        m * n,
        c.len()
    );

    #[cfg(feature = "simd")]
    {
        let config = MatMulConfig::for_f64();

        unsafe {
            blocked_gemm_f64(
                m,
                k,
                n,
                alpha,
                a.as_ptr(),
                k, // lda: leading dimension of A
                b.as_ptr(),
                n, // ldb: leading dimension of B
                beta,
                c.as_mut_ptr(),
                n, // ldc: leading dimension of C
                &config,
            );
        }
    }

    #[cfg(not(feature = "simd"))]
    {
        // Fallback to simple implementation
        gemm_simple_f64(m, k, n, alpha, a, k, b, n, beta, c, n);
    }
}

/// Simple fallback GEMM for f32
fn gemm_simple_f32(
    m: usize,
    k: usize,
    n: usize,
    alpha: f32,
    a: &[f32],
    _lda: usize,
    b: &[f32],
    _ldb: usize,
    beta: f32,
    c: &mut [f32],
    _ldc: usize,
) {
    // Scale C by beta
    if beta == 0.0 {
        c.fill(0.0);
    } else if beta != 1.0 {
        for val in c.iter_mut() {
            *val *= beta;
        }
    }

    // C += alpha * A * B
    for i in 0..m {
        for p in 0..k {
            let a_val = alpha * a[i * k + p];
            for j in 0..n {
                c[i * n + j] += a_val * b[p * n + j];
            }
        }
    }
}

/// Simple fallback GEMM for f64 (to be replaced with blocked version)
fn gemm_simple_f64(
    m: usize,
    k: usize,
    n: usize,
    alpha: f64,
    a: &[f64],
    _lda: usize,
    b: &[f64],
    _ldb: usize,
    beta: f64,
    c: &mut [f64],
    _ldc: usize,
) {
    // Scale C by beta
    if beta == 0.0 {
        c.fill(0.0);
    } else if beta != 1.0 {
        for val in c.iter_mut() {
            *val *= beta;
        }
    }

    // C += alpha * A * B
    for i in 0..m {
        for p in 0..k {
            let a_val = alpha * a[i * k + p];
            for j in 0..n {
                c[i * n + j] += a_val * b[p * n + j];
            }
        }
    }
}

/// Check if matrices are large enough to benefit from blocked GEMM
///
/// Returns true if all dimensions are >= 64, indicating that the blocked
/// algorithm will provide significant performance benefits.
///
/// # Example
///
/// ```rust
/// use scirs2_core::simd_ops::matmul::should_use_simd_matmul;
///
/// assert!(!should_use_simd_matmul(32, 32, 32));  // Too small
/// assert!(should_use_simd_matmul(256, 256, 256)); // Large enough
/// ```
pub fn should_use_simd_matmul(m: usize, n: usize, k: usize) -> bool {
    #[cfg(feature = "simd")]
    {
        should_use_blocked(m, n, k)
    }

    #[cfg(not(feature = "simd"))]
    {
        // Without SIMD, large matrices don't benefit as much
        m >= 128 && n >= 128 && k >= 128
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dot_product_f32() {
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];

        let result = simd_dot_product_f32(&a, &b);

        // 1*5 + 2*6 + 3*7 + 4*8 = 5 + 12 + 21 + 32 = 70
        assert!((result - 70.0).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_f64() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];

        let result = simd_dot_product_f64(&a, &b);

        // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
        assert!((result - 32.0).abs() < 1e-10);
    }

    #[test]
    #[should_panic(expected = "equal-length")]
    fn test_dot_product_length_mismatch() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0];

        simd_dot_product_f32(&a, &b);
    }

    #[test]
    fn test_matrix_multiply_identity() {
        let n = 4;

        // A = identity matrix
        let mut a = vec![0.0f32; n * n];
        for i in 0..n {
            a[i * n + i] = 1.0;
        }

        // B = sequential values
        let b: Vec<f32> = (0..n * n).map(|i| i as f32).collect();

        let mut c = vec![0.0f32; n * n];

        simd_matrix_multiply_f32(n, n, n, 1.0, &a, &b, 0.0, &mut c);

        // C should equal B
        for i in 0..n * n {
            assert!(
                (c[i] - b[i]).abs() < 1e-5,
                "Mismatch at {}: expected {}, got {}",
                i,
                b[i],
                c[i]
            );
        }
    }

    #[test]
    fn test_matrix_multiply_alpha_beta() {
        let m = 2;
        let k = 2;
        let n = 2;

        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];
        let mut c = vec![1.0, 1.0, 1.0, 1.0];

        // C = 2*C + 3*A*B
        simd_matrix_multiply_f32(m, k, n, 3.0, &a, &b, 2.0, &mut c);

        // A*B = [[19, 22], [43, 50]]
        // 3*A*B = [[57, 66], [129, 150]]
        // 2*C + 3*A*B = [[2,2],[2,2]] + [[57,66],[129,150]] = [[59,68],[131,152]]

        let expected = [59.0, 68.0, 131.0, 152.0];
        for i in 0..4 {
            assert!(
                (c[i] - expected[i]).abs() < 1e-4,
                "Mismatch at {}: expected {}, got {}",
                i,
                expected[i],
                c[i]
            );
        }
    }

    #[test]
    fn test_matrix_multiply_large() {
        // Test with larger matrix to trigger blocked path
        let n = 128;

        let a = vec![1.0f32; n * n];
        let b = vec![2.0f32; n * n];
        let mut c = vec![0.0f32; n * n];

        simd_matrix_multiply_f32(n, n, n, 1.0, &a, &b, 0.0, &mut c);

        // Each element should be 2*n
        let expected = 2.0 * n as f32;
        for val in &c {
            assert!(
                (*val - expected).abs() < 1e-2,
                "Expected {}, got {}",
                expected,
                val
            );
        }
    }

    #[test]
    fn test_rectangular_multiply() {
        // C[3x4] = A[3x2] * B[2x4]
        let m = 3;
        let k = 2;
        let n = 4;

        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let b = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mut c = vec![0.0f32; m * n];

        simd_matrix_multiply_f32(m, k, n, 1.0, &a, &b, 0.0, &mut c);

        // Verify first element: C[0,0] = A[0,0]*B[0,0] + A[0,1]*B[1,0] = 1*1 + 2*5 = 11
        assert!(
            (c[0] - 11.0).abs() < 1e-5,
            "C[0,0] expected 11.0, got {}",
            c[0]
        );

        // Verify another: C[0,1] = A[0,0]*B[0,1] + A[0,1]*B[1,1] = 1*2 + 2*6 = 14
        assert!(
            (c[1] - 14.0).abs() < 1e-5,
            "C[0,1] expected 14.0, got {}",
            c[1]
        );
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_should_use_simd_matmul() {
        assert!(!should_use_simd_matmul(32, 32, 32));
        assert!(should_use_simd_matmul(64, 64, 64));
        assert!(should_use_simd_matmul(256, 256, 256));
    }

    // ==================== f64 Tests ====================

    #[test]
    fn test_matrix_multiply_identity_f64() {
        let n = 4;

        // A = identity matrix
        let mut a = vec![0.0f64; n * n];
        for i in 0..n {
            a[i * n + i] = 1.0;
        }

        // B = sequential values
        let b: Vec<f64> = (0..n * n).map(|i| i as f64).collect();

        let mut c = vec![0.0f64; n * n];

        simd_matrix_multiply_f64(n, n, n, 1.0, &a, &b, 0.0, &mut c);

        // C should equal B
        for i in 0..n * n {
            assert!(
                (c[i] - b[i]).abs() < 1e-10,
                "Mismatch at {}: expected {}, got {}",
                i,
                b[i],
                c[i]
            );
        }
    }

    #[test]
    fn test_matrix_multiply_alpha_beta_f64() {
        let m = 2;
        let k = 2;
        let n = 2;

        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];
        let mut c = vec![1.0, 1.0, 1.0, 1.0];

        // C = 2*C + 3*A*B
        simd_matrix_multiply_f64(m, k, n, 3.0, &a, &b, 2.0, &mut c);

        // A*B = [[19, 22], [43, 50]]
        // 3*A*B = [[57, 66], [129, 150]]
        // 2*C + 3*A*B = [[2,2],[2,2]] + [[57,66],[129,150]] = [[59,68],[131,152]]

        let expected = [59.0, 68.0, 131.0, 152.0];
        for i in 0..4 {
            assert!(
                (c[i] - expected[i]).abs() < 1e-10,
                "Mismatch at {}: expected {}, got {}",
                i,
                expected[i],
                c[i]
            );
        }
    }

    #[test]
    fn test_matrix_multiply_large_f64() {
        // Test with larger matrix to trigger blocked path
        let n = 128;

        let a = vec![1.0f64; n * n];
        let b = vec![2.0f64; n * n];
        let mut c = vec![0.0f64; n * n];

        simd_matrix_multiply_f64(n, n, n, 1.0, &a, &b, 0.0, &mut c);

        // Each element should be 2*n
        let expected = 2.0 * n as f64;
        for val in &c {
            assert!(
                (*val - expected).abs() < 1e-8,
                "Expected {}, got {}",
                expected,
                val
            );
        }
    }

    #[test]
    fn test_rectangular_multiply_f64() {
        // C[3x4] = A[3x2] * B[2x4]
        let m = 3;
        let k = 2;
        let n = 4;

        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let b = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mut c = vec![0.0f64; m * n];

        simd_matrix_multiply_f64(m, k, n, 1.0, &a, &b, 0.0, &mut c);

        // Verify first element: C[0,0] = A[0,0]*B[0,0] + A[0,1]*B[1,0] = 1*1 + 2*5 = 11
        assert!(
            (c[0] - 11.0).abs() < 1e-10,
            "C[0,0] expected 11.0, got {}",
            c[0]
        );

        // Verify another: C[0,1] = A[0,0]*B[0,1] + A[0,1]*B[1,1] = 1*2 + 2*6 = 14
        assert!(
            (c[1] - 14.0).abs() < 1e-10,
            "C[0,1] expected 14.0, got {}",
            c[1]
        );
    }

    #[test]
    fn test_dot_product_large() {
        // Test larger vectors
        let n = 1024;
        let a: Vec<f32> = (1..=n).map(|i| i as f32).collect();
        let b: Vec<f32> = (1..=n).map(|i| i as f32).collect();

        let result = simd_dot_product_f32(&a, &b);

        // sum of i^2 for i=1 to n = n*(n+1)*(2n+1)/6
        let n64 = n as f64;
        let expected = (n64 * (n64 + 1.0) * (2.0 * n64 + 1.0) / 6.0) as f32;
        assert!(
            (result - expected).abs() / expected < 1e-5,
            "Large dot product: expected {}, got {}",
            expected,
            result
        );
    }

    #[test]
    fn test_dot_product_large_f64() {
        // Test larger vectors
        let n = 1024;
        let a: Vec<f64> = (1..=n).map(|i| i as f64).collect();
        let b: Vec<f64> = (1..=n).map(|i| i as f64).collect();

        let result = simd_dot_product_f64(&a, &b);

        // sum of i^2 for i=1 to n = n*(n+1)*(2n+1)/6
        let n64 = n as f64;
        let expected = n64 * (n64 + 1.0) * (2.0 * n64 + 1.0) / 6.0;
        assert!(
            (result - expected).abs() / expected < 1e-10,
            "Large dot product: expected {}, got {}",
            expected,
            result
        );
    }
}