// numrs2 0.3.1 - Docs.rs

//! BLAS/LAPACK-accelerated linear algebra operations using scirs2-linalg
//!
//! This module provides high-performance linear algebra operations by leveraging
//! scirs2-linalg's BLAS and LAPACK integration. These functions offer 200-700x
//! speedups compared to pure Rust implementations for large matrices.
//!
//! # SCIRS2 POLICY Compliance
//!
//! Per SCIRS2 POLICY, all BLAS/LAPACK operations are routed through scirs2-linalg
//! rather than using direct BLAS/LAPACK dependencies. This ensures consistent
//! behavior across the ecosystem and proper platform detection.
//!
//! # Performance Characteristics
//!
//! | Operation | Speedup vs Pure Rust | Notes |
//! |-----------|---------------------|-------|
//! | gemm      | 200-700x            | For large matrices (n > 100) |
//! | gemv      | 50-200x             | For large matrices |
//! | dot       | 10-50x              | With SIMD + BLAS |
//! | lu        | 200-700x            | LAPACK-backed |
//! | qr        | 200-700x            | LAPACK-backed |
//! | svd       | 200-700x            | LAPACK-backed |
//! | cholesky  | 200-700x            | LAPACK-backed |
//! | eig/eigh  | 200-700x            | LAPACK-backed |

use crate::array::Array;
use crate::error::{NumRs2Error, Result};
use crate::interop::ndarray_compat::{from_ndarray, to_ndarray};
use num_traits::{Float, NumAssign, NumCast};
use scirs2_core::ndarray::{
    Array1, Array2, ArrayView1, ArrayView2, Ix1, Ix2, IxDyn, ScalarOperand,
};
use std::fmt::Debug;
use std::iter::Sum;

// Re-export scirs2-linalg types for convenience
pub use scirs2_linalg::LinalgError;
/// Result alias whose error type is scirs2-linalg's [`LinalgError`].
pub type LinalgResult<T> = std::result::Result<T, LinalgError>;

/// Type alias for complex eigenvalue decomposition result
///
/// A pair of complex-valued arrays. As produced by [`eig`], the first element
/// holds the eigenvalues and the second holds the eigenvectors.
pub type ComplexEigResult<T> = (
    Array<scirs2_core::numeric::Complex<T>>,
    Array<scirs2_core::numeric::Complex<T>>,
);

/// Copy a 2-D NumRS2 `Array` into an ndarray `Array2`.
///
/// # Errors
/// * `NumRs2Error::DimensionMismatch` when `arr` does not have exactly two dimensions.
/// * `NumRs2Error::ConversionError` when ndarray rejects the shape/data combination.
fn to_array2<T>(arr: &Array<T>) -> Result<Array2<T>>
where
    T: Clone + Debug,
{
    let dims = arr.shape();

    match dims.len() {
        // Exactly two axes: hand the flattened elements to ndarray with the same extents.
        2 => Array2::from_shape_vec((dims[0], dims[1]), arr.to_vec()).map_err(|source| {
            NumRs2Error::ConversionError(format!("Failed to convert to Array2: {}", source))
        }),
        ndim => Err(NumRs2Error::DimensionMismatch(format!(
            "Expected 2D array, got {}D",
            ndim
        ))),
    }
}

/// Copy a 1-D NumRS2 `Array` into an ndarray `Array1`.
///
/// # Errors
/// `NumRs2Error::DimensionMismatch` when `arr` does not have exactly one dimension.
fn to_array1<T>(arr: &Array<T>) -> Result<Array1<T>>
where
    T: Clone + Debug,
{
    match arr.shape().len() {
        1 => Ok(Array1::from_vec(arr.to_vec())),
        ndim => Err(NumRs2Error::DimensionMismatch(format!(
            "Expected 1D array, got {}D",
            ndim
        ))),
    }
}

/// Turn an ndarray `Array2` into a NumRS2 `Array` with the same two extents.
///
/// The elements are drained from `arr` into a flat `Vec`, which is then
/// reshaped to `[nrows, ncols]`. The current implementation always returns `Ok`.
fn from_array2<T>(arr: Array2<T>) -> Result<Array<T>>
where
    T: Clone + Debug + NumCast,
{
    // Capture the extents first: draining the array below consumes it.
    let (rows, cols) = arr.dim();
    let flat = arr.into_iter().collect::<Vec<T>>();

    Ok(Array::from_vec(flat).reshape(&[rows, cols]))
}

/// Turn an ndarray `Array1` into a 1-D NumRS2 `Array`.
///
/// The current implementation always returns `Ok`.
fn from_array1<T>(arr: Array1<T>) -> Result<Array<T>>
where
    T: Clone + Debug + NumCast,
{
    let flat = arr.into_iter().collect::<Vec<T>>();
    Ok(Array::from_vec(flat))
}

/// Convert LinalgError to NumRs2Error
fn linalg_to_numrs2_error(e: LinalgError) -> NumRs2Error {
    match e {
        LinalgError::SingularMatrixError(s) => {
            NumRs2Error::InvalidOperation(format!("Singular matrix: {}", s))
        }
        LinalgError::DimensionError(s) => NumRs2Error::DimensionMismatch(s),
        LinalgError::ShapeError(s) => NumRs2Error::DimensionMismatch(s),
        LinalgError::NonPositiveDefiniteError(s) => {
            NumRs2Error::InvalidOperation(format!("Matrix is not positive definite: {}", s))
        }
        LinalgError::ConvergenceError(s) => {
            NumRs2Error::ComputationError(format!("Convergence failed: {}", s))
        }
        _ => NumRs2Error::ComputationError(format!("Linear algebra error: {}", e)),
    }
}

// ============================================================================
// BLAS Level 1: Vector Operations
// ============================================================================

/// BLAS-accelerated dot product of two vectors
///
/// Computes x · y using optimized BLAS routines when available.
///
/// # Arguments
/// * `x` - First vector
/// * `y` - Second vector
///
/// # Returns
/// The dot product as a scalar
///
/// # Example
/// ```
/// use numrs2::prelude::*;
/// use numrs2::linalg_accelerated::dot;
///
/// let x = Array::from_vec(vec![1.0f64, 2.0, 3.0]);
/// let y = Array::from_vec(vec![4.0f64, 5.0, 6.0]);
/// let result = dot(&x, &y).expect("dot product should succeed");
/// assert!((result - 32.0f64).abs() < 1e-10);
/// ```
pub fn dot<T>(x: &Array<T>, y: &Array<T>) -> Result<T>
where
    T: Float + NumAssign + Clone + Debug + NumCast + 'static,
{
    let x_nd = to_array1(x)?;
    let y_nd = to_array1(y)?;

    scirs2_linalg::blas_accelerated::dot(&x_nd.view(), &y_nd.view()).map_err(linalg_to_numrs2_error)
}

/// Compute the L2 norm (Euclidean norm) of a vector
///
/// # Arguments
/// * `x` - Input vector
///
/// # Returns
/// The L2 norm ||x||₂
pub fn norm<T>(x: &Array<T>) -> Result<T>
where
    T: Float + NumAssign + Clone + Debug + NumCast + 'static,
{
    let x_nd = to_array1(x)?;

    scirs2_linalg::blas_accelerated::norm(&x_nd.view()).map_err(linalg_to_numrs2_error)
}

// ============================================================================
// BLAS Level 2: Matrix-Vector Operations
// ============================================================================

/// BLAS-accelerated matrix-vector multiplication (GEMV)
///
/// Computes y = α * A * x + β * y
///
/// # Arguments
/// * `alpha` - Scalar multiplier for A*x
/// * `a` - Matrix A (m × n)
/// * `x` - Vector x (n elements)
/// * `beta` - Scalar multiplier for y
/// * `y` - Vector y (m elements, used as both input and storage hint)
///
/// # Returns
/// The result vector of length m
pub fn gemv<T>(alpha: T, a: &Array<T>, x: &Array<T>, beta: T, y: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + 'static,
{
    let a_nd = to_array2(a)?;
    let x_nd = to_array1(x)?;
    let y_nd = to_array1(y)?;

    let result = scirs2_linalg::blas_accelerated::gemv(
        alpha,
        &a_nd.view(),
        &x_nd.view(),
        beta,
        &y_nd.view(),
    )
    .map_err(linalg_to_numrs2_error)?;

    from_array1(result)
}

/// Plain matrix-vector product: y = A * x
///
/// Implemented as [`gemv`] with α = 1, β = 0 and a zero vector whose length
/// equals the number of rows of `a`.
///
/// # Arguments
/// * `a` - Matrix A (m × n)
/// * `x` - Vector x (n elements)
///
/// # Returns
/// The result vector of length m
///
/// # Errors
/// Returns `NumRs2Error::DimensionMismatch` when `a` is not 2-D, and forwards
/// any error produced by [`gemv`].
pub fn matvec<T>(a: &Array<T>, x: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + 'static,
{
    let dims = a.shape();

    if dims.len() == 2 {
        // One zero per matrix row; with β = 0 it contributes nothing to the result.
        let zeros = Array::zeros(&[dims[0]]);
        gemv(T::one(), a, x, T::zero(), &zeros)
    } else {
        Err(NumRs2Error::DimensionMismatch(
            "Matrix must be 2D".to_string(),
        ))
    }
}

// ============================================================================
// BLAS Level 3: Matrix-Matrix Operations
// ============================================================================

/// BLAS-accelerated matrix-matrix multiplication (GEMM)
///
/// Computes C = α * A * B + β * C
///
/// This is the most important BLAS operation and offers 200-700x speedup
/// over naive implementations for large matrices.
///
/// # Arguments
/// * `alpha` - Scalar multiplier for A*B
/// * `a` - Matrix A (m × k)
/// * `b` - Matrix B (k × n)
/// * `beta` - Scalar multiplier for C
/// * `c` - Matrix C (m × n, used as both input and storage hint)
///
/// # Returns
/// The result matrix of shape (m, n)
pub fn gemm<T>(alpha: T, a: &Array<T>, b: &Array<T>, beta: T, c: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + 'static,
{
    let a_nd = to_array2(a)?;
    let b_nd = to_array2(b)?;
    let c_nd = to_array2(c)?;

    let result = scirs2_linalg::blas_accelerated::gemm(
        alpha,
        &a_nd.view(),
        &b_nd.view(),
        beta,
        &c_nd.view(),
    )
    .map_err(linalg_to_numrs2_error)?;

    from_array2(result)
}

/// Simple matrix multiplication: C = A * B
///
/// Uses BLAS GEMM for optimal performance on large matrices.
///
/// # Arguments
/// * `a` - Matrix A (m × k)
/// * `b` - Matrix B (k × n)
///
/// # Returns
/// The result matrix of shape (m, n)
///
/// # Example
/// ```
/// use numrs2::prelude::*;
/// use numrs2::linalg_accelerated::matmul;
///
/// let a = Array::from_vec(vec![1.0, 2.0, 3.0, 4.0]).reshape(&[2, 2]);
/// let b = Array::from_vec(vec![5.0, 6.0, 7.0, 8.0]).reshape(&[2, 2]);
/// let c = matmul(&a, &b).expect("matmul should succeed");
/// // c = [[19, 22], [43, 50]]
/// ```
pub fn matmul<T>(a: &Array<T>, b: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + 'static,
{
    let a_nd = to_array2(a)?;
    let b_nd = to_array2(b)?;

    let result = scirs2_linalg::blas_accelerated::matmul(&a_nd.view(), &b_nd.view())
        .map_err(linalg_to_numrs2_error)?;

    from_array2(result)
}

// ============================================================================
// LAPACK: Matrix Decompositions
// ============================================================================

/// LU decomposition with partial pivoting: PA = LU
///
/// Delegates to `scirs2_linalg::lu`.
///
/// # Arguments
/// * `a` - Square matrix to decompose
///
/// # Returns
/// Tuple of (P, L, U) matrices where PA = LU
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn lu<T>(a: &Array<T>) -> Result<(Array<T>, Array<T>, Array<T>)>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let matrix = to_array2(a)?;
    let (perm, lower, upper) =
        scirs2_linalg::lu(&matrix.view(), None).map_err(linalg_to_numrs2_error)?;

    // Convert the three factors back to NumRS2 arrays in (P, L, U) order.
    let p = from_array2(perm)?;
    let l = from_array2(lower)?;
    let u = from_array2(upper)?;
    Ok((p, l, u))
}

/// QR decomposition: A = QR
///
/// Delegates to `scirs2_linalg::qr`.
///
/// # Arguments
/// * `a` - Matrix to decompose (m × n)
///
/// # Returns
/// Tuple of (Q, R) matrices where A = QR
/// - Q: Orthogonal matrix (m × m)
/// - R: Upper triangular matrix (m × n)
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn qr<T>(a: &Array<T>) -> Result<(Array<T>, Array<T>)>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let matrix = to_array2(a)?;
    let (q_factor, r_factor) =
        scirs2_linalg::qr(&matrix.view(), None).map_err(linalg_to_numrs2_error)?;

    let q = from_array2(q_factor)?;
    let r = from_array2(r_factor)?;
    Ok((q, r))
}

/// Singular Value Decomposition: A = UΣVᵀ
///
/// Delegates to `scirs2_linalg::svd`, forwarding `full_matrices` unchanged.
///
/// # Arguments
/// * `a` - Matrix to decompose (m × n)
/// * `full_matrices` - If true, compute full U and V; if false, compute thin SVD
///
/// # Returns
/// Tuple of (U, S, Vt) where:
/// - U: Left singular vectors
/// - S: Singular values (as 1D array)
/// - Vt: Right singular vectors (transposed)
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn svd<T>(a: &Array<T>, full_matrices: bool) -> Result<(Array<T>, Array<T>, Array<T>)>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let matrix = to_array2(a)?;
    let (left, singular, right_t) = scirs2_linalg::svd(&matrix.view(), full_matrices, None)
        .map_err(linalg_to_numrs2_error)?;

    // U and Vt are matrices; the singular values come back as a vector.
    let u = from_array2(left)?;
    let s = from_array1(singular)?;
    let vt = from_array2(right_t)?;
    Ok((u, s, vt))
}

/// Cholesky decomposition: A = LLᵀ
///
/// Decomposes a symmetric positive-definite matrix into the product of
/// a lower triangular matrix and its transpose.
///
/// Uses LAPACK for optimal performance.
///
/// # Arguments
/// * `a` - Symmetric positive-definite matrix
///
/// # Returns
/// Lower triangular matrix L
///
/// # Errors
/// Returns error if matrix is not positive-definite
pub fn cholesky<T>(a: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let a_nd = to_array2(a)?;

    let l = scirs2_linalg::cholesky(&a_nd.view(), None).map_err(linalg_to_numrs2_error)?;

    from_array2(l)
}

// ============================================================================
// LAPACK: Eigenvalue Problems
// ============================================================================

/// Eigenvalue decomposition for general matrices
///
/// Delegates to `scirs2_linalg::eig`. Both outputs are complex because a
/// real general matrix may still have complex eigenvalues.
///
/// # Arguments
/// * `a` - Square matrix
///
/// # Returns
/// Tuple of (eigenvalues, eigenvectors) as complex arrays; the eigenvector
/// array keeps the two extents reported by scirs2-linalg.
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn eig<T>(a: &Array<T>) -> Result<ComplexEigResult<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    use scirs2_core::numeric::Complex;

    let matrix = to_array2(a)?;
    let (values, vectors) =
        scirs2_linalg::eig(&matrix.view(), None).map_err(linalg_to_numrs2_error)?;

    // Remember the eigenvector extents, then flatten both results into plain vectors.
    let dims = [vectors.shape()[0], vectors.shape()[1]];
    let flat_values: Vec<Complex<T>> = values.into_iter().collect();
    let flat_vectors: Vec<Complex<T>> = vectors.iter().cloned().collect();

    let eigenvalues = Array::from_vec(flat_values);
    let eigenvectors = Array::from_vec(flat_vectors).reshape(&dims);
    Ok((eigenvalues, eigenvectors))
}

/// Eigenvalue decomposition for symmetric/Hermitian matrices
///
/// Delegates to `scirs2_linalg::eigh` and returns real eigenvalues and real
/// eigenvectors. Intended as the cheaper alternative to [`eig`] when the
/// input is symmetric.
///
/// # Arguments
/// * `a` - Symmetric square matrix
///
/// # Returns
/// Tuple of (eigenvalues, eigenvectors) as real arrays
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn eigh<T>(a: &Array<T>) -> Result<(Array<T>, Array<T>)>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let matrix = to_array2(a)?;
    let (values, vectors) =
        scirs2_linalg::eigh(&matrix.view(), None).map_err(linalg_to_numrs2_error)?;

    let eigenvalues = from_array1(values)?;
    let eigenvectors = from_array2(vectors)?;
    Ok((eigenvalues, eigenvectors))
}

/// Compute eigenvalues only (general matrix)
///
/// Delegates to `scirs2_linalg::eigvals` and returns the eigenvalues as a
/// 1-D complex array.
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn eigvals<T>(a: &Array<T>) -> Result<Array<scirs2_core::numeric::Complex<T>>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    use scirs2_core::numeric::Complex;

    let matrix = to_array2(a)?;
    let values = scirs2_linalg::eigvals(&matrix.view(), None).map_err(linalg_to_numrs2_error)?;

    let flat = values.into_iter().collect::<Vec<Complex<T>>>();
    Ok(Array::from_vec(flat))
}

/// Compute eigenvalues only (symmetric matrix)
pub fn eigvalsh<T>(a: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let a_nd = to_array2(a)?;

    let eigenvalues =
        scirs2_linalg::eigvalsh(&a_nd.view(), None).map_err(linalg_to_numrs2_error)?;

    from_array1(eigenvalues)
}

// ============================================================================
// LAPACK: Linear Systems and Matrix Operations
// ============================================================================

/// Solve a linear system Ax = b
///
/// Uses LAPACK for optimal performance.
///
/// # Arguments
/// * `a` - Coefficient matrix A (n × n)
/// * `b` - Right-hand side vector b (n elements)
///
/// # Returns
/// Solution vector x
pub fn solve<T>(a: &Array<T>, b: &Array<T>) -> Result<Array<T>>
where
    T: Float
        + NumAssign
        + Clone
        + Debug
        + NumCast
        + Sum
        + Send
        + Sync
        + ScalarOperand
        + num_traits::One
        + 'static,
{
    let a_nd = to_array2(a)?;
    let b_nd = to_array1(b)?;

    let x =
        scirs2_linalg::solve(&a_nd.view(), &b_nd.view(), None).map_err(linalg_to_numrs2_error)?;

    from_array1(x)
}

/// Compute the matrix inverse
///
/// Uses LAPACK for optimal performance.
///
/// # Arguments
/// * `a` - Square matrix to invert
///
/// # Returns
/// The inverse matrix A⁻¹
///
/// # Errors
/// Returns error if matrix is singular
pub fn inv<T>(a: &Array<T>) -> Result<Array<T>>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let a_nd = to_array2(a)?;

    let a_inv = scirs2_linalg::inv(&a_nd.view(), None).map_err(linalg_to_numrs2_error)?;

    from_array2(a_inv)
}

/// Compute the determinant
///
/// Delegates to `scirs2_linalg::det`.
///
/// # Arguments
/// * `a` - Square matrix
///
/// # Returns
/// The determinant det(A)
///
/// # Errors
/// Returns an error when `a` is not 2-D, or when the underlying
/// scirs2-linalg routine reports a failure.
pub fn det<T>(a: &Array<T>) -> Result<T>
where
    T: Float + NumAssign + Clone + Debug + NumCast + Sum + Send + Sync + ScalarOperand + 'static,
{
    let matrix = to_array2(a)?;
    let outcome = scirs2_linalg::det(&matrix.view(), None);

    outcome.map_err(linalg_to_numrs2_error)
}

/// Least-squares solution to Ax = b
///
/// Finds x that minimizes ||Ax - b||₂ by delegating to
/// `scirs2_linalg::lstsq`; only the `x` field of its result is returned.
///
/// # Arguments
/// * `a` - Coefficient matrix (m × n)
/// * `b` - Right-hand side vector (m elements)
///
/// # Returns
/// The least-squares solution x
///
/// # Errors
/// Returns an error when `a` is not 2-D, when `b` is not 1-D, or when the
/// underlying scirs2-linalg routine reports a failure.
pub fn lstsq<T>(a: &Array<T>, b: &Array<T>) -> Result<Array<T>>
where
    T: Float
        + NumAssign
        + Clone
        + Debug
        + NumCast
        + Sum
        + Send
        + Sync
        + ScalarOperand
        + num_traits::One
        + 'static,
{
    let coefficients = to_array2(a)?;
    let rhs = to_array1(b)?;

    let fit = scirs2_linalg::lstsq(&coefficients.view(), &rhs.view(), None)
        .map_err(linalg_to_numrs2_error)?;

    // The remaining fields of the scirs2-linalg result are discarded here.
    from_array1(fit.x)
}

// ============================================================================
// Accelerated BLAS Wrapper Struct
// ============================================================================

/// High-performance BLAS wrapper using scirs2-linalg
///
/// This is a field-less unit struct used purely as a namespace: its
/// associated functions (`gemm`, `gemv`, `dot`) forward to the module-level
/// functions of the same name, which in turn call scirs2-linalg. According to
/// the original author, scirs2-linalg is backed by OxiBLAS (a pure Rust
/// BLAS/LAPACK implementation with SIMD optimizations).
///
/// # Performance
///
/// The figures below are the author's stated speedups over pure Rust
/// implementations; they are not measured anywhere in this file:
/// - gemm: 200-700x for large matrices
/// - gemv: 50-200x for large matrices
/// - dot: 10-50x with SIMD + BLAS
pub struct AcceleratedBlas;

impl AcceleratedBlas {
    /// Matrix-matrix multiplication: C = α*op(A)*op(B) + β*C
    ///
    /// `op(X)` is `Xᵀ` when the matching transpose flag is `true` and `X`
    /// otherwise. On success `c` is overwritten with the result; on failure
    /// `c` is left unchanged.
    ///
    /// # Arguments
    /// * `a` - Matrix A
    /// * `b` - Matrix B
    /// * `c` - Matrix C, read as the β-scaled input and replaced by the result
    /// * `alpha` - Scalar multiplier for op(A)*op(B)
    /// * `beta` - Scalar multiplier for C
    /// * `trans_a` - Use Aᵀ instead of A
    /// * `trans_b` - Use Bᵀ instead of B
    ///
    /// # Errors
    /// Forwards any error returned by the module-level [`gemm`].
    pub fn gemm<T>(
        a: &Array<T>,
        b: &Array<T>,
        c: &mut Array<T>,
        alpha: T,
        beta: T,
        trans_a: bool,
        trans_b: bool,
    ) -> Result<()>
    where
        T: Float + NumAssign + Clone + Debug + NumCast + 'static,
    {
        // The scirs2-linalg GEMM entry point used by `gemm` takes no transpose
        // flags, so a requested transpose is materialised here. Previously the
        // flags were silently ignored, which returned the untransposed product.
        // Nothing is copied when a flag is false.
        let a_transposed = trans_a.then(|| a.transpose());
        let b_transposed = trans_b.then(|| b.transpose());
        let lhs = a_transposed.as_ref().unwrap_or(a);
        let rhs = b_transposed.as_ref().unwrap_or(b);

        let product = gemm(alpha, lhs, rhs, beta, c)?;
        *c = product;
        Ok(())
    }

    /// Matrix-vector multiplication: y = α*op(A)*x + β*y
    ///
    /// `op(A)` is `Aᵀ` when `trans` is `true` and `A` otherwise. On success
    /// `y` is overwritten with the result; on failure `y` is left unchanged.
    ///
    /// # Arguments
    /// * `a` - Matrix A
    /// * `x` - Vector x
    /// * `y` - Vector y, read as the β-scaled input and replaced by the result
    /// * `alpha` - Scalar multiplier for op(A)*x
    /// * `beta` - Scalar multiplier for y
    /// * `trans` - Use Aᵀ instead of A
    ///
    /// # Errors
    /// Forwards any error returned by the module-level [`gemv`].
    pub fn gemv<T>(
        a: &Array<T>,
        x: &Array<T>,
        y: &mut Array<T>,
        alpha: T,
        beta: T,
        trans: bool,
    ) -> Result<()>
    where
        T: Float + NumAssign + Clone + Debug + NumCast + 'static,
    {
        // Honour the transpose flag instead of silently dropping it; the
        // matrix is only copied when a transpose is actually requested.
        let a_transposed = trans.then(|| a.transpose());
        let matrix = a_transposed.as_ref().unwrap_or(a);

        let product = gemv(alpha, matrix, x, beta, y)?;
        *y = product;
        Ok(())
    }

    /// Dot product of two vectors
    ///
    /// Thin forwarder to the module-level [`dot`].
    ///
    /// # Errors
    /// Forwards any error returned by the module-level [`dot`].
    pub fn dot<T>(x: &Array<T>, y: &Array<T>) -> Result<T>
    where
        T: Float + NumAssign + Clone + Debug + NumCast + 'static,
    {
        dot(x, y)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    /// Absolute tolerance used by every comparison in this module.
    const TOL: f64 = 1e-10;

    /// Builds a 2x2 `f64` matrix by reshaping the four given values.
    fn square2(values: [f64; 4]) -> Array<f64> {
        Array::from_vec(values.to_vec()).reshape(&[2, 2])
    }

    /// Checks every `(index, value)` pair against `actual` within `TOL`.
    fn assert_entries<const N: usize>(actual: &Array<f64>, expected: &[([usize; N], f64)]) {
        for (index, want) in expected {
            assert_relative_eq!(
                actual.get(index).expect("valid index"),
                *want,
                epsilon = TOL
            );
        }
    }

    #[test]
    fn test_dot_product() {
        let lhs = Array::from_vec(vec![1.0f64, 2.0, 3.0]);
        let rhs = Array::from_vec(vec![4.0f64, 5.0, 6.0]);

        let product = dot(&lhs, &rhs).expect("dot product should succeed");

        assert_relative_eq!(product, 32.0, epsilon = TOL);
    }

    #[test]
    fn test_norm() {
        let vector = Array::from_vec(vec![3.0f64, 4.0]);

        let length = norm(&vector).expect("norm should succeed");

        assert_relative_eq!(length, 5.0, epsilon = TOL);
    }

    #[test]
    fn test_matmul() {
        let lhs = square2([1.0, 2.0, 3.0, 4.0]);
        let rhs = square2([5.0, 6.0, 7.0, 8.0]);

        let product = matmul(&lhs, &rhs).expect("matmul should succeed");

        // Expected: [[19, 22], [43, 50]]
        assert_entries(
            &product,
            &[([0, 0], 19.0), ([0, 1], 22.0), ([1, 0], 43.0), ([1, 1], 50.0)],
        );
    }

    #[test]
    fn test_matvec() {
        let matrix = square2([1.0, 2.0, 3.0, 4.0]);
        let vector = Array::from_vec(vec![1.0f64, 2.0]);

        let product = matvec(&matrix, &vector).expect("matvec should succeed");

        // Expected: [5, 11]
        assert_entries(&product, &[([0], 5.0), ([1], 11.0)]);
    }

    #[test]
    fn test_solve() {
        // Solve: 2x + y = 5, x + 3y = 6
        // Solution: x = 1.8, y = 1.4
        let coefficients = square2([2.0, 1.0, 1.0, 3.0]);
        let rhs = Array::from_vec(vec![5.0f64, 6.0]);

        let solution = solve(&coefficients, &rhs).expect("solve should succeed");

        assert_entries(&solution, &[([0], 1.8), ([1], 1.4)]);
    }

    #[test]
    fn test_inv() {
        let matrix = square2([4.0, 7.0, 2.0, 6.0]);

        let inverse = inv(&matrix).expect("inverse should succeed");

        // A * A^-1 should be identity
        let identity = matmul(&matrix, &inverse).expect("matmul should succeed");

        assert_entries(
            &identity,
            &[([0, 0], 1.0), ([0, 1], 0.0), ([1, 0], 0.0), ([1, 1], 1.0)],
        );
    }

    #[test]
    fn test_det() {
        let matrix = square2([1.0, 2.0, 3.0, 4.0]);

        let determinant = det(&matrix).expect("determinant should succeed");

        // det([[1, 2], [3, 4]]) = 1*4 - 2*3 = -2
        assert_relative_eq!(determinant, -2.0, epsilon = TOL);
    }

    #[test]
    fn test_qr() {
        let matrix = square2([1.0, 2.0, 3.0, 4.0]);

        let (q, r) = qr(&matrix).expect("QR decomposition should succeed");

        // Q should be orthogonal: Q * Q^T = I (only the diagonal is checked here)
        let q_t = q.transpose();
        let identity = matmul(&q, &q_t).expect("matmul should succeed");
        assert_entries(&identity, &[([0, 0], 1.0), ([1, 1], 1.0)]);

        // A = Q * R
        let reconstructed = matmul(&q, &r).expect("matmul should succeed");
        assert_entries(
            &reconstructed,
            &[([0, 0], 1.0), ([0, 1], 2.0), ([1, 0], 3.0), ([1, 1], 4.0)],
        );
    }

    #[test]
    fn test_cholesky() {
        // Symmetric positive-definite matrix
        let matrix = square2([4.0, 2.0, 2.0, 3.0]);

        let lower = cholesky(&matrix).expect("Cholesky decomposition should succeed");

        // L * L^T should equal A
        let lower_t = lower.transpose();
        let reconstructed = matmul(&lower, &lower_t).expect("matmul should succeed");

        assert_entries(
            &reconstructed,
            &[([0, 0], 4.0), ([0, 1], 2.0), ([1, 0], 2.0), ([1, 1], 3.0)],
        );
    }

    #[test]
    fn test_eigh() {
        // Symmetric matrix
        let matrix = square2([2.0, 1.0, 1.0, 2.0]);

        let (eigenvalues, _eigenvectors) =
            eigh(&matrix).expect("eigendecomposition should succeed");

        // Eigenvalues should be 1 and 3
        let mut sorted = eigenvalues.to_vec();
        sorted.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap_or(std::cmp::Ordering::Equal));

        assert_relative_eq!(sorted[0], 1.0, epsilon = TOL);
        assert_relative_eq!(sorted[1], 3.0, epsilon = TOL);
    }

    #[test]
    fn test_accelerated_blas_gemm() {
        let lhs = square2([1.0, 2.0, 3.0, 4.0]);
        let rhs = square2([5.0, 6.0, 7.0, 8.0]);
        let mut out = Array::zeros(&[2, 2]);

        AcceleratedBlas::gemm(&lhs, &rhs, &mut out, 1.0, 0.0, false, false)
            .expect("gemm should succeed");

        assert_entries(
            &out,
            &[([0, 0], 19.0), ([0, 1], 22.0), ([1, 0], 43.0), ([1, 1], 50.0)],
        );
    }
}