use super::{adaptive, WorkerConfig};
use crate::error::{LinalgError, LinalgResult};
use scirs2_core::ndarray::{Array1, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, NumAssign, One, Zero};
use scirs2_core::parallel_ops::*;
use std::iter::Sum;
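/// Parallel matrix-vector product `y = A * x`.
///
/// Falls back to `ndarray`'s sequential `dot` when the problem is too small
/// for parallelism to pay off (as decided by `adaptive::should_use_parallel`);
/// otherwise each row dot product is computed on the thread pool.
///
/// A minimal usage sketch (assuming `WorkerConfig` implements `Default` and
/// that `scirs2_core::ndarray` re-exports the `array!` macro, neither of
/// which is verified here):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
/// let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
/// let x = array![1.0, 1.0];
/// let y = parallel_matvec(&a.view(), &x.view(), &WorkerConfig::default())?;
/// assert_eq!(y, array![3.0, 7.0]);
/// ```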
pub fn parallel_matvec<F>(
matrix: &ArrayView2<F>,
vector: &ArrayView1<F>,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float + Send + Sync + Zero + Sum + 'static,
{
let (m, n) = matrix.dim();
if n != vector.len() {
return Err(LinalgError::ShapeError(format!(
"Matrix-vector dimensions incompatible: {m}x{n} * {}",
vector.len()
)));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return Ok(matrix.dot(vector));
}
config.apply();
let result_vec: Vec<F> = (0..m)
.into_par_iter()
.map(|i| {
matrix
.row(i)
.iter()
.zip(vector.iter())
.map(|(&aij, &xj)| aij * xj)
.sum()
})
.collect();
Ok(Array1::from_vec(result_vec))
}
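/// Power iteration for the dominant eigenpair of a square matrix, returning
/// `(eigenvalue, eigenvector)`.
///
/// Starting from a normalized all-ones vector, each step applies the matrix
/// via [`parallel_matvec`], estimates the eigenvalue with the Rayleigh
/// quotient `v^T A v`, and renormalizes. Iteration stops when two successive
/// estimates differ by less than `tolerance`; a vanishing iterate or
/// exhausting `max_iter` is an error. Small problems are delegated to
/// `crate::eigen::power_iteration`.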
pub fn parallel_power_iteration<F>(
matrix: &ArrayView2<F>,
max_iter: usize,
tolerance: F,
config: &WorkerConfig,
) -> LinalgResult<(F, Array1<F>)>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ NumAssign
+ One
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"Power iteration requires square matrix".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::eigen::power_iteration(&matrix.view(), max_iter, tolerance);
}
config.apply();
let mut v = Array1::ones(n);
let norm = v.iter().map(|&x| x * x).sum::<F>().sqrt();
v /= norm;
let mut eigenvalue = F::zero();
for _iter in 0..max_iter {
let new_v = parallel_matvec(matrix, &v.view(), config)?;
let new_eigenvalue = new_v
.iter()
.zip(v.iter())
.map(|(&new_vi, &vi)| new_vi * vi)
.sum::<F>();
let norm = new_v.iter().map(|&x| x * x).sum::<F>().sqrt();
if norm < F::epsilon() {
return Err(LinalgError::ComputationError(
"Vector became zero during iteration".to_string(),
));
}
let normalized_v = new_v / norm;
if (new_eigenvalue - eigenvalue).abs() < tolerance {
return Ok((new_eigenvalue, normalized_v));
}
eigenvalue = new_eigenvalue;
v = normalized_v;
}
Err(LinalgError::ComputationError(
"Power iteration failed to converge".to_string(),
))
}
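/// Parallel BLAS-1 style vector kernels (dot product, Euclidean norm, and
/// AXPY), each with a sequential fallback for small inputs.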
pub mod vector_ops {
use super::*;
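/// Dot product `x . y`, accumulated in parallel over the element index;
/// small inputs use a sequential fold.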
pub fn parallel_dot<F>(
x: &ArrayView1<F>,
y: &ArrayView1<F>,
config: &WorkerConfig,
) -> LinalgResult<F>
where
F: Float + Send + Sync + Zero + Sum + 'static,
{
if x.len() != y.len() {
return Err(LinalgError::ShapeError(
"Vectors must have same length for dot product".to_string(),
));
}
let datasize = x.len();
if !adaptive::should_use_parallel(datasize, config) {
return Ok(x.iter().zip(y.iter()).map(|(&xi, &yi)| xi * yi).sum());
}
config.apply();
let result = (0..x.len()).into_par_iter().map(|i| x[i] * y[i]).sum();
Ok(result)
}
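/// Euclidean norm `||x||_2 = sqrt(sum_i x_i^2)`, with the sum of squares
/// accumulated in parallel for large vectors.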
pub fn parallel_norm<F>(x: &ArrayView1<F>, config: &WorkerConfig) -> LinalgResult<F>
where
F: Float + Send + Sync + Zero + Sum + 'static,
{
let datasize = x.len();
if !adaptive::should_use_parallel(datasize, config) {
return Ok(x.iter().map(|&xi| xi * xi).sum::<F>().sqrt());
}
config.apply();
let sum_squares = (0..x.len()).into_par_iter().map(|i| x[i] * x[i]).sum::<F>();
Ok(sum_squares.sqrt())
}
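/// AXPY update: returns the new vector `alpha * x + y` without modifying
/// the inputs.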
pub fn parallel_axpy<F>(
alpha: F,
x: &ArrayView1<F>,
y: &ArrayView1<F>,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float + Send + Sync + 'static,
{
if x.len() != y.len() {
return Err(LinalgError::ShapeError(
"Vectors must have same length for AXPY".to_string(),
));
}
let datasize = x.len();
if !adaptive::should_use_parallel(datasize, config) {
let result = x
.iter()
.zip(y.iter())
.map(|(&xi, &yi)| alpha * xi + yi)
.collect();
return Ok(Array1::from_vec(result));
}
config.apply();
let result_vec: Vec<F> = (0..x.len())
.into_par_iter()
.map(|i| alpha * x[i] + y[i])
.collect();
Ok(Array1::from_vec(result_vec))
}
}
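/// General matrix-matrix product `C = A * B`, with rows of `C` computed in
/// parallel and the inner dimension walked in blocks of `config.chunksize`
/// for cache locality. Small products fall back to `ndarray`'s `dot`.
///
/// A minimal usage sketch (assuming `WorkerConfig` implements `Default`,
/// which is not verified here):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
/// let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
/// let b = array![[5.0, 6.0], [7.0, 8.0]];
/// let c = parallel_gemm(&a.view(), &b.view(), &WorkerConfig::default())?;
/// assert_eq!(c, array![[19.0, 22.0], [43.0, 50.0]]);
/// ```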
pub fn parallel_gemm<F>(
a: &ArrayView2<F>,
b: &ArrayView2<F>,
config: &WorkerConfig,
) -> LinalgResult<scirs2_core::ndarray::Array2<F>>
where
F: Float + Send + Sync + Zero + Sum + NumAssign + 'static,
{
let (m, k) = a.dim();
let (k2, n) = b.dim();
if k != k2 {
return Err(LinalgError::ShapeError(format!(
"Matrix dimensions incompatible for multiplication: {m}x{k} * {k2}x{n}"
)));
}
let datasize = m * k * n;
if !adaptive::should_use_parallel(datasize, config) {
return Ok(a.dot(b));
}
config.apply();
let blocksize = config.chunksize;
let mut result = scirs2_core::ndarray::Array2::zeros((m, n));
result
.outer_iter_mut()
.enumerate()
.par_bridge()
.for_each(|(i, mut row)| {
for j in 0..n {
let mut sum = F::zero();
for kb in (0..k).step_by(blocksize) {
let k_end = std::cmp::min(kb + blocksize, k);
for ki in kb..k_end {
sum += a[[i, ki]] * b[[ki, j]];
}
}
row[j] = sum;
}
});
Ok(result)
}
pub fn parallel_qr<F>(
matrix: &ArrayView2<F>,
config: &WorkerConfig,
) -> LinalgResult<(
scirs2_core::ndarray::Array2<F>,
scirs2_core::ndarray::Array2<F>,
)>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::decomposition::qr(&matrix.view(), None);
}
config.apply();
let mut a = matrix.to_owned();
let mut q = scirs2_core::ndarray::Array2::eye(m);
let min_dim = std::cmp::min(m, n);
for k in 0..min_dim {
let x = a.slice(scirs2_core::ndarray::s![k.., k]).to_owned();
let alpha = if x[0] >= F::zero() {
-x.iter().map(|&xi| xi * xi).sum::<F>().sqrt()
} else {
x.iter().map(|&xi| xi * xi).sum::<F>().sqrt()
};
if alpha.abs() < F::epsilon() {
continue;
}
let mut v = x.clone();
v[0] -= alpha;
let v_norm_sq = v.iter().map(|&vi| vi * vi).sum::<F>();
if v_norm_sq < F::epsilon() {
continue;
}
// Apply H = I - 2 v v^T / (v^T v) to every trailing column of A,
// including column k itself so its subdiagonal entries are annihilated
// (the previous guard skipped the final column when k == n - 1).
for j in k..n {
let col = a.slice(scirs2_core::ndarray::s![k.., j]).to_owned();
let dot_product = v
.iter()
.zip(col.iter())
.map(|(&vi, &ci)| vi * ci)
.sum::<F>();
let factor = F::from(2.0).expect("conversion from 2.0 cannot fail") * dot_product / v_norm_sq;
for (i, &vi) in v.iter().enumerate() {
a[[k + i, j]] -= factor * vi;
}
}
for i in 0..m {
let row = q.slice(scirs2_core::ndarray::s![i, k..]).to_owned();
let dot_product = v
.iter()
.zip(row.iter())
.map(|(&vi, &ri)| vi * ri)
.sum::<F>();
let factor = F::from(2.0).expect("conversion from 2.0 cannot fail") * dot_product / v_norm_sq;
for (j, &vj) in v.iter().enumerate() {
q[[i, k + j]] -= factor * vj;
}
}
}
// Return R as the full m x n upper-triangular factor so that Q * R matches
// A for tall matrices as well; sub-diagonal entries hold only round-off
// residue from the reflections, so zero them explicitly.
let mut r = a;
for i in 1..m {
for j in 0..std::cmp::min(i, n) {
r[[i, j]] = F::zero();
}
}
Ok((q, r))
}
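/// Cholesky factorization `A = L L^T` of a symmetric positive definite
/// matrix, returning the lower-triangular factor `L`.
///
/// Columns are processed in panels of `config.chunksize` for cache locality:
/// each diagonal block is factored, then the rows below it are updated by
/// forward substitution. A non-positive pivot is reported as "not positive
/// definite". Note that after the adaptive dispatch the factorization itself
/// runs sequentially; small matrices are delegated to
/// `crate::decomposition::cholesky`.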
pub fn parallel_cholesky<F>(
matrix: &ArrayView2<F>,
config: &WorkerConfig,
) -> LinalgResult<scirs2_core::ndarray::Array2<F>>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"Cholesky decomposition requires square matrix".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::decomposition::cholesky(&matrix.view(), None);
}
config.apply();
let mut l = scirs2_core::ndarray::Array2::zeros((n, n));
let blocksize = config.chunksize;
for k in (0..n).step_by(blocksize) {
let k_end = std::cmp::min(k + blocksize, n);
for i in k..k_end {
let mut sum = F::zero();
for j in 0..i {
sum += l[[i, j]] * l[[i, j]];
}
let aii = matrix[[i, i]] - sum;
if aii <= F::zero() {
return Err(LinalgError::ComputationError(
"Matrix is not positive definite".to_string(),
));
}
l[[i, i]] = aii.sqrt();
for j in (i + 1)..k_end {
let mut sum = F::zero();
for p in 0..i {
sum += l[[j, p]] * l[[i, p]];
}
l[[j, i]] = (matrix[[j, i]] - sum) / l[[i, i]];
}
}
if k_end < n {
for i in k_end..n {
for j in k..k_end {
let mut sum = F::zero();
for p in 0..j {
sum += l[[i, p]] * l[[j, p]];
}
l[[i, j]] = (matrix[[i, j]] - sum) / l[[j, j]];
}
}
}
}
Ok(l)
}
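/// LU decomposition with partial pivoting, returning `(P, L, U)` such that
/// `P A = L U`.
///
/// At each step the largest remaining entry of the pivot column is swapped
/// into place; a pivot below machine epsilon signals a singular matrix. `L`
/// is unit lower triangular (the stored multipliers) and `U` is upper
/// triangular. As with the other direct factorizations here, the elimination
/// loop is sequential after the adaptive dispatch; small matrices are
/// delegated to `crate::decomposition::lu`.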
pub fn parallel_lu<F>(
matrix: &ArrayView2<F>,
config: &WorkerConfig,
) -> LinalgResult<(
scirs2_core::ndarray::Array2<F>,
scirs2_core::ndarray::Array2<F>,
scirs2_core::ndarray::Array2<F>,
)>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::decomposition::lu(&matrix.view(), None);
}
config.apply();
let mut a = matrix.to_owned();
let mut perm_vec = (0..m).collect::<Vec<_>>();
let min_dim = std::cmp::min(m, n);
for k in 0..min_dim {
let mut max_val = F::zero();
let mut pivot_row = k;
for i in k..m {
let abs_val = a[[i, k]].abs();
if abs_val > max_val {
max_val = abs_val;
pivot_row = i;
}
}
if max_val < F::epsilon() {
return Err(LinalgError::ComputationError(
"Matrix is singular".to_string(),
));
}
if pivot_row != k {
for j in 0..n {
let temp = a[[k, j]];
a[[k, j]] = a[[pivot_row, j]];
a[[pivot_row, j]] = temp;
}
perm_vec.swap(k, pivot_row);
}
let pivot = a[[k, k]];
for i in (k + 1)..m {
let multiplier = a[[i, k]] / pivot;
a[[i, k]] = multiplier;
for j in (k + 1)..n {
a[[i, j]] = a[[i, j]] - multiplier * a[[k, j]];
}
}
}
let mut p = scirs2_core::ndarray::Array2::zeros((m, m));
for (i, &piv) in perm_vec.iter().enumerate() {
p[[i, piv]] = F::one();
}
let mut l = scirs2_core::ndarray::Array2::eye(m);
let mut u = scirs2_core::ndarray::Array2::zeros((m, n));
for i in 0..m {
for j in 0..n {
if i > j && j < min_dim {
l[[i, j]] = a[[i, j]];
} else if i <= j {
u[[i, j]] = a[[i, j]];
}
}
}
Ok((p, l, u))
}
pub fn parallel_conjugate_gradient<F>(
matrix: &ArrayView2<F>,
b: &ArrayView1<F>,
max_iter: usize,
tolerance: F,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"CG requires square matrix".to_string(),
));
}
if n != b.len() {
return Err(LinalgError::ShapeError(
"Matrix and vector dimensions incompatible".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::iterative_solvers::conjugate_gradient(
&matrix.view(),
&b.view(),
max_iter,
tolerance,
None,
);
}
config.apply();
let mut x = Array1::zeros(n);
let ax = parallel_matvec(matrix, &x.view(), config)?;
let mut r = b - &ax;
let mut p = r.clone();
let mut rsold = vector_ops::parallel_dot(&r.view(), &r.view(), config)?;
for _iter in 0..max_iter {
let ap = parallel_matvec(matrix, &p.view(), config)?;
let alpha = rsold / vector_ops::parallel_dot(&p.view(), &ap.view(), config)?;
x = vector_ops::parallel_axpy(alpha, &p.view(), &x.view(), config)?;
r = vector_ops::parallel_axpy(-alpha, &ap.view(), &r.view(), config)?;
let rsnew = vector_ops::parallel_dot(&r.view(), &r.view(), config)?;
if rsnew.sqrt() < tolerance {
return Ok(x);
}
let beta = rsnew / rsold;
p = vector_ops::parallel_axpy(beta, &p.view(), &r.view(), config)?;
rsold = rsnew;
}
Err(LinalgError::ComputationError(
"Conjugate gradient failed to converge".to_string(),
))
}
pub fn parallel_svd<F>(
matrix: &ArrayView2<F>,
config: &WorkerConfig,
) -> LinalgResult<(
scirs2_core::ndarray::Array2<F>,
scirs2_core::ndarray::Array1<F>,
scirs2_core::ndarray::Array2<F>,
)>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::decomposition::svd(&matrix.view(), false, None);
}
config.apply();
let (q, r) = parallel_qr(matrix, config)?;
let (u_r, s, vt) = crate::decomposition::svd(&r.view(), false, None)?;
let u = parallel_gemm(&q.view(), &u_r.view(), config)?;
Ok((u, s, vt))
}
pub fn parallel_gmres<F>(
matrix: &ArrayView2<F>,
b: &ArrayView1<F>,
max_iter: usize,
tolerance: F,
restart: usize,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ std::fmt::Debug
+ std::fmt::Display
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"GMRES requires square matrix".to_string(),
));
}
if n != b.len() {
return Err(LinalgError::ShapeError(
"Matrix and vector dimensions incompatible".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
let options = crate::solvers::iterative::IterativeSolverOptions {
max_iterations: max_iter,
tolerance,
verbose: false,
restart: Some(restart),
};
let result = crate::solvers::iterative::gmres(matrix, b, None, &options)?;
return Ok(result.solution);
}
config.apply();
let mut x = Array1::zeros(n);
let restart = restart.min(n);
for _outer in 0..max_iter {
let ax = parallel_matvec(matrix, &x.view(), config)?;
let r = b - &ax;
let beta = vector_ops::parallel_norm(&r.view(), config)?;
if beta < tolerance {
return Ok(x);
}
let mut v = vec![r / beta];
let mut h = scirs2_core::ndarray::Array2::<F>::zeros((restart + 1, restart));
for j in 0..restart {
let w = parallel_matvec(matrix, &v[j].view(), config)?;
let mut w_new = w.clone();
for i in 0..=j {
h[[i, j]] = vector_ops::parallel_dot(&w.view(), &v[i].view(), config)?;
w_new = vector_ops::parallel_axpy(-h[[i, j]], &v[i].view(), &w_new.view(), config)?;
}
h[[j + 1, j]] = vector_ops::parallel_norm(&w_new.view(), config)?;
if h[[j + 1, j]] < F::epsilon() {
break;
}
v.push(w_new / h[[j + 1, j]]);
}
let k = v.len() - 1;
// Solve the (k+1) x k Hessenberg least-squares problem
// min ||beta * e1 - H y|| via Givens rotations; plain back substitution
// would wrongly ignore the subdiagonal of H.
let mut h_sub = h.slice(scirs2_core::ndarray::s![..=k, ..k]).to_owned();
let mut g = Array1::zeros(k + 1);
g[0] = beta;
for i in 0..k {
let denom = (h_sub[[i, i]] * h_sub[[i, i]] + h_sub[[i + 1, i]] * h_sub[[i + 1, i]]).sqrt();
if denom < F::epsilon() {
continue;
}
let (c, s) = (h_sub[[i, i]] / denom, h_sub[[i + 1, i]] / denom);
for j in i..k {
let (hij, hi1j) = (h_sub[[i, j]], h_sub[[i + 1, j]]);
h_sub[[i, j]] = c * hij + s * hi1j;
h_sub[[i + 1, j]] = -s * hij + c * hi1j;
}
let (gi, gi1) = (g[i], g[i + 1]);
g[i] = c * gi + s * gi1;
g[i + 1] = -s * gi + c * gi1;
}
// Back substitution on the now upper-triangular system.
let mut y = Array1::zeros(k);
for i in (0..k).rev() {
let mut sum = g[i];
for j in (i + 1)..k {
sum -= h_sub[[i, j]] * y[j];
}
y[i] = sum / h_sub[[i, i]];
}
// Update the iterate: x += V y.
for i in 0..k {
x = vector_ops::parallel_axpy(y[i], &v[i].view(), &x.view(), config)?;
}
let ax = parallel_matvec(matrix, &x.view(), config)?;
let r = b - &ax;
let residual_norm = vector_ops::parallel_norm(&r.view(), config)?;
if residual_norm < tolerance {
return Ok(x);
}
}
Err(LinalgError::ComputationError(
"GMRES failed to converge".to_string(),
))
}
pub fn parallel_bicgstab<F>(
matrix: &ArrayView2<F>,
b: &ArrayView1<F>,
max_iter: usize,
tolerance: F,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"BiCGSTAB requires square matrix".to_string(),
));
}
if n != b.len() {
return Err(LinalgError::ShapeError(
"Matrix and vector dimensions incompatible".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::iterative_solvers::bicgstab(
&matrix.view(),
&b.view(),
max_iter,
tolerance,
None,
);
}
config.apply();
let mut x = Array1::zeros(n);
let ax = parallel_matvec(matrix, &x.view(), config)?;
let mut r = b - &ax;
let r_hat = r.clone();
let mut rho = F::one();
let mut alpha = F::one();
let mut omega = F::one();
let mut v = Array1::zeros(n);
let mut p = Array1::zeros(n);
for _iter in 0..max_iter {
let rho_new = vector_ops::parallel_dot(&r_hat.view(), &r.view(), config)?;
if rho_new.abs() < F::epsilon() {
return Err(LinalgError::ComputationError(
"BiCGSTAB breakdown: rho = 0".to_string(),
));
}
let beta = (rho_new / rho) * (alpha / omega);
// p = r + beta * (p - omega * v), without the zero-vector intermediate.
let temp = vector_ops::parallel_axpy(-omega, &v.view(), &p.view(), config)?;
p = vector_ops::parallel_axpy(beta, &temp.view(), &r.view(), config)?;
v = parallel_matvec(matrix, &p.view(), config)?;
alpha = rho_new / vector_ops::parallel_dot(&r_hat.view(), &v.view(), config)?;
let s = vector_ops::parallel_axpy(-alpha, &v.view(), &r.view(), config)?;
let s_norm = vector_ops::parallel_norm(&s.view(), config)?;
if s_norm < tolerance {
x = vector_ops::parallel_axpy(alpha, &p.view(), &x.view(), config)?;
return Ok(x);
}
let t = parallel_matvec(matrix, &s.view(), config)?;
omega = vector_ops::parallel_dot(&t.view(), &s.view(), config)?
/ vector_ops::parallel_dot(&t.view(), &t.view(), config)?;
x = vector_ops::parallel_axpy(alpha, &p.view(), &x.view(), config)?;
x = vector_ops::parallel_axpy(omega, &s.view(), &x.view(), config)?;
r = vector_ops::parallel_axpy(-omega, &t.view(), &s.view(), config)?;
let r_norm = vector_ops::parallel_norm(&r.view(), config)?;
if r_norm < tolerance {
return Ok(x);
}
rho = rho_new;
}
Err(LinalgError::ComputationError(
"BiCGSTAB failed to converge".to_string(),
))
}
pub fn parallel_jacobi<F>(
matrix: &ArrayView2<F>,
b: &ArrayView1<F>,
max_iter: usize,
tolerance: F,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"Jacobi method requires square matrix".to_string(),
));
}
if n != b.len() {
return Err(LinalgError::ShapeError(
"Matrix and vector dimensions incompatible".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::iterative_solvers::jacobi_method(
&matrix.view(),
&b.view(),
max_iter,
tolerance,
None,
);
}
config.apply();
// Cache the diagonal; entries at or below machine epsilon are replaced by
// one to avoid division by zero (such rows make plain Jacobi inapplicable).
let diag: Vec<F> = (0..n)
.into_par_iter()
.map(|i| {
if matrix[[i, i]].abs() < F::epsilon() {
F::one()
} else {
matrix[[i, i]]
}
})
.collect();
let mut x = Array1::zeros(n);
for _iter in 0..max_iter {
let x_new_vec: Vec<F> = (0..n)
.into_par_iter()
.map(|i| {
let mut sum = b[i];
for j in 0..n {
if i != j {
sum -= matrix[[i, j]] * x[j];
}
}
sum / diag[i]
})
.collect();
let x_new = Array1::from_vec(x_new_vec);
let diff = &x_new - &x;
let error = vector_ops::parallel_norm(&diff.view(), config)?;
if error < tolerance {
return Ok(x_new);
}
x = x_new;
}
Err(LinalgError::ComputationError(
"Jacobi method failed to converge".to_string(),
))
}
pub fn parallel_sor<F>(
matrix: &ArrayView2<F>,
b: &ArrayView1<F>,
omega: F,
max_iter: usize,
tolerance: F,
config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
F: Float
+ Send
+ Sync
+ Zero
+ Sum
+ One
+ NumAssign
+ scirs2_core::ndarray::ScalarOperand
+ 'static,
{
let (m, n) = matrix.dim();
if m != n {
return Err(LinalgError::ShapeError(
"SOR requires square matrix".to_string(),
));
}
if n != b.len() {
return Err(LinalgError::ShapeError(
"Matrix and vector dimensions incompatible".to_string(),
));
}
if omega <= F::zero() || omega >= F::from(2.0).expect("conversion from 2.0 cannot fail") {
return Err(LinalgError::InvalidInputError(
"Relaxation parameter omega must be in (0, 2)".to_string(),
));
}
let datasize = m * n;
if !adaptive::should_use_parallel(datasize, config) {
return crate::iterative_solvers::successive_over_relaxation(
&matrix.view(),
&b.view(),
omega,
max_iter,
tolerance,
None,
);
}
config.apply();
let mut x = Array1::zeros(n);
for _iter in 0..max_iter {
let x_old = x.clone();
let red_updates: Vec<(usize, F)> = (0..n)
.into_par_iter()
.filter(|&i| i % 2 == 0)
.map(|i| {
let mut sum = b[i];
for j in 0..n {
if i != j {
sum -= matrix[[i, j]] * x_old[j];
}
}
let x_gs = sum / matrix[[i, i]];
let x_new = (F::one() - omega) * x_old[i] + omega * x_gs;
(i, x_new)
})
.collect();
for (i, val) in red_updates {
x[i] = val;
}
let black_updates: Vec<(usize, F)> = (0..n)
.into_par_iter()
.filter(|&i| i % 2 == 1)
.map(|i| {
let mut sum = b[i];
for j in 0..n {
if i != j {
sum -= matrix[[i, j]] * x[j];
}
}
let x_gs = sum / matrix[[i, i]];
let x_new = (F::one() - omega) * x_old[i] + omega * x_gs;
(i, x_new)
})
.collect();
for (i, val) in black_updates {
x[i] = val;
}
let ax = parallel_matvec(matrix, &x.view(), config)?;
let r = b - &ax;
let error = vector_ops::parallel_norm(&r.view(), config)?;
if error < tolerance {
return Ok(x);
}
}
Err(LinalgError::ComputationError(
"SOR failed to converge".to_string(),
))
}