scirs2_linalg/parallel/algorithms/mod.rs

//! Parallel linear algebra algorithms
//!
//! This module provides parallel implementations of core linear algebra
//! algorithms optimized for multi-core systems.

use super::{adaptive, WorkerConfig};
use crate::error::{LinalgError, LinalgResult};
use scirs2_core::ndarray::{Array1, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, NumAssign, One, Zero};
use scirs2_core::parallel_ops::*;
use std::iter::Sum;
/// Parallel matrix-vector multiplication
///
/// Each row of the result is computed independently, making this a simple and
/// effective parallelization that serves as a building block for more complex
/// algorithms.
///
/// # Arguments
///
/// * `matrix` - Input matrix
/// * `vector` - Input vector
/// * `config` - Worker configuration
///
/// # Returns
///
/// * Result vector y = A * x
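///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not compiled as a doctest;
/// assumes the items here are in scope and that `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// // 2x2 matrix times a length-2 vector.
/// let a = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
/// let x = Array1::from_vec(vec![1.0, 1.0]);
/// let config = WorkerConfig::default();
/// let y = parallel_matvec(&a.view(), &x.view(), &config).unwrap();
/// assert_eq!(y, Array1::from_vec(vec![3.0, 7.0]));
/// ```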
pub fn parallel_matvec<F>(
    matrix: &ArrayView2<F>,
    vector: &ArrayView1<F>,
    config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
    F: Float + Send + Sync + Zero + Sum + 'static,
{
    let (m, n) = matrix.dim();
    if n != vector.len() {
        return Err(LinalgError::ShapeError(format!(
            "Matrix-vector dimensions incompatible: {m}x{n} * {}",
            vector.len()
        )));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        // Fall back to serial computation
        return Ok(matrix.dot(vector));
    }

    config.apply();

    // Parallel computation of each row
    let result_vec: Vec<F> = (0..m)
        .into_par_iter()
        .map(|i| {
            matrix
                .row(i)
                .iter()
                .zip(vector.iter())
                .map(|(&aij, &xj)| aij * xj)
                .sum()
        })
        .collect();

    Ok(Array1::from_vec(result_vec))
}

/// Parallel power iteration for dominant eigenvalue
///
/// This implementation uses parallel matrix-vector multiplications
/// in the power iteration method for computing dominant eigenvalues.
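///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// // Symmetric matrix with dominant eigenvalue 3 (eigenvector [1, 1] / sqrt(2)).
/// let a = Array2::from_shape_vec((2, 2), vec![2.0, 1.0, 1.0, 2.0]).unwrap();
/// let config = WorkerConfig::default();
/// let (lambda, _v) = parallel_power_iteration(&a.view(), 1000, 1e-10, &config).unwrap();
/// assert!((lambda - 3.0_f64).abs() < 1e-6);
/// ```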
pub fn parallel_power_iteration<F>(
    matrix: &ArrayView2<F>,
    max_iter: usize,
    tolerance: F,
    config: &WorkerConfig,
) -> LinalgResult<(F, Array1<F>)>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + NumAssign
        + One
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "Power iteration requires square matrix".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        // Fall back to serial power iteration
        return crate::eigen::power_iteration(&matrix.view(), max_iter, tolerance);
    }

    config.apply();

    // Initialize with a normalized all-ones vector
    let mut v = Array1::ones(n);
    let norm = v.iter().map(|&x| x * x).sum::<F>().sqrt();
    v /= norm;

    let mut eigenvalue = F::zero();

    for _iter in 0..max_iter {
        // Use the parallel matrix-vector multiplication
        let new_v = parallel_matvec(matrix, &v.view(), config)?;

        // Compute eigenvalue estimate (Rayleigh quotient)
        let new_eigenvalue = new_v
            .iter()
            .zip(v.iter())
            .map(|(&new_vi, &vi)| new_vi * vi)
            .sum::<F>();

        // Normalize
        let norm = new_v.iter().map(|&x| x * x).sum::<F>().sqrt();
        if norm < F::epsilon() {
            return Err(LinalgError::ComputationError(
                "Vector became zero during iteration".to_string(),
            ));
        }
        let normalized_v = new_v / norm;

        // Check convergence
        if (new_eigenvalue - eigenvalue).abs() < tolerance {
            return Ok((new_eigenvalue, normalized_v));
        }

        eigenvalue = new_eigenvalue;
        v = normalized_v;
    }

    Err(LinalgError::ComputationError(
        "Power iteration failed to converge".to_string(),
    ))
}

/// Parallel vector operations for linear algebra
///
/// This module provides basic parallel vector operations that serve as
/// building blocks for more complex algorithms.
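///
/// # Examples
///
/// A minimal sketch of the dot product and norm helpers (illustrative only,
/// not compiled as a doctest; assumes `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::Array1;
///
/// let x = Array1::from_vec(vec![3.0, 4.0]);
/// let y = Array1::from_vec(vec![1.0, 2.0]);
/// let config = WorkerConfig::default();
/// let dot = vector_ops::parallel_dot(&x.view(), &y.view(), &config).unwrap();
/// let norm = vector_ops::parallel_norm(&x.view(), &config).unwrap();
/// assert_eq!(dot, 11.0); // 3*1 + 4*2
/// assert_eq!(norm, 5.0); // sqrt(3^2 + 4^2)
/// ```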
pub mod vector_ops {
    use super::*;

    /// Parallel dot product of two vectors
    pub fn parallel_dot<F>(
        x: &ArrayView1<F>,
        y: &ArrayView1<F>,
        config: &WorkerConfig,
    ) -> LinalgResult<F>
    where
        F: Float + Send + Sync + Zero + Sum + 'static,
    {
        if x.len() != y.len() {
            return Err(LinalgError::ShapeError(
                "Vectors must have same length for dot product".to_string(),
            ));
        }

        let datasize = x.len();
        if !adaptive::should_use_parallel(datasize, config) {
            return Ok(x.iter().zip(y.iter()).map(|(&xi, &yi)| xi * yi).sum());
        }

        config.apply();

        let result = (0..x.len()).into_par_iter().map(|i| x[i] * y[i]).sum();

        Ok(result)
    }

    /// Parallel vector norm computation
    pub fn parallel_norm<F>(x: &ArrayView1<F>, config: &WorkerConfig) -> LinalgResult<F>
    where
        F: Float + Send + Sync + Zero + Sum + 'static,
    {
        let datasize = x.len();
        if !adaptive::should_use_parallel(datasize, config) {
            return Ok(x.iter().map(|&xi| xi * xi).sum::<F>().sqrt());
        }

        config.apply();

        let sum_squares = (0..x.len()).into_par_iter().map(|i| x[i] * x[i]).sum::<F>();

        Ok(sum_squares.sqrt())
    }

    /// Parallel AXPY operation: y = a*x + y
    ///
    /// Note: This function returns a new array rather than modifying in-place
    /// due to complications with parallel mutable iteration.
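    ///
    /// # Examples
    ///
    /// A minimal sketch (illustrative only, not compiled as a doctest; assumes
    /// `WorkerConfig` implements `Default`):
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::Array1;
    ///
    /// let x = Array1::from_vec(vec![1.0, 2.0]);
    /// let y = Array1::from_vec(vec![10.0, 20.0]);
    /// let config = WorkerConfig::default();
    /// // Returns a new array holding 2*x + y; x and y are left untouched.
    /// let z = parallel_axpy(2.0, &x.view(), &y.view(), &config).unwrap();
    /// assert_eq!(z, Array1::from_vec(vec![12.0, 24.0]));
    /// ```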
    pub fn parallel_axpy<F>(
        alpha: F,
        x: &ArrayView1<F>,
        y: &ArrayView1<F>,
        config: &WorkerConfig,
    ) -> LinalgResult<Array1<F>>
    where
        F: Float + Send + Sync + 'static,
    {
        if x.len() != y.len() {
            return Err(LinalgError::ShapeError(
                "Vectors must have same length for AXPY".to_string(),
            ));
        }

        let datasize = x.len();
        if !adaptive::should_use_parallel(datasize, config) {
            let result = x
                .iter()
                .zip(y.iter())
                .map(|(&xi, &yi)| alpha * xi + yi)
                .collect();
            return Ok(Array1::from_vec(result));
        }

        config.apply();

        let result_vec: Vec<F> = (0..x.len())
            .into_par_iter()
            .map(|i| alpha * x[i] + y[i])
            .collect();

        Ok(Array1::from_vec(result_vec))
    }
}

/// Parallel matrix multiplication (GEMM)
///
/// Implements parallel general matrix multiplication: rows of the result are
/// computed in parallel, with the inner dimension processed in blocks for
/// better cache behavior.
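///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// let a = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
/// let b = Array2::eye(2);
/// let config = WorkerConfig::default();
/// // Multiplying by the identity returns a copy of `a`.
/// let c = parallel_gemm(&a.view(), &b.view(), &config).unwrap();
/// assert_eq!(c, a);
/// ```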
pub fn parallel_gemm<F>(
    a: &ArrayView2<F>,
    b: &ArrayView2<F>,
    config: &WorkerConfig,
) -> LinalgResult<scirs2_core::ndarray::Array2<F>>
where
    F: Float + Send + Sync + Zero + Sum + NumAssign + 'static,
{
    let (m, k) = a.dim();
    let (k2, n) = b.dim();

    if k != k2 {
        return Err(LinalgError::ShapeError(format!(
            "Matrix dimensions incompatible for multiplication: {m}x{k} * {k2}x{n}"
        )));
    }

    let datasize = m * k * n;
    if !adaptive::should_use_parallel(datasize, config) {
        return Ok(a.dot(b));
    }

    config.apply();

    // Block size for cache-friendly computation
    let blocksize = config.chunksize;

    let mut result = scirs2_core::ndarray::Array2::zeros((m, n));

    // Parallel computation using blocks
    result
        .outer_iter_mut()
        .enumerate()
        .par_bridge()
        .for_each(|(i, mut row)| {
            for j in 0..n {
                let mut sum = F::zero();
                for kb in (0..k).step_by(blocksize) {
                    let k_end = std::cmp::min(kb + blocksize, k);
                    for ki in kb..k_end {
                        sum += a[[i, ki]] * b[[ki, j]];
                    }
                }
                row[j] = sum;
            }
        });

    Ok(result)
}

/// Parallel QR decomposition using Householder reflections
///
/// Computes the decomposition via Householder reflections. The Householder
/// updates are currently applied serially; small inputs fall back to the
/// serial QR routine.
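///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// let a = Array2::from_shape_vec((2, 2), vec![4.0, 0.0, 3.0, 1.0]).unwrap();
/// let config = WorkerConfig::default();
/// let (q, r) = parallel_qr(&a.view(), &config).unwrap();
/// // Q is orthogonal and R is upper triangular, with Q * R ~= A.
/// let qr = q.dot(&r);
/// assert!((qr[[1, 0]] - 3.0_f64).abs() < 1e-10);
/// ```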
pub fn parallel_qr<F>(
    matrix: &ArrayView2<F>,
    config: &WorkerConfig,
) -> LinalgResult<(
    scirs2_core::ndarray::Array2<F>,
    scirs2_core::ndarray::Array2<F>,
)>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    let datasize = m * n;

    if !adaptive::should_use_parallel(datasize, config) {
        return crate::decomposition::qr(&matrix.view(), None);
    }

    config.apply();

    let mut a = matrix.to_owned();
    let mut q = scirs2_core::ndarray::Array2::eye(m);
    let min_dim = std::cmp::min(m, n);

    for k in 0..min_dim {
        // Extract column vector for Householder reflection
        let x = a.slice(scirs2_core::ndarray::s![k.., k]).to_owned();
        let alpha = if x[0] >= F::zero() {
            -x.iter().map(|&xi| xi * xi).sum::<F>().sqrt()
        } else {
            x.iter().map(|&xi| xi * xi).sum::<F>().sqrt()
        };

        if alpha.abs() < F::epsilon() {
            continue;
        }

        let mut v = x.clone();
        v[0] -= alpha;
        let v_norm_sq = v.iter().map(|&vi| vi * vi).sum::<F>();

        if v_norm_sq < F::epsilon() {
            continue;
        }

        // Apply the Householder transformation to columns k..n,
        // including column k itself (serial for simplicity)
        for j in k..n {
            let col = a.slice(scirs2_core::ndarray::s![k.., j]).to_owned();
            let dot_product = v
                .iter()
                .zip(col.iter())
                .map(|(&vi, &ci)| vi * ci)
                .sum::<F>();
            let factor = F::from(2.0).unwrap() * dot_product / v_norm_sq;

            for (i, &vi) in v.iter().enumerate() {
                a[[k + i, j]] -= factor * vi;
            }
        }

        // Update Q matrix (serial for simplicity)
        for i in 0..m {
            let row = q.slice(scirs2_core::ndarray::s![i, k..]).to_owned();
            let dot_product = v
                .iter()
                .zip(row.iter())
                .map(|(&vi, &ri)| vi * ri)
                .sum::<F>();
            let factor = F::from(2.0).unwrap() * dot_product / v_norm_sq;

            for (j, &vj) in v.iter().enumerate() {
                q[[i, k + j]] -= factor * vj;
            }
        }
    }

    let r = a.slice(scirs2_core::ndarray::s![..min_dim, ..]).to_owned();
    Ok((q, r))
}

/// Parallel Cholesky decomposition
///
/// Implements Cholesky decomposition for positive definite matrices using a
/// block-column sweep. The factorization itself currently runs serially for
/// numerical stability; small inputs fall back to the serial routine.
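///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// // Symmetric positive definite matrix.
/// let a = Array2::from_shape_vec((2, 2), vec![4.0, 2.0, 2.0, 3.0]).unwrap();
/// let config = WorkerConfig::default();
/// let l = parallel_cholesky(&a.view(), &config).unwrap();
/// // L is lower triangular with L * L^T ~= A; here L[0][0] = sqrt(4) = 2.
/// assert!((l[[0, 0]] - 2.0_f64).abs() < 1e-10);
/// ```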
pub fn parallel_cholesky<F>(
    matrix: &ArrayView2<F>,
    config: &WorkerConfig,
) -> LinalgResult<scirs2_core::ndarray::Array2<F>>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "Cholesky decomposition requires square matrix".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        return crate::decomposition::cholesky(&matrix.view(), None);
    }

    config.apply();

    let mut l = scirs2_core::ndarray::Array2::zeros((n, n));
    let blocksize = config.chunksize;

    for k in (0..n).step_by(blocksize) {
        let k_end = std::cmp::min(k + blocksize, n);

        // Diagonal block factorization (serial for numerical stability)
        for i in k..k_end {
            // Compute L[i,i]
            let mut sum = F::zero();
            for j in 0..i {
                sum += l[[i, j]] * l[[i, j]];
            }
            let aii = matrix[[i, i]] - sum;
            if aii <= F::zero() {
                return Err(LinalgError::ComputationError(
                    "Matrix is not positive definite".to_string(),
                ));
            }
            l[[i, i]] = aii.sqrt();

            // Compute L[i+1:k_end, i]
            for j in (i + 1)..k_end {
                let mut sum = F::zero();
                for p in 0..i {
                    sum += l[[j, p]] * l[[i, p]];
                }
                l[[j, i]] = (matrix[[j, i]] - sum) / l[[i, i]];
            }
        }

        // Update trailing submatrix (serial for simplicity)
        if k_end < n {
            for i in k_end..n {
                for j in k..k_end {
                    let mut sum = F::zero();
                    for p in 0..j {
                        sum += l[[i, p]] * l[[j, p]];
                    }
                    l[[i, j]] = (matrix[[i, j]] - sum) / l[[j, j]];
                }
            }
        }
    }

    Ok(l)
}

/// Parallel LU decomposition with partial pivoting
///
/// Computes an LU decomposition with partial pivoting. The elimination steps
/// currently run serially; small inputs fall back to the serial routine.
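///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// let a = Array2::from_shape_vec((2, 2), vec![0.0, 1.0, 2.0, 3.0]).unwrap();
/// let config = WorkerConfig::default();
/// // Returns the permutation, lower-triangular, and upper-triangular factors.
/// let (p, l, u) = parallel_lu(&a.view(), &config).unwrap();
/// // Partial pivoting swaps the rows so the larger entry leads.
/// assert!((u[[0, 0]] - 2.0_f64).abs() < 1e-10);
/// ```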
pub fn parallel_lu<F>(
    matrix: &ArrayView2<F>,
    config: &WorkerConfig,
) -> LinalgResult<(
    scirs2_core::ndarray::Array2<F>,
    scirs2_core::ndarray::Array2<F>,
    scirs2_core::ndarray::Array2<F>,
)>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    let datasize = m * n;

    if !adaptive::should_use_parallel(datasize, config) {
        return crate::decomposition::lu(&matrix.view(), None);
    }

    config.apply();

    let mut a = matrix.to_owned();
    let mut perm_vec = (0..m).collect::<Vec<_>>();
    let min_dim = std::cmp::min(m, n);

    for k in 0..min_dim {
        // Find pivot (serial for correctness)
        let mut max_val = F::zero();
        let mut pivot_row = k;
        for i in k..m {
            let abs_val = a[[i, k]].abs();
            if abs_val > max_val {
                max_val = abs_val;
                pivot_row = i;
            }
        }

        if max_val < F::epsilon() {
            return Err(LinalgError::ComputationError(
                "Matrix is singular".to_string(),
            ));
        }

        // Swap rows if needed
        if pivot_row != k {
            for j in 0..n {
                let temp = a[[k, j]];
                a[[k, j]] = a[[pivot_row, j]];
                a[[pivot_row, j]] = temp;
            }
            perm_vec.swap(k, pivot_row);
        }

        // Update submatrix (serial for now to avoid borrowing issues)
        let pivot = a[[k, k]];

        for i in (k + 1)..m {
            let multiplier = a[[i, k]] / pivot;
            a[[i, k]] = multiplier;

            for j in (k + 1)..n {
                a[[i, j]] = a[[i, j]] - multiplier * a[[k, j]];
            }
        }
    }

    // Create permutation matrix P
    let mut p = scirs2_core::ndarray::Array2::zeros((m, m));
    for (i, &piv) in perm_vec.iter().enumerate() {
        p[[i, piv]] = F::one();
    }

    // Extract L and U matrices
    let mut l = scirs2_core::ndarray::Array2::eye(m);
    let mut u = scirs2_core::ndarray::Array2::zeros((m, n));

    for i in 0..m {
        for j in 0..n {
            if i > j && j < min_dim {
                l[[i, j]] = a[[i, j]];
            } else if i <= j {
                u[[i, j]] = a[[i, j]];
            }
        }
    }

    Ok((p, l, u))
}

/// Parallel conjugate gradient solver
///
/// Implements the parallel conjugate gradient method for solving linear
/// systems with symmetric positive definite matrices.
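///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// // Symmetric positive definite system A x = b with solution x = [1, 1].
/// let a = Array2::from_shape_vec((2, 2), vec![4.0, 1.0, 1.0, 3.0]).unwrap();
/// let b = Array1::from_vec(vec![5.0, 4.0]);
/// let config = WorkerConfig::default();
/// let x = parallel_conjugate_gradient(&a.view(), &b.view(), 100, 1e-10, &config).unwrap();
/// assert!((x[0] - 1.0_f64).abs() < 1e-6 && (x[1] - 1.0_f64).abs() < 1e-6);
/// ```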
pub fn parallel_conjugate_gradient<F>(
    matrix: &ArrayView2<F>,
    b: &ArrayView1<F>,
    max_iter: usize,
    tolerance: F,
    config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "CG requires square matrix".to_string(),
        ));
    }
    if n != b.len() {
        return Err(LinalgError::ShapeError(
            "Matrix and vector dimensions incompatible".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        return crate::iterative_solvers::conjugate_gradient(
            &matrix.view(),
            &b.view(),
            max_iter,
            tolerance,
            None,
        );
    }

    config.apply();

    // Initialize
    let mut x = Array1::zeros(n);

    // r = b - A*x
    let ax = parallel_matvec(matrix, &x.view(), config)?;
    let mut r = b - &ax;
    let mut p = r.clone();
    let mut rsold = vector_ops::parallel_dot(&r.view(), &r.view(), config)?;

    for _iter in 0..max_iter {
        let ap = parallel_matvec(matrix, &p.view(), config)?;
        let alpha = rsold / vector_ops::parallel_dot(&p.view(), &ap.view(), config)?;

        x = vector_ops::parallel_axpy(alpha, &p.view(), &x.view(), config)?;
        r = vector_ops::parallel_axpy(-alpha, &ap.view(), &r.view(), config)?;

        let rsnew = vector_ops::parallel_dot(&r.view(), &r.view(), config)?;

        if rsnew.sqrt() < tolerance {
            return Ok(x);
        }

        let beta = rsnew / rsold;
        p = vector_ops::parallel_axpy(beta, &p.view(), &r.view(), config)?;
        rsold = rsnew;
    }

    Err(LinalgError::ComputationError(
        "Conjugate gradient failed to converge".to_string(),
    ))
}

/// Parallel SVD decomposition
///
/// Computes a singular value decomposition by first performing a parallel QR
/// decomposition and then applying a serial SVD to the smaller R factor.
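///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default` and singular values sorted descending):
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// let a = Array2::from_shape_vec((2, 2), vec![3.0, 0.0, 0.0, 2.0]).unwrap();
/// let config = WorkerConfig::default();
/// let (_u, s, _vt) = parallel_svd(&a.view(), &config).unwrap();
/// // Singular values of a diagonal matrix are its diagonal magnitudes.
/// assert!((s[0] - 3.0_f64).abs() < 1e-10);
/// ```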
pub fn parallel_svd<F>(
    matrix: &ArrayView2<F>,
    config: &WorkerConfig,
) -> LinalgResult<(
    scirs2_core::ndarray::Array2<F>,
    scirs2_core::ndarray::Array1<F>,
    scirs2_core::ndarray::Array2<F>,
)>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    let datasize = m * n;

    if !adaptive::should_use_parallel(datasize, config) {
        return crate::decomposition::svd(&matrix.view(), false, None);
    }

    config.apply();

    // For now, use QR decomposition as a first step.
    // This is a simplified parallel SVD - a full implementation would use
    // more sophisticated algorithms like Jacobi SVD or divide-and-conquer
    let (q, r) = parallel_qr(matrix, config)?;

    // Apply SVD to the smaller R matrix (serial for numerical stability)
    let (u_r, s, vt) = crate::decomposition::svd(&r.view(), false, None)?;

    // U = Q[:, ..r.nrows()] * U_r: only the leading columns of Q contribute,
    // since R was reduced to its leading rows
    let q_reduced = q
        .slice(scirs2_core::ndarray::s![.., ..r.nrows()])
        .to_owned();
    let u = parallel_gemm(&q_reduced.view(), &u_r.view(), config)?;

    Ok((u, s, vt))
}

/// Parallel GMRES (Generalized Minimal Residual) solver
///
/// Implements parallel GMRES for solving non-symmetric linear systems.
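///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// // Non-symmetric system A x = b with solution x = [1, 1].
/// let a = Array2::from_shape_vec((2, 2), vec![2.0, 1.0, 0.0, 3.0]).unwrap();
/// let b = Array1::from_vec(vec![3.0, 3.0]);
/// let config = WorkerConfig::default();
/// let x = parallel_gmres(&a.view(), &b.view(), 100, 1e-10, 2, &config).unwrap();
/// assert!((x[0] - 1.0_f64).abs() < 1e-6 && (x[1] - 1.0_f64).abs() < 1e-6);
/// ```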
pub fn parallel_gmres<F>(
    matrix: &ArrayView2<F>,
    b: &ArrayView1<F>,
    max_iter: usize,
    tolerance: F,
    restart: usize,
    config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + std::fmt::Debug
        + std::fmt::Display
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "GMRES requires square matrix".to_string(),
        ));
    }
    if n != b.len() {
        return Err(LinalgError::ShapeError(
            "Matrix and vector dimensions incompatible".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        // Fall back to serial GMRES - use the iterative solver version
        let options = crate::solvers::iterative::IterativeSolverOptions {
            max_iterations: max_iter,
            tolerance,
            verbose: false,
            restart: Some(restart),
        };
        let result = crate::solvers::iterative::gmres(matrix, b, None, &options)?;
        return Ok(result.solution);
    }

    config.apply();

    let mut x = Array1::zeros(n);
    let restart = restart.min(n);

    for _outer in 0..max_iter {
        // Compute initial residual
        let ax = parallel_matvec(matrix, &x.view(), config)?;
        let r = b - &ax;
        let beta = vector_ops::parallel_norm(&r.view(), config)?;

        if beta < tolerance {
            return Ok(x);
        }

        // Initialize Krylov subspace
        let mut v = vec![r / beta];
        let mut h = scirs2_core::ndarray::Array2::<F>::zeros((restart + 1, restart));

        // Arnoldi iteration
        for j in 0..restart {
            // w = A * v[j]
            let w = parallel_matvec(matrix, &v[j].view(), config)?;

            // Modified Gram-Schmidt orthogonalization against the existing basis
            let mut w_new = w;
            for i in 0..=j {
                h[[i, j]] = vector_ops::parallel_dot(&w_new.view(), &v[i].view(), config)?;
                w_new = vector_ops::parallel_axpy(-h[[i, j]], &v[i].view(), &w_new.view(), config)?;
            }

            h[[j + 1, j]] = vector_ops::parallel_norm(&w_new.view(), config)?;

            if h[[j + 1, j]] < F::epsilon() {
                break;
            }

            v.push(w_new / h[[j + 1, j]]);
        }

        // Solve the projected system (serial for numerical stability)
        let k = v.len() - 1;
        let h_sub = h.slice(scirs2_core::ndarray::s![..=k, ..k]).to_owned();
        let mut g = Array1::zeros(k + 1);
        g[0] = beta;

        // Back substitution on the upper-triangular part of H (the subdiagonal
        // entries are dropped here; the true residual is re-checked below)
        let mut y = Array1::zeros(k);
        for i in (0..k).rev() {
            let mut sum = g[i];
            for j in (i + 1)..k {
                sum -= h_sub[[i, j]] * y[j];
            }
            y[i] = sum / h_sub[[i, i]];
        }

        // Update solution
        for i in 0..k {
            x = vector_ops::parallel_axpy(y[i], &v[i].view(), &x.view(), config)?;
        }

        // Check residual
        let ax = parallel_matvec(matrix, &x.view(), config)?;
        let r = b - &ax;
        let residual_norm = vector_ops::parallel_norm(&r.view(), config)?;

        if residual_norm < tolerance {
            return Ok(x);
        }
    }

    Err(LinalgError::ComputationError(
        "GMRES failed to converge".to_string(),
    ))
}

/// Parallel BiCGSTAB (Biconjugate Gradient Stabilized) solver
///
/// Implements parallel BiCGSTAB for solving non-symmetric linear systems.
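///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// // Small linear system A x = b with solution x = [1, 2].
/// let a = Array2::from_shape_vec((2, 2), vec![3.0, 1.0, 1.0, 2.0]).unwrap();
/// let b = Array1::from_vec(vec![5.0, 5.0]);
/// let config = WorkerConfig::default();
/// let x = parallel_bicgstab(&a.view(), &b.view(), 100, 1e-10, &config).unwrap();
/// assert!((x[0] - 1.0_f64).abs() < 1e-6 && (x[1] - 2.0_f64).abs() < 1e-6);
/// ```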
pub fn parallel_bicgstab<F>(
    matrix: &ArrayView2<F>,
    b: &ArrayView1<F>,
    max_iter: usize,
    tolerance: F,
    config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "BiCGSTAB requires square matrix".to_string(),
        ));
    }
    if n != b.len() {
        return Err(LinalgError::ShapeError(
            "Matrix and vector dimensions incompatible".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        return crate::iterative_solvers::bicgstab(
            &matrix.view(),
            &b.view(),
            max_iter,
            tolerance,
            None,
        );
    }

    config.apply();

    // Initialize
    let mut x = Array1::zeros(n);
    let ax = parallel_matvec(matrix, &x.view(), config)?;
    let mut r = b - &ax;
    let r_hat = r.clone();

    let mut rho = F::one();
    let mut alpha = F::one();
    let mut omega = F::one();

    let mut v = Array1::zeros(n);
    let mut p = Array1::zeros(n);

    for _iter in 0..max_iter {
        let rho_new = vector_ops::parallel_dot(&r_hat.view(), &r.view(), config)?;

        if rho_new.abs() < F::epsilon() {
            return Err(LinalgError::ComputationError(
                "BiCGSTAB breakdown: rho = 0".to_string(),
            ));
        }

        let beta = (rho_new / rho) * (alpha / omega);

        // p = r + beta * (p - omega * v)
        let temp = vector_ops::parallel_axpy(-omega, &v.view(), &p.view(), config)?;
        p = vector_ops::parallel_axpy(beta, &temp.view(), &r.view(), config)?;

        // v = A * p
        v = parallel_matvec(matrix, &p.view(), config)?;

        alpha = rho_new / vector_ops::parallel_dot(&r_hat.view(), &v.view(), config)?;

        // s = r - alpha * v
        let s = vector_ops::parallel_axpy(-alpha, &v.view(), &r.view(), config)?;

        // Check convergence
        let s_norm = vector_ops::parallel_norm(&s.view(), config)?;
        if s_norm < tolerance {
            x = vector_ops::parallel_axpy(alpha, &p.view(), &x.view(), config)?;
            return Ok(x);
        }

        // t = A * s
        let t = parallel_matvec(matrix, &s.view(), config)?;

        omega = vector_ops::parallel_dot(&t.view(), &s.view(), config)?
            / vector_ops::parallel_dot(&t.view(), &t.view(), config)?;

        // x = x + alpha * p + omega * s
        x = vector_ops::parallel_axpy(alpha, &p.view(), &x.view(), config)?;
        x = vector_ops::parallel_axpy(omega, &s.view(), &x.view(), config)?;

        // r = s - omega * t
        r = vector_ops::parallel_axpy(-omega, &t.view(), &s.view(), config)?;

        // Check convergence
        let r_norm = vector_ops::parallel_norm(&r.view(), config)?;
        if r_norm < tolerance {
            return Ok(x);
        }

        rho = rho_new;
    }

    Err(LinalgError::ComputationError(
        "BiCGSTAB failed to converge".to_string(),
    ))
}

/// Parallel Jacobi method
///
/// Implements parallel Jacobi iteration for solving linear systems.
/// This method is particularly well-suited for parallel execution.
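///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// // Diagonally dominant system A x = b with solution x = [1, 1].
/// let a = Array2::from_shape_vec((2, 2), vec![4.0, 1.0, 1.0, 3.0]).unwrap();
/// let b = Array1::from_vec(vec![5.0, 4.0]);
/// let config = WorkerConfig::default();
/// let x = parallel_jacobi(&a.view(), &b.view(), 1000, 1e-10, &config).unwrap();
/// assert!((x[0] - 1.0_f64).abs() < 1e-6);
/// ```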
pub fn parallel_jacobi<F>(
    matrix: &ArrayView2<F>,
    b: &ArrayView1<F>,
    max_iter: usize,
    tolerance: F,
    config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "Jacobi method requires square matrix".to_string(),
        ));
    }
    if n != b.len() {
        return Err(LinalgError::ShapeError(
            "Matrix and vector dimensions incompatible".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        return crate::iterative_solvers::jacobi_method(
            &matrix.view(),
            &b.view(),
            max_iter,
            tolerance,
            None,
        );
    }

    config.apply();

    // Extract diagonal
    let diag: Vec<F> = (0..n)
        .into_par_iter()
        .map(|i| {
            if matrix[[i, i]].abs() < F::epsilon() {
                F::one() // Avoid division by zero
            } else {
                matrix[[i, i]]
            }
        })
        .collect();

    let mut x = Array1::zeros(n);

    for _iter in 0..max_iter {
        // Parallel update: x_new[i] = (b[i] - sum(A[i,j]*x[j] for j != i)) / A[i,i]
        let x_new_vec: Vec<F> = (0..n)
            .into_par_iter()
            .map(|i| {
                let mut sum = b[i];
                for j in 0..n {
                    if i != j {
                        sum -= matrix[[i, j]] * x[j];
                    }
                }
                sum / diag[i]
            })
            .collect();

        let x_new = Array1::from_vec(x_new_vec);

        // Check convergence
        let diff = &x_new - &x;
        let error = vector_ops::parallel_norm(&diff.view(), config)?;

        if error < tolerance {
            return Ok(x_new);
        }

        x = x_new;
    }

    Err(LinalgError::ComputationError(
        "Jacobi method failed to converge".to_string(),
    ))
}

/// Parallel SOR (Successive Over-Relaxation) method
///
/// Implements a modified parallel SOR using red-black ordering
/// to enable parallel updates.
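///
/// # Examples
///
/// A minimal sketch (illustrative only, not compiled as a doctest; assumes
/// `WorkerConfig` implements `Default`):
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
///
/// // Diagonally dominant system A x = b with solution x = [1, 1];
/// // omega = 1.0 reduces SOR to Gauss-Seidel.
/// let a = Array2::from_shape_vec((2, 2), vec![4.0, 1.0, 1.0, 3.0]).unwrap();
/// let b = Array1::from_vec(vec![5.0, 4.0]);
/// let config = WorkerConfig::default();
/// let x = parallel_sor(&a.view(), &b.view(), 1.0, 1000, 1e-10, &config).unwrap();
/// assert!((x[0] - 1.0_f64).abs() < 1e-6);
/// ```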
pub fn parallel_sor<F>(
    matrix: &ArrayView2<F>,
    b: &ArrayView1<F>,
    omega: F,
    max_iter: usize,
    tolerance: F,
    config: &WorkerConfig,
) -> LinalgResult<Array1<F>>
where
    F: Float
        + Send
        + Sync
        + Zero
        + Sum
        + One
        + NumAssign
        + scirs2_core::ndarray::ScalarOperand
        + 'static,
{
    let (m, n) = matrix.dim();
    if m != n {
        return Err(LinalgError::ShapeError(
            "SOR requires square matrix".to_string(),
        ));
    }
    if n != b.len() {
        return Err(LinalgError::ShapeError(
            "Matrix and vector dimensions incompatible".to_string(),
        ));
    }

    if omega <= F::zero() || omega >= F::from(2.0).unwrap() {
        return Err(LinalgError::InvalidInputError(
            "Relaxation parameter omega must be in (0, 2)".to_string(),
        ));
    }

    let datasize = m * n;
    if !adaptive::should_use_parallel(datasize, config) {
        return crate::iterative_solvers::successive_over_relaxation(
            &matrix.view(),
            &b.view(),
            omega,
            max_iter,
            tolerance,
            None,
        );
    }

    config.apply();

    let mut x = Array1::zeros(n);

    for _iter in 0..max_iter {
        let x_old = x.clone();

        // Red-black ordering for parallel updates
        // First update "red" points (even indices)
        let red_updates: Vec<(usize, F)> = (0..n)
            .into_par_iter()
            .filter(|&i| i % 2 == 0)
            .map(|i| {
                let mut sum = b[i];
                for j in 0..n {
                    if i != j {
                        sum -= matrix[[i, j]] * x_old[j];
                    }
                }
                let x_gs = sum / matrix[[i, i]];
                let x_new = (F::one() - omega) * x_old[i] + omega * x_gs;
                (i, x_new)
            })
            .collect();

        // Apply red updates
        for (i, val) in red_updates {
            x[i] = val;
        }

        // Then update "black" points (odd indices)
        let black_updates: Vec<(usize, F)> = (0..n)
            .into_par_iter()
            .filter(|&i| i % 2 == 1)
            .map(|i| {
                let mut sum = b[i];
                for j in 0..n {
                    if i != j {
                        sum -= matrix[[i, j]] * x[j];
                    }
                }
                let x_gs = sum / matrix[[i, i]];
                let x_new = (F::one() - omega) * x_old[i] + omega * x_gs;
                (i, x_new)
            })
            .collect();

        // Apply black updates
        for (i, val) in black_updates {
            x[i] = val;
        }

        // Check convergence
        let ax = parallel_matvec(matrix, &x.view(), config)?;
        let r = b - &ax;
        let error = vector_ops::parallel_norm(&r.view(), config)?;

        if error < tolerance {
            return Ok(x);
        }
    }

    Err(LinalgError::ComputationError(
        "SOR failed to converge".to_string(),
    ))
}