// scirs2_sparse/gpu_preconditioner.rs
1//! Mixed CPU/GPU preconditioning for sparse linear systems.
2//!
3//! ILU(0) factorization runs on CPU; SpMV applies the preconditioner.
4//! The mixed strategy exploits CPU's suitability for sequential triangular solves
5//! and GPU's throughput for parallel sparse matrix-vector products.
6//!
7//! # References
8//!
9//! - Saad, Y. (2003). *Iterative Methods for Sparse Linear Systems*, 2nd ed. SIAM. §10.3.
10//! - Benzi, M. (2002). Preconditioning techniques for large linear systems. J. Comput. Phys.
11
12use crate::error::{SparseError, SparseResult};
13use scirs2_core::ndarray::Array1;
14
15// ---------------------------------------------------------------------------
16// Lightweight f64 CSR used internally by this module.
17// We deliberately keep this separate from the generic `csr::CsrMatrix<T>` to
18// avoid the heavyweight trait bounds and stay self-contained.
19// ---------------------------------------------------------------------------
20
/// Sparse matrix in CSR format (row-major compressed), f64 values only.
///
/// This is a lightweight internal type used by the GPU preconditioner.
/// For the general sparse matrix API use [`crate::csr::CsrMatrix`].
///
/// Invariants (assumed, not validated at construction): `row_ptr` has
/// `nrows + 1` non-decreasing entries starting at 0, and
/// `col_idx.len() == values.len() == row_ptr[nrows]`.
#[derive(Debug, Clone)]
pub struct GpuPrecondCsr {
    /// Number of rows.
    pub nrows: usize,
    /// Number of columns.
    pub ncols: usize,
    /// Row pointers: `row_ptr[i]..row_ptr[i+1]` indexes row `i`.
    pub row_ptr: Vec<usize>,
    /// Column indices of non-zeros.
    pub col_idx: Vec<usize>,
    /// Non-zero values.
    pub values: Vec<f64>,
}
38
39impl GpuPrecondCsr {
40    /// Create from raw CSR data.
41    pub fn new(
42        nrows: usize,
43        ncols: usize,
44        row_ptr: Vec<usize>,
45        col_idx: Vec<usize>,
46        values: Vec<f64>,
47    ) -> Self {
48        Self {
49            nrows,
50            ncols,
51            row_ptr,
52            col_idx,
53            values,
54        }
55    }
56
57    /// Create from a dense `ndarray::Array2<f64>` (useful for testing).
58    pub fn from_dense(dense: &scirs2_core::ndarray::Array2<f64>) -> Self {
59        let (m, n) = (dense.nrows(), dense.ncols());
60        let mut row_ptr = vec![0usize];
61        let mut col_idx = Vec::new();
62        let mut values = Vec::new();
63        for i in 0..m {
64            for j in 0..n {
65                let v = dense[[i, j]];
66                if v.abs() > 1e-14 {
67                    col_idx.push(j);
68                    values.push(v);
69                }
70            }
71            row_ptr.push(col_idx.len());
72        }
73        Self {
74            nrows: m,
75            ncols: n,
76            row_ptr,
77            col_idx,
78            values,
79        }
80    }
81
82    /// Sparse matrix-vector product `y = A * x`.
83    pub fn spmv(&self, x: &Array1<f64>) -> Array1<f64> {
84        let mut y = Array1::zeros(self.nrows);
85        for i in 0..self.nrows {
86            let mut sum = 0.0_f64;
87            for k in self.row_ptr[i]..self.row_ptr[i + 1] {
88                sum += self.values[k] * x[self.col_idx[k]];
89            }
90            y[i] = sum;
91        }
92        y
93    }
94
95    /// Number of non-zeros.
96    pub fn nnz(&self) -> usize {
97        self.values.len()
98    }
99}
100
101// ---------------------------------------------------------------------------
102// ILU(0) factorization
103// ---------------------------------------------------------------------------
104
/// ILU(0) preconditioner — zero fill-in incomplete LU factorization.
///
/// Computes factors L and U such that LU ≈ A on the same sparsity pattern
/// as A.  The factorization proceeds row-by-row, updating each row with the
/// already-factored rows above it (IKJ ordering).
pub struct Ilu0Preconditioner {
    /// Matrix dimension (the input is required to be square).
    n: usize,
    /// L factor values stored in the combined `row_ptr`/`col_idx` layout.
    /// Entries with `col_idx[k] < i` are L; `col_idx[k] == i` is L diagonal
    /// (always 1.0); entries with `col_idx[k] > i` are 0.0 here.
    l_values: Vec<f64>,
    /// U factor values stored in the same layout.
    /// Entries with `col_idx[k] >= i` are U; `col_idx[k] < i` is 0.0 here.
    u_values: Vec<f64>,
    /// CSR row pointers, copied from the input matrix.
    row_ptr: Vec<usize>,
    /// CSR column indices, copied from the input matrix.
    col_idx: Vec<usize>,
}
121
impl Ilu0Preconditioner {
    /// Compute ILU(0) factorization of `a` (must be square with non-zero diagonal).
    ///
    /// The column indices within each row are assumed to be sorted ascending;
    /// the lock-step merge in the elimination loop relies on this ordering.
    ///
    /// # Errors
    ///
    /// Returns [`SparseError::ValueError`] if the matrix is non-square, if any
    /// row lacks a stored diagonal entry, or if a zero pivot is encountered
    /// during factorization.
    pub fn compute(a: &GpuPrecondCsr) -> SparseResult<Self> {
        if a.nrows != a.ncols {
            return Err(SparseError::ValueError(
                "ILU(0) requires a square matrix".to_string(),
            ));
        }
        let n = a.nrows;
        if n == 0 {
            // Empty matrix: trivially factored, apply() will be a no-op.
            return Ok(Self {
                n,
                l_values: Vec::new(),
                u_values: Vec::new(),
                row_ptr: vec![0],
                col_idx: Vec::new(),
            });
        }

        // Working copy of values; we overwrite in-place during elimination.
        let mut values = a.values.clone();
        let row_ptr = a.row_ptr.clone();
        let col_idx = a.col_idx.clone();

        // Locate diagonal entries for each row (needed for pivot access).
        // usize::MAX marks "not found" until the scan below fills it in.
        let mut diag_idx = vec![usize::MAX; n];
        for i in 0..n {
            for k in row_ptr[i]..row_ptr[i + 1] {
                if col_idx[k] == i {
                    diag_idx[i] = k;
                    break;
                }
            }
            if diag_idx[i] == usize::MAX {
                return Err(SparseError::ValueError(format!(
                    "ILU(0): missing diagonal entry in row {i}"
                )));
            }
        }

        // Row-by-row ILU(0) Gaussian elimination (keeps original sparsity).
        // Row 0 needs no elimination, so start at row 1.
        for i in 1..n {
            for k in row_ptr[i]..row_ptr[i + 1] {
                let j = col_idx[k];
                if j >= i {
                    break; // columns are sorted; strictly lower part is done
                }
                let u_jj = values[diag_idx[j]];
                if u_jj.abs() < 1e-300 {
                    return Err(SparseError::ValueError(format!(
                        "ILU(0): zero pivot at diagonal ({j},{j})"
                    )));
                }
                // L[i,j] = A[i,j] / U[j,j]
                values[k] /= u_jj;
                let l_ij = values[k];

                // Eliminate: A[i, col] -= L[i,j] * U[j, col] for col > j
                // walking row i and row j in lock-step (both sorted by column).
                // Updates only positions already present in row i — this is
                // exactly the "zero fill-in" property of ILU(0).
                let mut ki = k + 1; // position in row i, col > j
                let mut kj = diag_idx[j] + 1; // position in row j, col > j
                while ki < row_ptr[i + 1] && kj < row_ptr[j + 1] {
                    let ci = col_idx[ki];
                    let cj = col_idx[kj];
                    match ci.cmp(&cj) {
                        std::cmp::Ordering::Equal => {
                            // Pattern match: both rows have column ci=cj; subtract.
                            values[ki] -= l_ij * values[kj];
                            ki += 1;
                            kj += 1;
                        }
                        std::cmp::Ordering::Less => ki += 1,
                        std::cmp::Ordering::Greater => kj += 1,
                    }
                }
            }
        }

        // Split into L (lower, unit diagonal) and U (upper + diagonal).
        // Both factors share the input's row_ptr/col_idx; positions belonging
        // to the "other" factor stay 0.0 in each array.
        let nnz = a.nnz();
        let mut l_values = vec![0.0_f64; nnz];
        let mut u_values = vec![0.0_f64; nnz];

        for i in 0..n {
            for k in row_ptr[i]..row_ptr[i + 1] {
                let j = col_idx[k];
                if j < i {
                    l_values[k] = values[k]; // L factor
                } else if j == i {
                    l_values[k] = 1.0; // unit diagonal for L
                    u_values[k] = values[k]; // diagonal of U
                } else {
                    u_values[k] = values[k]; // strictly upper part of U
                }
            }
        }

        Ok(Self {
            n,
            l_values,
            u_values,
            row_ptr,
            col_idx,
        })
    }

    /// Apply preconditioner: solve `(LU) z = r`.
    ///
    /// Performs forward solve `Ly = r` then backward solve `Uz = y`.
    pub fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
        // Forward solve: L y = r  (unit lower triangular).
        // Rows are processed in ascending order, so y[j] for j < i is final
        // by the time row i subtracts it.
        let mut y = r.clone();
        for i in 0..self.n {
            for k in self.row_ptr[i]..self.row_ptr[i + 1] {
                let j = self.col_idx[k];
                if j < i {
                    let ly = self.l_values[k] * y[j];
                    y[i] -= ly;
                }
            }
        }

        // Backward solve: U z = y  (upper triangular with explicit diagonal),
        // processing rows from n-1 down to 0.
        let mut z = y;
        for ii in 0..self.n {
            let i = self.n - 1 - ii;
            let mut diag_val = 1.0_f64;
            for k in self.row_ptr[i]..self.row_ptr[i + 1] {
                let j = self.col_idx[k];
                if j > i {
                    let uz = self.u_values[k] * z[j];
                    z[i] -= uz;
                } else if j == i {
                    diag_val = self.u_values[k];
                }
            }
            // Best-effort: a (near-)zero diagonal leaves z[i] undivided rather
            // than producing inf/NaN.  compute() already rejects exact zero
            // pivots, so this only triggers on extreme cancellation.
            if diag_val.abs() > 1e-300 {
                z[i] /= diag_val;
            }
        }
        z
    }
}
270
271// ---------------------------------------------------------------------------
272// Mixed preconditioned CG
273// ---------------------------------------------------------------------------
274
/// Mixed CPU/GPU preconditioned conjugate gradient solver.
///
/// The preconditioner (ILU(0)) runs entirely on CPU; SpMV is the hot path
/// that would execute on GPU in a real deployment.  In this pure-Rust
/// implementation SpMV uses [`GpuPrecondCsr::spmv`], which simulates the GPU
/// compute path without an actual GPU dependency.
pub struct MixedPreconditionedCg {
    /// Pre-computed ILU(0) factors, applied on the CPU every CG iteration.
    preconditioner: Ilu0Preconditioner,
    /// Hard cap on the number of CG iterations.
    max_iter: usize,
    /// Convergence threshold on the 2-norm of the residual.
    tol: f64,
}
286
287impl MixedPreconditionedCg {
288    /// Create a new solver, pre-computing ILU(0) from matrix `a`.
289    ///
290    /// # Errors
291    ///
292    /// Propagates any error from [`Ilu0Preconditioner::compute`].
293    pub fn new(a: &GpuPrecondCsr, max_iter: usize, tol: f64) -> SparseResult<Self> {
294        let preconditioner = Ilu0Preconditioner::compute(a)?;
295        Ok(Self {
296            preconditioner,
297            max_iter,
298            tol,
299        })
300    }
301
302    /// Solve `Ax = b` using preconditioned CG (Fletcher-Reeves with ILU(0)).
303    ///
304    /// Returns `(x, iterations_used)`.
305    ///
306    /// # Errors
307    ///
308    /// Returns [`SparseError::ValueError`] if `b.len() != a.nrows`.
309    pub fn solve(&self, a: &GpuPrecondCsr, b: &Array1<f64>) -> SparseResult<(Array1<f64>, usize)> {
310        let n = b.len();
311        if n != a.nrows {
312            return Err(SparseError::ValueError(format!(
313                "solve: b length {n} does not match matrix nrows {}",
314                a.nrows
315            )));
316        }
317
318        // Initial residual r = b - A*x  (start from zero)
319        let mut x = Array1::zeros(n);
320        let ax0 = a.spmv(&x);
321        let mut r: Array1<f64> = b - &ax0;
322        let mut z = self.preconditioner.apply(&r);
323        let mut p = z.clone();
324        let mut rz = r.dot(&z);
325
326        for iter in 0..self.max_iter {
327            let ap = a.spmv(&p);
328            let pap = p.dot(&ap);
329            if pap.abs() < 1e-300 {
330                break;
331            }
332            let alpha = rz / pap;
333            x = &x + &(&p * alpha);
334            r = &r - &(&ap * alpha);
335
336            let res_norm = r.dot(&r).sqrt();
337            if res_norm < self.tol {
338                return Ok((x, iter + 1));
339            }
340
341            z = self.preconditioner.apply(&r);
342            let rz_new = r.dot(&z);
343            if rz.abs() < 1e-300 {
344                break;
345            }
346            let beta = rz_new / rz;
347            p = &z + &(&p * beta);
348            rz = rz_new;
349        }
350
351        Ok((x, self.max_iter))
352    }
353}
354
355// ---------------------------------------------------------------------------
356// Tests
357// ---------------------------------------------------------------------------
358
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// Shared builder: n×n tridiagonal matrix with -1.0 off-diagonals and
    /// the given diagonal value.
    fn build_tridiagonal(n: usize, diag: f64) -> GpuPrecondCsr {
        let mut row_ptr = Vec::with_capacity(n + 1);
        row_ptr.push(0usize);
        let mut col_idx = Vec::new();
        let mut values = Vec::new();
        for i in 0..n {
            if let Some(left) = i.checked_sub(1) {
                col_idx.push(left);
                values.push(-1.0_f64);
            }
            col_idx.push(i);
            values.push(diag);
            if i + 1 < n {
                col_idx.push(i + 1);
                values.push(-1.0_f64);
            }
            row_ptr.push(col_idx.len());
        }
        GpuPrecondCsr::new(n, n, row_ptr, col_idx, values)
    }

    /// Build a tridiagonal matrix [-1, 2, -1] of size n×n.
    fn tridiag(n: usize) -> GpuPrecondCsr {
        build_tridiagonal(n, 2.0)
    }

    /// Build a diagonally dominant SPD matrix of size n×n for CG testing:
    /// diagonal n + 2.0 guarantees strict diagonal dominance.
    fn spd_matrix(n: usize) -> GpuPrecondCsr {
        build_tridiagonal(n, n as f64 + 2.0)
    }

    #[test]
    fn test_spmv() {
        let n = 4;
        let a = tridiag(n);
        let y = a.spmv(&Array1::ones(n));
        // For the [-1, 2, -1] stencil applied to all-ones: interior rows
        // cancel to 0, the two boundary rows leave 1.
        let expected = [1.0, 0.0, 0.0, 1.0];
        for (i, &want) in expected.iter().enumerate() {
            assert!((y[i] - want).abs() < 1e-12, "y[{i}]={}", y[i]);
        }
    }

    #[test]
    fn test_ilu0_compute() {
        let ilu = Ilu0Preconditioner::compute(&tridiag(4))
            .expect("ILU(0) on tridiagonal must succeed");
        // Sanity-check dimensions of the resulting factorization.
        assert_eq!(ilu.n, 4);
        assert_eq!(ilu.row_ptr.len(), 5);
    }

    #[test]
    fn test_ilu0_apply() {
        let ilu = Ilu0Preconditioner::compute(&tridiag(6)).expect("ILU(0) should succeed");
        let z = ilu.apply(&Array1::from(vec![1.0_f64; 6]));
        // Result must have the right shape and contain no NaN/inf.
        assert_eq!(z.len(), 6);
        for (i, &v) in z.iter().enumerate() {
            assert!(v.is_finite(), "z[{i}] is not finite: {v}");
        }
    }

    #[test]
    fn test_from_dense_roundtrip() {
        let mut dense = Array2::zeros((3, 3));
        let entries: [(usize, usize, f64); 7] = [
            (0, 0, 4.0),
            (0, 1, -1.0),
            (1, 0, -1.0),
            (1, 1, 4.0),
            (1, 2, -1.0),
            (2, 1, -1.0),
            (2, 2, 4.0),
        ];
        for &(i, j, v) in &entries {
            dense[[i, j]] = v;
        }

        let a = GpuPrecondCsr::from_dense(&dense);
        assert_eq!(a.nrows, 3);
        assert_eq!(a.ncols, 3);
        assert_eq!(a.nnz(), 7);

        let y = a.spmv(&Array1::from(vec![1.0, 1.0, 1.0]));
        // Row sums: [4-1, -1+4-1, -1+4] = [3, 2, 3].
        for (i, want) in [3.0, 2.0, 3.0].into_iter().enumerate() {
            assert!((y[i] - want).abs() < 1e-12, "y[{i}]={}", y[i]);
        }
    }

    #[test]
    fn test_preconditioned_cg_solve() {
        let n = 10;
        let a = spd_matrix(n);
        // Manufacture b = A * x_exact with x_exact = [1, 2, ..., n].
        let x_exact = Array1::from_iter((1..=n).map(|i| i as f64));
        let b = a.spmv(&x_exact);

        let solver = MixedPreconditionedCg::new(&a, 200, 1e-10)
            .expect("MixedPreconditionedCg construction must succeed");
        let (x_sol, iters) = solver.solve(&a, &b).expect("PCG solve must succeed");

        assert!(iters <= 200, "solver used more iterations than max");
        // The recovered solution must match x_exact to 1e-6 in the inf norm.
        for i in 0..n {
            let err = (x_sol[i] - x_exact[i]).abs();
            assert!(
                err < 1e-6,
                "x_sol[{i}]={} vs x_exact[{i}]={}",
                x_sol[i],
                x_exact[i]
            );
        }
    }
}
496}