use rayon::prelude::*;
use std::sync::Mutex;
#[derive(Debug, Clone, Default)]
pub struct CsrMatrix {
pub nrows: usize,
pub ncols: usize,
pub row_offsets: Vec<usize>,
pub col_indices: Vec<usize>,
pub values: Vec<f64>,
}
impl CsrMatrix {
pub fn new_from_pattern(
nrows: usize,
ncols: usize,
row_offsets: Vec<usize>,
col_indices: Vec<usize>,
) -> Self {
let nnz = col_indices.len();
Self {
nrows,
ncols,
row_offsets,
col_indices,
values: vec![0.0; nnz],
}
}
pub fn identity(n: usize) -> Self {
Self {
nrows: n,
ncols: n,
row_offsets: (0..=n).collect(),
col_indices: (0..n).collect(),
values: vec![1.0; n],
}
}
pub fn nnz(&self) -> usize {
self.values.len()
}
pub fn spmv_par(&self, x: &[f64], y: &mut [f64]) {
debug_assert_eq!(x.len(), self.ncols);
debug_assert_eq!(y.len(), self.nrows);
y.par_iter_mut().enumerate().for_each(|(i, yi)| {
let row_start = self.row_offsets[i];
let row_end = self.row_offsets[i + 1];
let mut sum = 0.0;
for k in row_start..row_end {
sum += self.values[k] * x[self.col_indices[k]];
}
*yi = sum;
});
}
pub fn spmv(&self, x: &[f64], y: &mut [f64]) {
debug_assert_eq!(x.len(), self.ncols);
debug_assert_eq!(y.len(), self.nrows);
for (i, yi) in y.iter_mut().enumerate().take(self.nrows) {
let row_start = self.row_offsets[i];
let row_end = self.row_offsets[i + 1];
let mut sum = 0.0;
for k in row_start..row_end {
sum += self.values[k] * x[self.col_indices[k]];
}
*yi = sum;
}
}
pub fn diagonal_preconditioner(&self) -> Vec<f64> {
let mut diag = vec![1.0f64; self.nrows];
for (i, diag_i) in diag.iter_mut().enumerate().take(self.nrows) {
for k in self.row_offsets[i]..self.row_offsets[i + 1] {
if self.col_indices[k] == i {
let d = self.values[k];
if d.abs() > 1e-15 {
*diag_i = 1.0 / d;
}
break;
}
}
}
diag
}
pub fn spmv_chunked(&self, x: &[f64], y: &mut [f64], chunk_size: usize) {
debug_assert_eq!(x.len(), self.ncols);
debug_assert_eq!(y.len(), self.nrows);
let chunk_size = chunk_size.max(1);
y.par_chunks_mut(chunk_size)
.enumerate()
.for_each(|(chunk_idx, y_chunk)| {
let row_start = chunk_idx * chunk_size;
for (k, yi) in y_chunk.iter_mut().enumerate() {
let row = row_start + k;
let rs = self.row_offsets[row];
let re = self.row_offsets[row + 1];
let mut sum = 0.0;
for j in rs..re {
sum += self.values[j] * x[self.col_indices[j]];
}
*yi = sum;
}
});
}
}
#[derive(Debug, Clone)]
pub struct AssemblyTask {
pub global_dofs: Vec<usize>,
pub ke: Vec<f64>,
}
impl AssemblyTask {
pub fn new(global_dofs: Vec<usize>, ke: Vec<f64>) -> Self {
let ndof = global_dofs.len();
debug_assert_eq!(ke.len(), ndof * ndof);
Self { global_dofs, ke }
}
pub fn ndof(&self) -> usize {
self.global_dofs.len()
}
}
#[derive(Debug, Default)]
pub struct ParallelAssembler {
pub ndofs: usize,
}
impl ParallelAssembler {
pub fn new(ndofs: usize) -> Self {
Self { ndofs }
}
pub fn assemble(&self, tasks: &[AssemblyTask]) -> CsrMatrix {
let mut row_cols: Vec<std::collections::BTreeSet<usize>> =
vec![std::collections::BTreeSet::new(); self.ndofs];
for task in tasks {
for &row_dof in &task.global_dofs {
for &col_dof in &task.global_dofs {
row_cols[row_dof].insert(col_dof);
}
}
}
let mut row_offsets = vec![0usize; self.ndofs + 1];
let mut col_indices: Vec<usize> = Vec::new();
for (i, cols) in row_cols.iter().enumerate() {
row_offsets[i + 1] = row_offsets[i] + cols.len();
col_indices.extend(cols.iter().copied());
}
let nnz = col_indices.len();
let values = vec![0.0f64; nnz];
let row_col_to_csr: Vec<std::collections::HashMap<usize, usize>> = row_cols
.iter()
.enumerate()
.map(|(i, cols)| {
let base = row_offsets[i];
cols.iter()
.enumerate()
.map(|(j, &c)| (c, base + j))
.collect()
})
.collect();
let values_locked: Vec<Mutex<f64>> = values.into_iter().map(Mutex::new).collect();
tasks.par_iter().for_each(|task| {
let ndof = task.ndof();
for (li, &row) in task.global_dofs.iter().enumerate() {
for (lj, &col) in task.global_dofs.iter().enumerate() {
let ke_val = task.ke[li * ndof + lj];
if let Some(&csr_idx) = row_col_to_csr[row].get(&col) {
let mut guard = values_locked[csr_idx]
.lock()
.unwrap_or_else(|e| e.into_inner());
*guard += ke_val;
}
}
}
});
let values: Vec<f64> = values_locked
.into_iter()
.map(|m| {
m.into_inner()
.expect("mutex not poisoned after parallel assembly")
})
.collect();
CsrMatrix {
nrows: self.ndofs,
ncols: self.ndofs,
row_offsets,
col_indices,
values,
}
}
pub fn assemble_rhs(
&self,
element_dofs: &[Vec<usize>],
element_forces: &[Vec<f64>],
) -> Vec<f64> {
let rhs_locked: Vec<Mutex<f64>> = (0..self.ndofs).map(|_| Mutex::new(0.0f64)).collect();
element_dofs
.par_iter()
.zip(element_forces.par_iter())
.for_each(|(dofs, forces)| {
for (&dof, &f) in dofs.iter().zip(forces.iter()) {
*rhs_locked[dof].lock().unwrap_or_else(|e| e.into_inner()) += f;
}
});
rhs_locked
.into_iter()
.map(|m| {
m.into_inner()
.expect("mutex not poisoned after parallel rhs assembly")
})
.collect()
}
pub fn assemble_colored(
&self,
tasks: &[AssemblyTask],
element_dofs: &[Vec<usize>],
) -> CsrMatrix {
use crate::solvers::assembly_coloring::{assemble_colored_csr, color_elements};
let coloring = color_elements(tasks.len(), element_dofs);
assemble_colored_csr(self.ndofs, tasks, &coloring)
}
}
#[derive(Debug, Clone, Copy)]
pub struct PcgStats {
pub iterations: usize,
pub residual_norm: f64,
pub converged: bool,
}
#[derive(Debug, Clone)]
pub struct ParallelPcgSolver {
pub max_iterations: usize,
pub tolerance: f64,
pub abs_tolerance: f64,
}
impl Default for ParallelPcgSolver {
fn default() -> Self {
Self {
max_iterations: 500,
tolerance: 1e-8,
abs_tolerance: 1e-14,
}
}
}
impl ParallelPcgSolver {
pub fn new(max_iterations: usize, tolerance: f64) -> Self {
Self {
max_iterations,
tolerance,
abs_tolerance: 1e-14,
}
}
pub fn solve(&self, a: &CsrMatrix, b: &[f64], x: &mut [f64]) -> PcgStats {
let n = a.nrows;
assert_eq!(b.len(), n);
assert_eq!(x.len(), n);
let m_inv = a.diagonal_preconditioner();
let mut ax = vec![0.0f64; n];
a.spmv_par(x, &mut ax);
let mut r: Vec<f64> = b.iter().zip(ax.iter()).map(|(bi, ai)| bi - ai).collect();
let b_norm = dot_par(b, b).sqrt();
if b_norm < self.abs_tolerance {
return PcgStats {
iterations: 0,
residual_norm: 0.0,
converged: true,
};
}
let mut z: Vec<f64> = r.iter().zip(m_inv.iter()).map(|(ri, mi)| ri * mi).collect();
let mut p = z.clone();
let mut rz = dot_par(&r, &z);
let mut ap = vec![0.0f64; n];
let mut iters = 0;
let mut res_norm = r.iter().map(|ri| ri * ri).sum::<f64>().sqrt();
for _ in 0..self.max_iterations {
if res_norm / b_norm < self.tolerance {
break;
}
if res_norm < self.abs_tolerance {
break;
}
a.spmv_par(&p, &mut ap);
let pap = dot_par(&p, &ap);
if pap.abs() < 1e-300 {
break;
}
let alpha = rz / pap;
x.par_iter_mut()
.zip(p.par_iter())
.for_each(|(xi, pi)| *xi += alpha * pi);
r.par_iter_mut()
.zip(ap.par_iter())
.for_each(|(ri, api)| *ri -= alpha * api);
z.par_iter_mut()
.zip(r.par_iter().zip(m_inv.par_iter()))
.for_each(|(zi, (ri, mi))| *zi = ri * mi);
let rz_new = dot_par(&r, &z);
let beta = rz_new / rz.max(1e-300);
rz = rz_new;
p.par_iter_mut()
.zip(z.par_iter())
.for_each(|(pi, zi)| *pi = zi + beta * *pi);
res_norm = r.iter().map(|ri| ri * ri).sum::<f64>().sqrt();
iters += 1;
}
PcgStats {
iterations: iters,
residual_norm: res_norm,
converged: res_norm / b_norm.max(1e-300) < self.tolerance
|| res_norm < self.abs_tolerance,
}
}
}
fn dot_par(a: &[f64], b: &[f64]) -> f64 {
a.par_iter().zip(b.par_iter()).map(|(ai, bi)| ai * bi).sum()
}
#[derive(Debug, Clone)]
pub struct ParallelGmresSolver {
pub krylov_dim: usize,
pub max_restarts: usize,
pub tolerance: f64,
}
impl Default for ParallelGmresSolver {
fn default() -> Self {
Self {
krylov_dim: 30,
max_restarts: 10,
tolerance: 1e-8,
}
}
}
impl ParallelGmresSolver {
pub fn solve(&self, a: &CsrMatrix, b: &[f64], x: &mut [f64]) -> PcgStats {
let n = a.nrows;
let m_inv = a.diagonal_preconditioner();
let mut res_norm = 0.0f64;
let mb: Vec<f64> = b.iter().zip(m_inv.iter()).map(|(bi, mi)| bi * mi).collect();
let b_norm = dot_par(&mb, &mb).sqrt().max(1e-300);
let mut total_iters = 0;
for _restart in 0..self.max_restarts {
let mut ax = vec![0.0f64; n];
a.spmv_par(x, &mut ax);
let r0: Vec<f64> = b.iter().zip(ax.iter()).map(|(bi, ai)| bi - ai).collect();
let z0: Vec<f64> = r0
.iter()
.zip(m_inv.iter())
.map(|(ri, mi)| ri * mi)
.collect();
let beta = dot_par(&z0, &z0).sqrt().max(1e-300);
res_norm = beta;
if res_norm / b_norm < self.tolerance {
break;
}
let m = self.krylov_dim.min(n);
let mut q: Vec<Vec<f64>> = Vec::with_capacity(m + 1);
q.push(z0.iter().map(|v| v / beta).collect());
let mut h = vec![0.0f64; (m + 1) * m];
let mut cs = vec![0.0f64; m];
let mut sn = vec![0.0f64; m];
let mut e1 = vec![0.0f64; m + 1];
e1[0] = beta;
let mut j_stop = m;
for j in 0..m {
let mut aqj = vec![0.0f64; n];
a.spmv_par(&q[j], &mut aqj);
let w: Vec<f64> = aqj.iter().zip(m_inv.iter()).map(|(v, mi)| v * mi).collect();
let mut w = w;
for i in 0..=j {
let hij = dot_par(&w, &q[i]);
h[i * m + j] = hij;
w.par_iter_mut()
.zip(q[i].par_iter())
.for_each(|(wi, qi)| *wi -= hij * qi);
}
let w_norm = dot_par(&w, &w).sqrt();
h[(j + 1) * m + j] = w_norm;
let exact_convergence = w_norm < 1e-14;
if !exact_convergence {
q.push(w.iter().map(|v| v / w_norm).collect());
}
for i in 0..j {
let tmp = cs[i] * h[i * m + j] + sn[i] * h[(i + 1) * m + j];
h[(i + 1) * m + j] = -sn[i] * h[i * m + j] + cs[i] * h[(i + 1) * m + j];
h[i * m + j] = tmp;
}
let (c, s) = givens_rotation(h[j * m + j], h[(j + 1) * m + j]);
cs[j] = c;
sn[j] = s;
h[j * m + j] = c * h[j * m + j] + s * h[(j + 1) * m + j];
h[(j + 1) * m + j] = 0.0;
e1[j + 1] = -s * e1[j];
e1[j] *= c;
res_norm = e1[j + 1].abs();
total_iters += 1;
if res_norm / b_norm < self.tolerance || exact_convergence {
j_stop = j + 1;
break;
}
}
let mut y = vec![0.0f64; j_stop];
for i in (0..j_stop).rev() {
y[i] = e1[i];
for k in (i + 1)..j_stop {
y[i] -= h[i * m + k] * y[k];
}
let hii = h[i * m + i];
if hii.abs() > 1e-300 {
y[i] /= hii;
}
}
for j in 0..j_stop {
let yj = y[j];
x.par_iter_mut()
.zip(q[j].par_iter())
.for_each(|(xi, qji)| *xi += yj * qji);
}
if res_norm / b_norm < self.tolerance {
break;
}
}
PcgStats {
iterations: total_iters,
residual_norm: res_norm,
converged: res_norm / b_norm < self.tolerance,
}
}
}
fn givens_rotation(a: f64, b: f64) -> (f64, f64) {
if b.abs() < 1e-300 {
(1.0, 0.0)
} else {
let r = a.hypot(b);
(a / r, b / r)
}
}
#[cfg(test)]
mod tests {
use super::*;
fn diag_matrix(n: usize, diag_val: f64) -> CsrMatrix {
CsrMatrix {
nrows: n,
ncols: n,
row_offsets: (0..=n).collect(),
col_indices: (0..n).collect(),
values: vec![diag_val; n],
}
}
#[test]
fn pcg_solves_diagonal_system() {
let n = 16;
let mat = diag_matrix(n, 4.0);
let rhs: Vec<f64> = (0..n).map(|i| (i + 1) as f64).collect();
let mut x = vec![0.0f64; n];
let stats = ParallelPcgSolver::default().solve(&mat, &rhs, &mut x);
assert!(stats.converged, "PCG did not converge: {:?}", stats);
for (i, xi) in x.iter().enumerate() {
let expected = (i + 1) as f64 / 4.0;
assert!(
(xi - expected).abs() < 1e-10,
"x[{i}] = {xi}, expected {expected}"
);
}
}
#[test]
fn parallel_spmv_matches_sequential() {
let mat = diag_matrix(64, 3.0);
let x: Vec<f64> = (0..64).map(|i| i as f64).collect();
let mut y_par = vec![0.0f64; 64];
let mut y_seq = vec![0.0f64; 64];
mat.spmv_par(&x, &mut y_par);
mat.spmv(&x, &mut y_seq);
for (a, b) in y_par.iter().zip(y_seq.iter()) {
assert!((a - b).abs() < 1e-14, "{a} != {b}");
}
}
#[test]
fn parallel_assembler_assembles_2d_bar() {
let tasks = vec![
AssemblyTask::new(vec![0, 1], vec![1.0, -1.0, -1.0, 1.0]),
AssemblyTask::new(vec![1, 2], vec![1.0, -1.0, -1.0, 1.0]),
];
let asm = ParallelAssembler::new(3);
let mat = asm.assemble(&tasks);
assert_eq!(mat.nrows, 3);
let find_val = |row: usize, col: usize| -> f64 {
for k in mat.row_offsets[row]..mat.row_offsets[row + 1] {
if mat.col_indices[k] == col {
return mat.values[k];
}
}
0.0
};
assert!((find_val(0, 0) - 1.0).abs() < 1e-14);
assert!((find_val(1, 1) - 2.0).abs() < 1e-14);
assert!((find_val(2, 2) - 1.0).abs() < 1e-14);
assert!((find_val(0, 1) - (-1.0)).abs() < 1e-14);
assert!((find_val(1, 2) - (-1.0)).abs() < 1e-14);
}
#[test]
fn assemble_rhs_sums_forces() {
let asm = ParallelAssembler::new(3);
let dofs = vec![vec![0usize, 1], vec![1, 2]];
let forces = vec![vec![1.0, 2.0], vec![3.0, 4.0]];
let rhs = asm.assemble_rhs(&dofs, &forces);
assert!((rhs[0] - 1.0).abs() < 1e-14);
assert!((rhs[1] - 5.0).abs() < 1e-14); assert!((rhs[2] - 4.0).abs() < 1e-14);
}
#[test]
fn gmres_solves_diagonal_system() {
let n = 8;
let mat = diag_matrix(n, 2.0);
let rhs: Vec<f64> = (0..n).map(|i| (i + 1) as f64).collect();
let mut x = vec![0.0f64; n];
let stats = ParallelGmresSolver::default().solve(&mat, &rhs, &mut x);
assert!(stats.converged, "GMRES did not converge: {:?}", stats);
for (i, xi) in x.iter().enumerate() {
let expected = (i + 1) as f64 / 2.0;
assert!(
(xi - expected).abs() < 1e-6,
"x[{i}] = {xi}, expected {expected}"
);
}
}
#[test]
fn spmv_chunked_matches_spmv() {
let n = 64;
let mat = diag_matrix(n, 3.0);
let x: Vec<f64> = (0..n).map(|i| i as f64).collect();
let mut y_par = vec![0.0f64; n];
let mut y_chunked = vec![0.0f64; n];
mat.spmv_par(&x, &mut y_par);
mat.spmv_chunked(&x, &mut y_chunked, 16);
for (a, b) in y_par.iter().zip(y_chunked.iter()) {
assert!((a - b).abs() < 1e-14, "{a} != {b}");
}
}
}
use crate::solvers::amg::{
cycle::{AmgHierarchy, CycleKind},
preconditioner::Preconditioner,
};
#[derive(Debug)]
pub struct PcgWithPrecond<P: Preconditioner> {
pub max_iterations: usize,
pub tolerance: f64,
pub abs_tolerance: f64,
pub precond: P,
}
impl<P: Preconditioner> PcgWithPrecond<P> {
pub fn new(precond: P, max_iterations: usize, tolerance: f64) -> Self {
Self {
max_iterations,
tolerance,
abs_tolerance: 1e-14,
precond,
}
}
pub fn solve(&self, a: &CsrMatrix, b: &[f64], x: &mut [f64]) -> PcgStats {
let n = a.nrows;
let mut z = vec![0.0f64; n];
let mut ax = vec![0.0f64; n];
a.spmv(x, &mut ax);
let mut r: Vec<f64> = b.iter().zip(ax.iter()).map(|(bi, ai)| bi - ai).collect();
self.precond.apply(&r, &mut z);
let mut p = z.clone();
let mut rz = dot_par(&r, &z);
let b_norm = dot_par(b, b).sqrt().max(1e-300);
let mut ap = vec![0.0f64; n];
let mut iters = 0;
let mut res_norm = r.iter().map(|ri| ri * ri).sum::<f64>().sqrt();
for _ in 0..self.max_iterations {
if res_norm / b_norm < self.tolerance {
break;
}
if res_norm < self.abs_tolerance {
break;
}
a.spmv_par(&p, &mut ap);
let pap = dot_par(&p, &ap);
if pap.abs() < 1e-300 {
break;
}
let alpha = rz / pap;
for i in 0..n {
x[i] += alpha * p[i];
r[i] -= alpha * ap[i];
}
self.precond.apply(&r, &mut z);
let rz_new = dot_par(&r, &z);
let beta = rz_new / rz.max(1e-300);
rz = rz_new;
for i in 0..n {
p[i] = z[i] + beta * p[i];
}
res_norm = r.iter().map(|ri| ri * ri).sum::<f64>().sqrt();
iters += 1;
}
PcgStats {
iterations: iters,
residual_norm: res_norm,
converged: res_norm / b_norm < self.tolerance || res_norm < self.abs_tolerance,
}
}
}
pub type PcgWithAmg = PcgWithPrecond<crate::solvers::amg::preconditioner::AmgPreconditioner>;
pub struct GmresWithAmg {
pub krylov_dim: usize,
pub max_restarts: usize,
pub tolerance: f64,
pub hierarchy: AmgHierarchy,
}
impl GmresWithAmg {
pub fn new(
hierarchy: AmgHierarchy,
krylov_dim: usize,
max_restarts: usize,
tolerance: f64,
) -> Self {
Self {
krylov_dim,
max_restarts,
tolerance,
hierarchy,
}
}
pub fn solve(&self, a: &CsrMatrix, b: &[f64], x: &mut [f64]) -> PcgStats {
let n = a.nrows;
let b_norm = dot_par(b, b).sqrt().max(1e-300);
let mut res_norm = 0.0f64;
let mut total_iters = 0;
let pcg = ParallelPcgSolver::new(500, 1e-10);
let amg_precond = crate::solvers::amg::preconditioner::AmgPreconditioner {
hierarchy: self.hierarchy.clone(),
cycle_kind: CycleKind::V,
pcg,
};
for _restart in 0..self.max_restarts {
let mut ax = vec![0.0f64; n];
a.spmv(x, &mut ax);
let r0: Vec<f64> = b.iter().zip(ax.iter()).map(|(bi, ai)| bi - ai).collect();
let mut r0z = vec![0.0f64; n];
amg_precond.apply(&r0, &mut r0z);
res_norm = r0z.iter().map(|v| v * v).sum::<f64>().sqrt();
if res_norm / b_norm < self.tolerance {
break;
}
let beta = res_norm;
let mut q: Vec<Vec<f64>> = vec![r0z.iter().map(|v| v / beta).collect()];
let m = self.krylov_dim.min(n);
let mut h = vec![vec![0.0f64; m]; m + 1];
let mut cs = vec![0.0f64; m];
let mut sn = vec![0.0f64; m];
let mut e1 = vec![0.0f64; m + 1];
e1[0] = beta;
let mut j_stop = m;
for jj in 0..m {
let mut aq = vec![0.0f64; n];
a.spmv_par(&q[jj], &mut aq);
let mut w = vec![0.0f64; n];
amg_precond.apply(&aq, &mut w);
for ii in 0..=jj {
h[ii][jj] = dot_par(&w, &q[ii]);
for k in 0..n {
w[k] -= h[ii][jj] * q[ii][k];
}
}
h[jj + 1][jj] = w.iter().map(|v| v * v).sum::<f64>().sqrt();
let exact_convergence = h[jj + 1][jj] < 1e-14;
if !exact_convergence {
let inv_norm = 1.0 / h[jj + 1][jj];
q.push(w.iter().map(|v| v * inv_norm).collect());
}
for ii in 0..jj {
let tmp = cs[ii] * h[ii][jj] + sn[ii] * h[ii + 1][jj];
h[ii + 1][jj] = -sn[ii] * h[ii][jj] + cs[ii] * h[ii + 1][jj];
h[ii][jj] = tmp;
}
let denom = (h[jj][jj] * h[jj][jj] + h[jj + 1][jj] * h[jj + 1][jj]).sqrt();
cs[jj] = if denom > 1e-300 {
h[jj][jj] / denom
} else {
1.0
};
sn[jj] = if denom > 1e-300 {
h[jj + 1][jj] / denom
} else {
0.0
};
h[jj][jj] = cs[jj] * h[jj][jj] + sn[jj] * h[jj + 1][jj];
h[jj + 1][jj] = 0.0;
e1[jj + 1] = -sn[jj] * e1[jj];
e1[jj] *= cs[jj];
res_norm = e1[jj + 1].abs();
if res_norm / b_norm < self.tolerance || exact_convergence {
j_stop = jj + 1;
break;
}
}
let mut y = vec![0.0f64; j_stop];
for ii in (0..j_stop).rev() {
y[ii] = e1[ii];
for kk in (ii + 1)..j_stop {
y[ii] -= h[ii][kk] * y[kk];
}
if h[ii][ii].abs() > 1e-300 {
y[ii] /= h[ii][ii];
}
}
for ii in 0..j_stop {
let yi = y[ii];
for k in 0..n {
x[k] += yi * q[ii][k];
}
}
total_iters += j_stop;
if res_norm / b_norm < self.tolerance {
break;
}
}
PcgStats {
iterations: total_iters,
residual_norm: res_norm,
converged: res_norm / b_norm < self.tolerance,
}
}
}