use super::*;
pub(crate) fn tile_schur_partial<B: BatchedBlockSolver>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
backend: &B,
kind: SchurReductionKind,
ordinal: usize,
range: Range<usize>,
) -> Result<Array2<f64>, ArrowSchurError> {
let k = sys.k;
let mut factors: Vec<(Array2<f64>, Array2<f64>)> = Vec::with_capacity(range.len());
let mut total_d = 0usize;
for i in range.clone() {
let (left, right) = row_schur_contribution_factors(
sys,
i,
&sys.rows[i],
htt_factors.factor(i),
backend,
kind,
)?;
total_d += left.nrows();
factors.push((left, right));
}
if total_d > 0 && k > 0 {
let mut left_stack = Array2::<f64>::zeros((total_d, k));
let mut right_stack = Array2::<f64>::zeros((total_d, k));
let mut base = 0usize;
for (left, right) in &factors {
let di = left.nrows();
left_stack
.slice_mut(ndarray::s![base..base + di, ..])
.assign(left);
right_stack
.slice_mut(ndarray::s![base..base + di, ..])
.assign(right);
base += di;
}
if let Some(product) =
crate::gpu::try_fast_atb_on_ordinal(ordinal, left_stack.view(), right_stack.view())
{
return Ok(product.mapv(|v| -v));
}
}
let mut partial = Array2::<f64>::zeros((k, k));
for (left, right) in &factors {
backend.block_gemm_subtract(&mut partial, left, right);
}
Ok(partial)
}
pub(crate) fn reduce_row_schur_contributions<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
backend: &B,
kind: SchurReductionKind,
schur: &mut Array2<f64>,
) -> Result<(), ArrowSchurError> {
let n = sys.rows.len();
let k = sys.k;
let tiles = crate::gpu::device_runtime::GpuRuntime::global()
.map(|rt| crate::gpu::pool::balanced_partition(rt, n))
.filter(|tiles| tiles.len() > 1);
let Some(tiles) = tiles else {
let n_rows = sys.rows.len();
let parallel =
n_rows >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let partials: Result<Vec<Array2<f64>>, ArrowSchurError> = (0..n_rows)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut partial = Array2::<f64>::zeros((k, k));
for i in idxs {
subtract_row_schur_contribution(
sys,
i,
&sys.rows[i],
htt_factors.factor(i),
backend,
kind,
&mut partial,
)?;
}
Ok(partial)
})
.collect();
for partial in &partials? {
for a in 0..k {
for b in 0..k {
schur[[a, b]] += partial[[a, b]];
}
}
}
return Ok(());
}
for (i, row) in sys.rows.iter().enumerate() {
subtract_row_schur_contribution(
sys,
i,
row,
htt_factors.factor(i),
backend,
kind,
schur,
)?;
}
return Ok(());
};
let partials: Result<Vec<Array2<f64>>, ArrowSchurError> = std::thread::scope(|scope| {
let handles: Vec<_> = tiles
.iter()
.map(|(ordinal, range)| {
let ordinal = *ordinal;
let range = range.clone();
scope.spawn(move || {
#[cfg(target_os = "linux")]
{
if let Some(ctx) = crate::gpu::device_runtime::cuda_context_for(ordinal) {
if ctx.bind_to_thread().is_err() {
}
}
}
tile_schur_partial(sys, htt_factors, backend, kind, ordinal, range)
})
})
.collect();
handles
.into_iter()
.map(|handle| {
handle
.join()
.map_err(|_| ArrowSchurError::SchurFactorFailed {
reason: "schur-reduction tile thread panicked".to_string(),
})?
})
.collect()
});
let partials = partials?;
for partial in &partials {
for a in 0..k {
for b in 0..k {
schur[[a, b]] += partial[[a, b]];
}
}
}
Ok(())
}
pub(crate) fn build_dense_schur_direct<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
) -> Result<Array2<f64>, ArrowSchurError> {
let k = sys.k;
let op = sys.effective_penalty_op();
if op.dim() != k {
return Err(ArrowSchurError::SchurFactorFailed {
reason: "Direct BA requires a K×K shared H_ββ penalty operator".to_string(),
});
}
let mut schur = op.to_dense();
for j in 0..k {
schur[[j, j]] += ridge_beta;
}
reduce_row_schur_contributions(
sys,
htt_factors,
backend,
SchurReductionKind::Direct,
&mut schur,
)?;
symmetrize_upper_from_lower(&mut schur);
Ok(schur)
}
pub(crate) fn build_dense_schur_sqrt_ba<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
) -> Result<Array2<f64>, ArrowSchurError> {
let k = sys.k;
let op = sys.effective_penalty_op();
if op.dim() != k {
return Err(ArrowSchurError::SchurFactorFailed {
reason: "Square-Root BA direct solve requires a K×K shared H_ββ penalty operator"
.to_string(),
});
}
let mut schur = op.to_dense();
for j in 0..k {
schur[[j, j]] += ridge_beta;
}
reduce_row_schur_contributions(
sys,
htt_factors,
backend,
SchurReductionKind::SqrtBa,
&mut schur,
)?;
symmetrize_upper_from_lower(&mut schur);
Ok(schur)
}
pub(crate) fn mixed_precision_reduced_beta(
schur: &Array2<f64>,
factor: &Array2<f64>,
rhs: &Array1<f64>,
options: &ArrowSolveOptions,
) -> Option<Array1<f64>> {
let ArrowSolvePrecisionPolicy::CertifiedMixed {
max_refinement_steps,
residual_relative_tolerance,
kappa_unit_roundoff_margin,
} = options.solve_precision
else {
return None;
};
if options.trust_region.radius.is_finite() {
return None;
}
let n = schur.nrows();
if n == 0 {
return None;
}
let kappa = cholesky_factor_kappa_estimate(factor);
if !kappa.is_finite() || kappa * F32_UNIT_ROUNDOFF >= kappa_unit_roundoff_margin {
return None;
}
let factor_f32 = factor.mapv(|v| v as f32);
let s_inf = matrix_inf_norm(schur);
let rhs_inf = rhs.iter().fold(0.0_f64, |a, &b| a.max(b.abs()));
let certificate_tol = residual_relative_tolerance
.max(MIXED_PRECISION_CERTIFICATE_EPSILON_MULTIPLIER * f64::EPSILON);
let mut x = cholesky_solve_lower_f32(&factor_f32, &rhs.mapv(|v| v as f32)).mapv(|v| v as f64);
let mut last_residual = f64::INFINITY;
for _ in 0..=max_refinement_steps {
let sx = schur.dot(&x);
let mut r = rhs.clone();
r -= &sx;
let r_inf = r.iter().fold(0.0_f64, |a, &b| a.max(b.abs()));
let x_inf = x.iter().fold(0.0_f64, |a, &b| a.max(b.abs()));
let denom = s_inf * x_inf + rhs_inf;
let backward_error = if denom > 0.0 { r_inf / denom } else { 0.0 };
if backward_error <= certificate_tol {
return Some(x);
}
if !(r_inf < last_residual) {
return None;
}
last_residual = r_inf;
let delta = cholesky_solve_lower_f32(&factor_f32, &r.mapv(|v| v as f32)).mapv(|v| v as f64);
x += δ
}
None
}
pub(crate) fn matrix_inf_norm(a: &Array2<f64>) -> f64 {
let mut max_row = 0.0_f64;
for row in a.rows() {
let s: f64 = row.iter().map(|v| v.abs()).sum();
if s > max_row {
max_row = s;
}
}
max_row
}
pub(crate) fn spectral_pd_floored_schur(
schur: &Array2<f64>,
relative_floor: f64,
) -> Option<Array2<f64>> {
let n = schur.nrows();
if n == 0 || schur.ncols() != n || !(relative_floor.is_finite() && relative_floor > 0.0) {
return None;
}
let mut sym = Array2::<f64>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let v = 0.5 * (schur[[i, j]] + schur[[j, i]]);
if !v.is_finite() {
return None;
}
sym[[i, j]] = v;
}
}
let (evals, evecs) = sym.eigh(Side::Lower).ok()?;
let max_abs = evals.iter().fold(
0.0_f64,
|acc, &v| if v.is_finite() { acc.max(v.abs()) } else { acc },
);
if !(max_abs.is_finite() && max_abs > 0.0) {
return None;
}
let floor = relative_floor * max_abs;
let mut conditioned = Array2::<f64>::zeros((n, n));
for eig_idx in 0..evals.len() {
let lambda = evals[eig_idx];
let lambda_floored = if lambda.is_finite() { lambda.max(floor) } else { floor };
for i in 0..n {
let vi = evecs[[i, eig_idx]];
if vi == 0.0 {
continue;
}
for j in 0..n {
conditioned[[i, j]] += lambda_floored * vi * evecs[[j, eig_idx]];
}
}
}
Some(conditioned)
}
pub(crate) fn solve_dense_reduced_system(
schur: &Array2<f64>,
rhs_beta: &Array1<f64>,
options: &ArrowSolveOptions,
metric_weights: Option<&MetricWeights>,
) -> Result<(Array1<f64>, Option<Array2<f64>>, PcgDiagnostics), ArrowSchurError> {
let factor = match cholesky_lower(schur) {
Ok(factor) => factor,
Err(e) => {
match options.schur_pd_floor {
Some(relative_floor) => match spectral_pd_floored_schur(schur, relative_floor) {
Some(floored) => match cholesky_lower(&floored) {
Ok(factor) => {
let direct = mixed_precision_reduced_beta(
&floored, &factor, rhs_beta, options,
)
.unwrap_or_else(|| cholesky_solve_vector(&factor, rhs_beta));
if step_inside_trust_region(
direct.view(),
options.trust_region.radius,
metric_weights,
) {
return Ok((direct, Some(factor), PcgDiagnostics::default()));
}
let identity = IdentityPreconditioner;
let (delta, diag) = steihaug_dense_system(
&floored,
rhs_beta,
&identity,
&ArrowPcgOptions {
max_iterations: options.trust_region.max_iterations,
relative_tolerance: options
.trust_region
.steihaug_relative_tolerance,
},
&options.trust_region,
metric_weights,
)?;
return Ok((delta, Some(factor), diag));
}
Err(floored_err) => {
return Err(ArrowSchurError::SchurFactorFailed {
reason: format!(
"reduced Schur non-PD ({e}); spectral PD-floor \
reconstruction still non-PD: {floored_err}"
),
});
}
},
None => {
return Err(ArrowSchurError::SchurFactorFailed {
reason: format!(
"reduced Schur non-PD ({e}); spectral PD-floor declined \
(no usable spectrum)"
),
});
}
},
None => return Err(ArrowSchurError::SchurFactorFailed { reason: e }),
}
}
};
if !options.tolerate_ill_conditioning {
let schur_kappa = cholesky_factor_kappa_estimate(&factor);
if !schur_kappa.is_finite() || schur_kappa > safe_spd_kappa_max(schur.nrows()) {
return Err(ArrowSchurError::SchurFactorFailed {
reason: format!(
"reduced Schur complement Cholesky succeeded but is ill-conditioned \
(kappa_estimate={schur_kappa:e}); accumulated per-row \
(H_tt)⁻¹ contamination would yield an inaccurate Δβ"
),
});
}
}
let direct = mixed_precision_reduced_beta(schur, &factor, rhs_beta, options)
.unwrap_or_else(|| cholesky_solve_vector(&factor, rhs_beta));
if step_inside_trust_region(direct.view(), options.trust_region.radius, metric_weights) {
return Ok((direct, Some(factor), PcgDiagnostics::default()));
}
let identity = IdentityPreconditioner;
let (delta, diag) = steihaug_dense_system(
schur,
rhs_beta,
&identity,
&ArrowPcgOptions {
max_iterations: options.trust_region.max_iterations,
relative_tolerance: options.trust_region.steihaug_relative_tolerance,
},
&options.trust_region,
metric_weights,
)?;
Ok((delta, Some(factor), diag))
}
pub fn solve_streaming_reduced_beta(
s_acc: &Array2<f64>,
rhs_beta: &Array1<f64>,
options: &ArrowSolveOptions,
) -> Result<Array1<f64>, ArrowSchurError> {
let mut proximal_ridge = 0.0_f64;
let mut last_err: Option<ArrowSchurError> = None;
for attempt in 0..=DEFAULT_PROXIMAL_MAX_ATTEMPTS {
let mut schur = s_acc.clone();
symmetrize_upper_from_lower(&mut schur);
if proximal_ridge > 0.0 {
for j in 0..schur.nrows() {
schur[[j, j]] += proximal_ridge;
}
}
if crate::gpu::device_runtime::GpuRuntime::is_available() {
match crate::gpu::kernels::arrow_schur::solve_reduced_beta_pcg(
&schur,
rhs_beta,
options.trust_region.max_iterations,
options.trust_region.steihaug_relative_tolerance,
) {
Ok(delta_beta) => return Ok(delta_beta),
Err(crate::gpu::kernels::arrow_schur::ArrowSchurGpuFailure::Unavailable) => {}
Err(_) => {
}
}
}
match solve_dense_reduced_system(&schur, rhs_beta, options, None) {
Ok((delta_beta, _factor, _diag)) => return Ok(delta_beta),
Err(err) => {
let recoverable = matches!(
err,
ArrowSchurError::SchurFactorFailed { .. }
| ArrowSchurError::PcgFailed { .. }
| ArrowSchurError::UnboundedNegativeCurvature { .. }
);
last_err = Some(err);
if !recoverable || attempt == DEFAULT_PROXIMAL_MAX_ATTEMPTS {
break;
}
proximal_ridge = if proximal_ridge == 0.0 {
DEFAULT_PROXIMAL_INITIAL_RIDGE
} else {
proximal_ridge * DEFAULT_PROXIMAL_RIDGE_GROWTH
};
}
}
}
Err(last_err.expect("escalation loop set last_err on failure"))
}
pub(crate) fn step_inside_trust_region(
step: ArrayView1<'_, f64>,
radius: f64,
metric_weights: Option<&MetricWeights>,
) -> bool {
!radius.is_finite() || metric_norm(step, metric_weights) <= radius
}
pub(crate) const SCHUR_MATVEC_PARALLEL_ROW_MIN: usize = 256;
pub(crate) const SCHUR_PROLOGUE_PARALLEL_K_MIN: usize = 512;
pub(crate) struct SaeResidentReducedSchur {
pub(crate) p: usize,
pub(crate) rows: Vec<ResidentRowFactor>,
pub(crate) a_phi: Arc<[Vec<(usize, f64)>]>,
}
pub(crate) struct ResidentRowFactor {
pub(crate) di: usize,
pub(crate) l: Vec<f64>,
pub(crate) y: Vec<f64>,
}
impl SaeResidentReducedSchur {
pub(crate) fn build<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
backend: &B,
) -> Option<Self> {
let data = sys.device_sae_pcg.as_ref()?;
let p = data.p;
let n = sys.rows.len();
if p == 0
|| sys.htbeta_dense_supplement
|| data.a_phi.len() != n
|| data.local_jac.len() != n
{
return None;
}
let empty = || ResidentRowFactor {
di: 0,
l: Vec::new(),
y: Vec::new(),
};
let build_row = |row: usize| -> ResidentRowFactor {
let di = sys.row_dims[row];
let jac = &data.local_jac[row];
if p == 0 || jac.len() != di * p || di == 0 {
return empty();
}
let l_i = match ArrayView2::from_shape((di, p), jac.as_slice()) {
Ok(v) => v.to_owned(),
Err(_) => return empty(),
};
let y = backend.solve_block_matrix(htt_factors.factor(row), l_i.view());
let l_flat: Vec<f64> = l_i.iter().copied().collect();
let y_flat: Vec<f64> = y.iter().copied().collect();
ResidentRowFactor {
di,
l: l_flat,
y: y_flat,
}
};
let rows: Vec<ResidentRowFactor> =
if n >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none() {
use rayon::prelude::*;
(0..n).into_par_iter().map(build_row).collect()
} else {
(0..n).map(build_row).collect()
};
Some(Self {
p,
rows,
a_phi: data.a_phi_shared(),
})
}
#[inline]
pub(crate) fn row_into(
&self,
row: usize,
x: &Array1<f64>,
acc: &mut Array1<f64>,
gather: &mut [f64],
prod: &mut [f64],
w: &mut [f64],
) {
let rf = &self.rows[row];
let di = rf.di;
if di == 0 {
return;
}
let p = self.p;
let support = &self.a_phi[row];
if support.is_empty() {
return;
}
for v in gather.iter_mut() {
*v = 0.0;
}
for &(base, phi) in support {
if phi == 0.0 {
continue;
}
for j in 0..p {
gather[j] += phi * x[base + j];
}
}
for r in 0..di {
let yrow = &rf.y[r * p..r * p + p];
let mut s = 0.0_f64;
for c in 0..p {
s += yrow[c] * gather[c];
}
w[r] = s;
}
for v in prod.iter_mut().take(p) {
*v = 0.0;
}
for r in 0..di {
let lrow = &rf.l[r * p..r * p + p];
let wr = w[r];
for j in 0..p {
prod[j] += lrow[j] * wr;
}
}
for &(base, phi) in support {
if phi == 0.0 {
continue;
}
for j in 0..p {
acc[base + j] += phi * prod[j];
}
}
}
pub(crate) fn max_di(&self) -> usize {
self.rows.iter().map(|r| r.di).max().unwrap_or(0)
}
}
pub(crate) fn schur_matvec<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
x: &Array1<f64>,
out: &mut Array1<f64>,
backend: &B,
resident: Option<&SaeResidentReducedSchur>,
) {
out.fill(0.0);
let k = sys.k;
let parallel =
sys.rows.len() >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
{
let x_slice = x.as_slice().expect("x must be contiguous");
let out_slice = out.as_slice_mut().expect("out must be contiguous");
sys.penalty_ridge_prologue_into(x_slice, ridge_beta, out_slice, parallel);
}
let p = resident.map(|r| r.p).unwrap_or(0);
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let n = sys.rows.len();
let partials: Vec<Array1<f64>> = (0..n)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut acc = Array1::<f64>::zeros(k);
if let Some(res) = resident {
let mut gather = vec![0.0_f64; p];
let mut prod = vec![0.0_f64; p];
let mut w = vec![0.0_f64; res.max_di()];
for i in idxs {
res.row_into(i, x, &mut acc, &mut gather, &mut prod, &mut w);
}
} else {
let mut local = Array1::<f64>::zeros(sys.d);
for i in idxs {
schur_matvec_row_into(
sys,
htt_factors,
x,
backend,
i,
&mut local,
&mut acc,
);
}
}
acc
})
.collect();
for acc in &partials {
for a in 0..k {
out[a] -= acc[a];
}
}
} else if let Some(res) = resident {
let mut acc = Array1::<f64>::zeros(k);
let mut gather = vec![0.0_f64; p];
let mut prod = vec![0.0_f64; p];
let mut w = vec![0.0_f64; res.max_di()];
for i in 0..sys.rows.len() {
res.row_into(i, x, &mut acc, &mut gather, &mut prod, &mut w);
}
for a in 0..k {
out[a] -= acc[a];
}
} else {
let mut local = Array1::<f64>::zeros(sys.d);
let mut neg_contrib = Array1::<f64>::zeros(k);
for i in 0..sys.rows.len() {
neg_contrib.fill(0.0);
schur_matvec_row_into(
sys,
htt_factors,
x,
backend,
i,
&mut local,
&mut neg_contrib,
);
for a in 0..k {
out[a] -= neg_contrib[a];
}
}
}
}
#[inline]
pub(crate) fn schur_matvec_row_into<B: BatchedBlockSolver>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
x: &Array1<f64>,
backend: &B,
i: usize,
local: &mut Array1<f64>,
acc: &mut Array1<f64>,
) {
let row = &sys.rows[i];
let di = sys.row_dims[i];
let mut local_i = local.slice_mut(ndarray::s![..di]).to_owned();
local_i.fill(0.0);
sys_htbeta_apply_row(sys, i, row, x.view(), &mut local_i);
let solved = backend.solve_block_vector(htt_factors.factor(i), local_i.view());
sys_htbeta_accumulate_transpose(sys, i, row, solved.view(), acc);
}
#[derive(Clone)]
pub(crate) enum BlockFactor {
Chol {
factor: FaerLlt<f64>,
range: Range<usize>,
},
Scalar {
inv: Array1<f64>,
range: Range<usize>,
},
}
impl std::fmt::Debug for BlockFactor {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
BlockFactor::Chol { range, .. } => {
write!(f, "BlockFactor::Chol {{ range: {:?} }}", range)
}
BlockFactor::Scalar { inv, range } => {
write!(
f,
"BlockFactor::Scalar {{ inv.len: {}, range: {:?} }}",
inv.len(),
range
)
}
}
}
}
#[derive(Debug, Clone)]
pub struct JacobiPreconditioner {
pub(crate) blocks: Vec<BlockFactor>,
}
pub(crate) const BLOCK_JACOBI_MAX_BLOCK: usize = 256;
pub(crate) const JACOBI_DIAGONAL_PD_FLOOR: f64 = 1e-18;
impl JacobiPreconditioner {
pub(crate) fn from_arrow_schur<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
resident: Option<&SaeResidentReducedSchur>,
) -> Result<Self, ArrowSchurError> {
let use_block = !sys.block_offsets.is_empty()
&& sys
.block_offsets
.iter()
.map(|r| r.end.saturating_sub(r.start))
.max()
.unwrap_or(0)
<= BLOCK_JACOBI_MAX_BLOCK;
if use_block {
if let Some(res) = resident {
Self::build_block_jacobi_resident(sys, ridge_beta, res)
} else {
Self::build_block_jacobi(sys, htt_factors, ridge_beta, backend)
}
} else if let Some(res) = resident {
Self::build_scalar_jacobi_resident(sys, ridge_beta, res)
} else {
Self::build_scalar_jacobi(sys, htt_factors, ridge_beta, backend)
}
}
pub(crate) fn build_scalar_jacobi<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
) -> Result<Self, ArrowSchurError> {
let k = sys.k;
let mut diag = Array1::<f64>::zeros(k);
{
let diag_slice = diag.as_slice_mut().expect("diag must be contiguous");
sys.penalty_diagonal_add(diag_slice);
}
for a in 0..k {
diag[a] += ridge_beta;
}
let row_into = |i: usize, row: &ArrowRowBlock, diag_part: &mut Array1<f64>| {
let di = sys.row_dims[i];
let mut col_i = Array1::<f64>::zeros(di);
let mut e_a = Array1::<f64>::zeros(k);
for a in 0..k {
if sys.htbeta_matvec.is_some() || row.htbeta.dim() != (di, k) {
e_a.fill(0.0);
e_a[a] = 1.0;
col_i.fill(0.0);
sys_htbeta_apply_row(sys, i, row, e_a.view(), &mut col_i);
} else {
for c in 0..di {
col_i[c] = row.htbeta[[c, a]];
}
}
let solved = backend.solve_block_vector(htt_factors.factor(i), col_i.view());
let mut acc = 0.0;
for c in 0..di {
acc += col_i[c] * solved[c];
}
diag_part[a] -= acc;
}
};
let n = sys.rows.len();
let parallel =
n >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let partials: Vec<Array1<f64>> = (0..n)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut diag_part = Array1::<f64>::zeros(k);
for i in idxs {
row_into(i, &sys.rows[i], &mut diag_part);
}
diag_part
})
.collect();
for part in &partials {
for a in 0..k {
diag[a] += part[a];
}
}
} else {
for (i, row) in sys.rows.iter().enumerate() {
row_into(i, row, &mut diag);
}
}
let mut blocks = Vec::with_capacity(k);
for a in 0..k {
let v = diag[a];
if !v.is_finite() || v <= JACOBI_DIAGONAL_PD_FLOOR {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"invalid Schur Jacobi diagonal at index {a}: {v}; \
operator regularization is required"
),
});
}
blocks.push(BlockFactor::Scalar {
inv: Array1::from_elem(1, 1.0 / v),
range: a..a + 1,
});
}
Ok(Self { blocks })
}
pub(crate) fn build_scalar_jacobi_resident(
sys: &ArrowSchurSystem,
ridge_beta: f64,
resident: &SaeResidentReducedSchur,
) -> Result<Self, ArrowSchurError> {
let k = sys.k;
let p = resident.p;
let n = resident.rows.len();
let mut diag = Array1::<f64>::zeros(k);
{
let diag_slice = diag.as_slice_mut().expect("diag must be contiguous");
sys.penalty_diagonal_add(diag_slice);
}
for a in 0..k {
diag[a] += ridge_beta;
}
let row_into = |row: usize, diag_part: &mut [f64], col_dot: &mut [f64]| {
let rf = &resident.rows[row];
let di = rf.di;
if di == 0 {
return;
}
let support = &resident.a_phi[row];
if support.is_empty() {
return;
}
for (j, slot) in col_dot.iter_mut().enumerate().take(p) {
let mut acc = 0.0_f64;
for r in 0..di {
let idx = r * p + j;
acc += rf.l[idx] * rf.y[idx];
}
*slot = acc;
}
for &(beta_base, phi) in support {
if phi == 0.0 {
continue;
}
let phi2 = phi * phi;
for j in 0..p {
diag_part[beta_base + j] -= phi2 * col_dot[j];
}
}
};
let parallel =
n >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let partials: Vec<Array1<f64>> = (0..n)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut diag_part = Array1::<f64>::zeros(k);
let mut col_dot = vec![0.0_f64; p];
let slice = diag_part
.as_slice_mut()
.expect("diag_part must be contiguous");
for i in idxs {
row_into(i, slice, &mut col_dot);
}
diag_part
})
.collect();
for part in &partials {
for a in 0..k {
diag[a] += part[a];
}
}
} else {
let diag_slice = diag.as_slice_mut().expect("diag must be contiguous");
let mut col_dot = vec![0.0_f64; p];
for row in 0..n {
row_into(row, diag_slice, &mut col_dot);
}
}
let mut blocks = Vec::with_capacity(k);
for a in 0..k {
let v = diag[a];
if !v.is_finite() || v <= JACOBI_DIAGONAL_PD_FLOOR {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"invalid SAE-resident Schur Jacobi diagonal at index {a}: {v}; \
operator regularization is required"
),
});
}
blocks.push(BlockFactor::Scalar {
inv: Array1::from_elem(1, 1.0 / v),
range: a..a + 1,
});
}
Ok(Self { blocks })
}
pub(crate) fn build_block_jacobi_resident(
sys: &ArrowSchurSystem,
ridge_beta: f64,
resident: &SaeResidentReducedSchur,
) -> Result<Self, ArrowSchurError> {
let block_offsets = &sys.block_offsets;
let p = resident.p;
let mut schur_blocks: Vec<Array2<f64>> = Vec::with_capacity(block_offsets.len());
for (block_idx, range) in block_offsets.iter().enumerate() {
let b = range.end - range.start;
let mut schur_block = Array2::<f64>::zeros((b, b));
sys.penalty_block_add(
BetaBlockId(block_idx),
block_offsets.as_ref(),
&mut schur_block,
);
for bi in 0..b {
schur_block[[bi, bi]] += ridge_beta;
}
schur_blocks.push(schur_block);
}
let row_into = |row: usize, blocks: &mut [Array2<f64>]| {
let rf = &resident.rows[row];
let di = rf.di;
if di == 0 {
return;
}
let support = &resident.a_phi[row];
if support.is_empty() {
return;
}
for (block_idx, range) in block_offsets.iter().enumerate() {
let block = &mut blocks[block_idx];
for &(base_left, phi_left) in support {
if phi_left == 0.0 {
continue;
}
let left_start = base_left.max(range.start);
let left_end = (base_left + p).min(range.end);
if left_start >= left_end {
continue;
}
for &(base_right, phi_right) in support {
if phi_right == 0.0 {
continue;
}
let right_start = base_right.max(range.start);
let right_end = (base_right + p).min(range.end);
if right_start >= right_end {
continue;
}
let phi = phi_left * phi_right;
for gi in left_start..left_end {
let li = gi - range.start;
let ch_i = gi - base_left;
for gj in right_start..right_end {
let lj = gj - range.start;
let ch_j = gj - base_right;
let mut gij = 0.0_f64;
for r in 0..di {
gij += rf.l[r * p + ch_i] * rf.y[r * p + ch_j];
}
block[[li, lj]] -= phi * gij;
}
}
}
}
}
};
let n = resident.rows.len();
let parallel =
n >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let n_blocks = block_offsets.len();
let block_dims: Vec<usize> = block_offsets.iter().map(|r| r.end - r.start).collect();
let partials: Vec<Vec<Array2<f64>>> = (0..n)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut local: Vec<Array2<f64>> = block_dims
.iter()
.map(|&b| Array2::<f64>::zeros((b, b)))
.collect();
for i in idxs {
row_into(i, &mut local);
}
local
})
.collect();
for local in &partials {
for bidx in 0..n_blocks {
schur_blocks[bidx] += &local[bidx];
}
}
} else {
for row in 0..n {
row_into(row, &mut schur_blocks);
}
}
let mut blocks = Vec::with_capacity(block_offsets.len());
for (block_idx, range) in block_offsets.iter().enumerate() {
let b = range.end - range.start;
let schur_block = &schur_blocks[block_idx];
let factor_opt = {
use faer::Side;
let view = FaerArrayView::new(schur_block);
FaerLlt::new(view.as_ref(), Side::Lower).ok()
};
if let Some(llt) = factor_opt {
blocks.push(BlockFactor::Chol {
factor: llt,
range: range.clone(),
});
} else {
let mut inv = Array1::<f64>::zeros(b);
for bi in 0..b {
let v = schur_block[[bi, bi]];
if !v.is_finite() || v <= JACOBI_DIAGONAL_PD_FLOOR {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"SAE-resident block Jacobi scalar fallback: non-PD diagonal at \
global index {}: {v}; regularization required",
range.start + bi
),
});
}
inv[bi] = 1.0 / v;
}
blocks.push(BlockFactor::Scalar {
inv,
range: range.clone(),
});
}
}
Ok(Self { blocks })
}
pub(crate) fn build_block_jacobi<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
) -> Result<Self, ArrowSchurError> {
let block_offsets = &sys.block_offsets;
let mut schur_blocks: Vec<Array2<f64>> = Vec::with_capacity(block_offsets.len());
for (block_idx, range) in block_offsets.iter().enumerate() {
let b = range.end - range.start;
let mut schur_block = Array2::<f64>::zeros((b, b));
sys.penalty_block_add(
BetaBlockId(block_idx),
block_offsets.as_ref(),
&mut schur_block,
);
for bi in 0..b {
schur_block[[bi, bi]] += ridge_beta;
}
schur_blocks.push(schur_block);
}
let row_into = |i: usize,
row: &ArrowRowBlock,
blocks: &mut [Array2<f64>]|
-> Result<(), ArrowSchurError> {
let di = sys.row_dims[i];
let htbeta_full = sys_htbeta_materialize_row(sys, i, row)?;
for (block_idx, range) in block_offsets.iter().enumerate() {
let b = range.end - range.start;
let mut solved_cols = Array2::<f64>::zeros((di, b));
for bj in 0..b {
let gj = range.start + bj;
let rhs = htbeta_full.column(gj).to_owned();
let solved = backend.solve_block_vector(htt_factors.factor(i), rhs.view());
for c in 0..di {
solved_cols[[c, bj]] = solved[c];
}
}
let schur_block = &mut blocks[block_idx];
for bi in 0..b {
let gi = range.start + bi;
for bj in 0..b {
let mut acc = 0.0;
for c in 0..di {
acc += htbeta_full[[c, gi]] * solved_cols[[c, bj]];
}
schur_block[[bi, bj]] -= acc;
}
}
}
Ok(())
};
let n = sys.rows.len();
let parallel =
n >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let n_blocks = block_offsets.len();
let block_dims: Vec<usize> = block_offsets.iter().map(|r| r.end - r.start).collect();
let partials: Vec<Vec<Array2<f64>>> = (0..n)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut local: Vec<Array2<f64>> = block_dims
.iter()
.map(|&b| Array2::<f64>::zeros((b, b)))
.collect();
for i in idxs {
row_into(i, &sys.rows[i], &mut local)?;
}
Ok::<_, ArrowSchurError>(local)
})
.collect::<Result<Vec<_>, _>>()?;
for local in &partials {
for bidx in 0..n_blocks {
schur_blocks[bidx] += &local[bidx];
}
}
} else {
for (i, row) in sys.rows.iter().enumerate() {
row_into(i, row, &mut schur_blocks)?;
}
}
let mut blocks = Vec::with_capacity(block_offsets.len());
for (block_idx, range) in block_offsets.iter().enumerate() {
let b = range.end - range.start;
let schur_block = &schur_blocks[block_idx];
let factor_opt = {
use faer::Side;
let view = FaerArrayView::new(schur_block);
FaerLlt::new(view.as_ref(), Side::Lower).ok()
};
if let Some(llt) = factor_opt {
blocks.push(BlockFactor::Chol {
factor: llt,
range: range.clone(),
});
} else {
let mut inv = Array1::<f64>::zeros(b);
for bi in 0..b {
let v = schur_block[[bi, bi]];
if !v.is_finite() || v <= JACOBI_DIAGONAL_PD_FLOOR {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"block Jacobi scalar fallback: non-PD diagonal at \
global index {}: {v}; regularization required",
range.start + bi
),
});
}
inv[bi] = 1.0 / v;
}
blocks.push(BlockFactor::Scalar {
inv,
range: range.clone(),
});
}
}
Ok(Self { blocks })
}
pub(crate) fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
let mut out = Array1::<f64>::zeros(r.len());
for block in &self.blocks {
match block {
BlockFactor::Scalar { inv, range } => {
for (local, gi) in range.clone().enumerate() {
out[gi] = inv[local] * r[gi];
}
}
BlockFactor::Chol { factor, range } => {
let b = range.end - range.start;
let mut rhs = Array1::<f64>::zeros(b);
for (local, gi) in range.clone().enumerate() {
rhs[local] = r[gi];
}
use faer::linalg::solvers::Solve;
let stride = rhs.strides()[0];
let len = rhs.len();
let rhs_mat =
unsafe { faer::MatRef::from_raw_parts(rhs.as_ptr(), len, 1, stride, 0) };
let solved = factor.solve(rhs_mat);
for (local, gi) in range.clone().enumerate() {
out[gi] = solved[(local, 0)];
}
}
}
}
out
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SchurPreconditionerKind {
Diagonal,
BetaBlockJacobi,
ClusterJacobi,
AdditiveSchwarz { overlap: usize },
DiagAssembledSchwarz { overlap: usize },
BlockIncompleteCholesky,
}
pub(crate) const PRECOND_ESCALATE_K_THRESHOLD: usize = 100;
pub(crate) const SCHUR_CURVATURE_FLOOR_MARGIN: f64 = 1.0e-2;
pub(crate) const SCHUR_CURVATURE_FLOOR_REL_FLOOR: f64 = 1.0e-12;
pub(crate) const SCHUR_CURVATURE_FLOOR_REL_CEILING: f64 = 1.0e12;
pub(crate) const SCHUR_CURVATURE_FLOOR_DIAG_GROWTH: f64 = 10.0;
pub(crate) const SCHUR_CURVATURE_FLOOR_MAX_ATTEMPTS: usize = 24;
#[derive(Clone)]
pub(crate) enum ClusterFactor {
Chol {
cols: Vec<usize>,
factor: FaerLlt<f64>,
},
Scalar {
cols: Vec<usize>,
inv: Vec<f64>,
},
}
impl std::fmt::Debug for ClusterFactor {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ClusterFactor::Chol { cols, .. } => {
write!(f, "ClusterFactor::Chol {{ cols.len: {} }}", cols.len())
}
ClusterFactor::Scalar { cols, inv } => write!(
f,
"ClusterFactor::Scalar {{ cols.len: {}, inv.len: {} }}",
cols.len(),
inv.len()
),
}
}
}
pub(crate) const CLUSTER_JACOBI_MAX_CLUSTER: usize = 512;
pub(crate) const IC0_MAX_COMPONENT: usize = 4096;
pub(crate) const IC0_PATTERN_REL_DROP: f64 = 1.0e-13;
pub(crate) fn assemble_local_schur_block<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
cols: &[usize],
) -> Array2<f64> {
let d = sys.d;
let b = cols.len();
let mut s_block = Array2::<f64>::zeros((b, b));
sys.penalty_subblock_add(cols, &mut s_block);
for bi in 0..b {
s_block[[bi, bi]] += ridge_beta;
}
let cluster_row_into = |row_idx: usize, row: &ArrowRowBlock, acc: &mut Array2<f64>| {
let mut col_vec = Array1::<f64>::zeros(d);
let mut solved_cols = Array2::<f64>::zeros((d, b));
for bj in 0..b {
let gj = cols[bj];
for c in 0..d {
col_vec[c] = row.htbeta[[c, gj]];
}
let solved = backend.solve_block_vector(htt_factors.factor(row_idx), col_vec.view());
for c in 0..d {
solved_cols[[c, bj]] = solved[c];
}
}
for bi in 0..b {
let gi = cols[bi];
for bj in 0..b {
let mut dot = 0.0;
for c in 0..d {
dot += row.htbeta[[c, gi]] * solved_cols[[c, bj]];
}
acc[[bi, bj]] -= dot;
}
}
};
let n = sys.rows.len();
let parallel = n >= SCHUR_MATVEC_PARALLEL_ROW_MIN && rayon::current_thread_index().is_none();
if parallel {
use rayon::prelude::*;
const CHUNK: usize = 64;
let partials: Vec<Array2<f64>> = (0..n)
.into_par_iter()
.chunks(CHUNK)
.map(|idxs| {
let mut local = Array2::<f64>::zeros((b, b));
for i in idxs {
cluster_row_into(i, &sys.rows[i], &mut local);
}
local
})
.collect();
for local in &partials {
s_block += local;
}
} else {
for (row_idx, row) in sys.rows.iter().enumerate() {
cluster_row_into(row_idx, row, &mut s_block);
}
}
s_block
}
#[derive(Debug, Clone)]
pub struct ClusterJacobiPreconditioner {
pub(crate) clusters: Vec<ClusterFactor>,
}
impl ClusterJacobiPreconditioner {
pub fn from_arrow_schur<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
) -> Result<Self, ArrowSchurError> {
if sys.block_offsets.is_empty() {
let cols: Vec<usize> = (0..sys.k).collect();
return Self::build_from_column_groups(sys, htt_factors, ridge_beta, backend, &[cols]);
}
let graph = BetaCouplingGraph::build(
&sys.block_offsets,
&sys.rows
.iter()
.map(|r| r.htbeta.clone())
.collect::<Vec<_>>(),
);
let col_groups: Vec<Vec<usize>> = graph
.component_partition()
.iter()
.map(|comp_blocks| {
let mut cols: Vec<usize> = comp_blocks
.iter()
.flat_map(|&b| sys.block_offsets[b].clone())
.collect();
cols.sort_unstable();
cols
})
.collect();
Self::build_from_column_groups(sys, htt_factors, ridge_beta, backend, &col_groups)
}
pub(crate) fn build_from_column_groups<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
col_groups: &[Vec<usize>],
) -> Result<Self, ArrowSchurError> {
let mut clusters = Vec::with_capacity(col_groups.len());
for cols in col_groups {
let b = cols.len();
if b == 0 {
continue;
}
if b > CLUSTER_JACOBI_MAX_CLUSTER {
let inv = build_schur_scalar_inv(sys, htt_factors, ridge_beta, backend, cols)?;
clusters.push(ClusterFactor::Scalar {
cols: cols.clone(),
inv,
});
continue;
}
let mut s_block =
assemble_local_schur_block(sys, htt_factors, ridge_beta, backend, cols);
symmetrize_upper_from_lower(&mut s_block);
let factor_opt = {
use faer::Side;
let view = FaerArrayView::new(&s_block);
FaerLlt::new(view.as_ref(), Side::Lower).ok()
};
if let Some(llt) = factor_opt {
clusters.push(ClusterFactor::Chol {
cols: cols.clone(),
factor: llt,
});
} else {
let inv = build_schur_scalar_inv(sys, htt_factors, ridge_beta, backend, cols)?;
clusters.push(ClusterFactor::Scalar {
cols: cols.clone(),
inv,
});
}
}
Ok(Self { clusters })
}
pub(crate) fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
let mut out = Array1::<f64>::zeros(r.len());
for cluster in &self.clusters {
apply_cluster(cluster, r, &mut out, &ClusterApplyMode::Overwrite);
}
out
}
}
#[derive(Debug, Clone)]
pub struct AdditiveSchwarzPreconditioner {
pub(crate) clusters: Vec<ClusterFactor>,
pub(crate) weights: Vec<f64>,
}
impl AdditiveSchwarzPreconditioner {
pub fn from_arrow_schur<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
overlap: usize,
) -> Result<Self, ArrowSchurError> {
if sys.block_offsets.is_empty() {
let cols: Vec<usize> = (0..sys.k).collect();
let inner = ClusterJacobiPreconditioner::build_from_column_groups(
sys,
htt_factors,
ridge_beta,
backend,
&[cols],
)?;
return Ok(Self {
clusters: inner.clusters,
weights: vec![1.0f64; sys.k],
});
}
let graph = BetaCouplingGraph::build(
&sys.block_offsets,
&sys.rows
.iter()
.map(|r| r.htbeta.clone())
.collect::<Vec<_>>(),
);
let col_groups: Vec<Vec<usize>> = graph
.component_partition()
.iter()
.map(|seed| {
let mut current = seed.clone();
for _ in 0..overlap {
current = graph.expand_one_hop(¤t);
}
let mut cols: Vec<usize> = current
.iter()
.flat_map(|&b| sys.block_offsets[b].clone())
.collect();
cols.sort_unstable();
cols.dedup();
cols
})
.collect();
let mut counts = vec![0u32; sys.k];
for cols in &col_groups {
for &gi in cols {
counts[gi] += 1;
}
}
let weights: Vec<f64> = counts
.iter()
.map(|&c| if c == 0 { 1.0 } else { 1.0 / c as f64 })
.collect();
let inner = ClusterJacobiPreconditioner::build_from_column_groups(
sys,
htt_factors,
ridge_beta,
backend,
&col_groups,
)?;
Ok(Self {
clusters: inner.clusters,
weights,
})
}
pub(crate) fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
let mut out = Array1::<f64>::zeros(r.len());
for cluster in &self.clusters {
apply_cluster(
cluster,
r,
&mut out,
&ClusterApplyMode::Accumulate {
weights: &self.weights,
},
);
}
out
}
}
#[derive(Debug, Clone)]
pub struct DiagAssembledSchwarzPreconditioner {
pub(crate) inv_diag: Vec<f64>,
}
impl DiagAssembledSchwarzPreconditioner {
pub fn from_arrow_schur<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
overlap: usize,
) -> Result<Self, ArrowSchurError> {
let col_groups: Vec<Vec<usize>> = if sys.block_offsets.is_empty() {
vec![(0..sys.k).collect()]
} else {
let graph = BetaCouplingGraph::build(
&sys.block_offsets,
&sys.rows
.iter()
.map(|r| r.htbeta.clone())
.collect::<Vec<_>>(),
);
graph
.component_partition()
.iter()
.map(|seed| {
let mut current = seed.clone();
for _ in 0..overlap {
current = graph.expand_one_hop(¤t);
}
let mut cols: Vec<usize> = current
.iter()
.flat_map(|&b| sys.block_offsets[b].clone())
.collect();
cols.sort_unstable();
cols.dedup();
cols
})
.collect()
};
Self::build_from_column_groups(sys, htt_factors, ridge_beta, backend, &col_groups)
}
pub(crate) fn build_from_column_groups<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
col_groups: &[Vec<usize>],
) -> Result<Self, ArrowSchurError> {
let mut counts = vec![0u32; sys.k];
for cols in col_groups {
for &gi in cols {
counts[gi] += 1;
}
}
let mut accum = vec![0.0f64; sys.k];
for cols in col_groups {
let b = cols.len();
if b == 0 {
continue;
}
if b > CLUSTER_JACOBI_MAX_CLUSTER {
let inv = build_schur_scalar_inv(sys, htt_factors, ridge_beta, backend, cols)?;
for (local, &gi) in cols.iter().enumerate() {
let w = if counts[gi] == 0 {
1.0
} else {
1.0 / counts[gi] as f64
};
accum[gi] += w * inv[local];
}
continue;
}
let mut s_block =
assemble_local_schur_block(sys, htt_factors, ridge_beta, backend, cols);
symmetrize_upper_from_lower(&mut s_block);
let local_inv_diag = match local_inverse_diagonal(&s_block) {
Some(diag) => diag,
None => {
let inv = build_schur_scalar_inv(sys, htt_factors, ridge_beta, backend, cols)?;
inv
}
};
for (local, &gi) in cols.iter().enumerate() {
let w = if counts[gi] == 0 {
1.0
} else {
1.0 / counts[gi] as f64
};
accum[gi] += w * local_inv_diag[local];
}
}
for (gi, &c) in counts.iter().enumerate() {
if c == 0 {
accum[gi] = 1.0;
}
}
for (gi, m) in accum.iter().enumerate() {
if !m.is_finite() || *m <= 0.0 {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"diag-assembled Schwarz: non-positive assembled diagonal at index {gi}: {m}"
),
});
}
}
Ok(Self { inv_diag: accum })
}
pub(crate) fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
let mut out = Array1::<f64>::zeros(r.len());
for (gi, &m) in self.inv_diag.iter().enumerate() {
out[gi] = m * r[gi];
}
out
}
}
pub(crate) fn local_inverse_diagonal(a: &Array2<f64>) -> Option<Vec<f64>> {
let b = a.nrows();
let llt = {
use faer::Side;
let view = FaerArrayView::new(a);
FaerLlt::new(view.as_ref(), Side::Lower).ok()?
};
use faer::linalg::solvers::Solve;
let mut diag = Vec::with_capacity(b);
for col in 0..b {
let mut rhs = Array1::<f64>::zeros(b);
rhs[col] = 1.0;
let stride = rhs.strides()[0];
let len = rhs.len();
let rhs_mat = unsafe { faer::MatRef::from_raw_parts(rhs.as_ptr(), len, 1, stride, 0) };
let solved = llt.solve(rhs_mat);
diag.push(solved[(col, 0)]);
}
Some(diag)
}
pub(crate) enum ClusterApplyMode<'w> {
Overwrite,
Accumulate { weights: &'w [f64] },
}
impl ClusterApplyMode<'_> {
#[inline]
pub(crate) fn write(&self, out: &mut Array1<f64>, gi: usize, value: f64) {
match self {
ClusterApplyMode::Overwrite => out[gi] = value,
ClusterApplyMode::Accumulate { weights } => out[gi] += weights[gi] * value,
}
}
}
pub(crate) fn apply_cluster(
cluster: &ClusterFactor,
r: &Array1<f64>,
out: &mut Array1<f64>,
mode: &ClusterApplyMode<'_>,
) {
match cluster {
ClusterFactor::Scalar { cols, inv } => {
for (local, &gi) in cols.iter().enumerate() {
mode.write(out, gi, inv[local] * r[gi]);
}
}
ClusterFactor::Chol { cols, factor } => {
let b = cols.len();
let mut rhs = Array1::<f64>::zeros(b);
for (local, &gi) in cols.iter().enumerate() {
rhs[local] = r[gi];
}
use faer::linalg::solvers::Solve;
let stride = rhs.strides()[0];
let len = rhs.len();
let rhs_mat = unsafe { faer::MatRef::from_raw_parts(rhs.as_ptr(), len, 1, stride, 0) };
let solved = factor.solve(rhs_mat);
for (local, &gi) in cols.iter().enumerate() {
mode.write(out, gi, solved[(local, 0)]);
}
}
}
}
#[derive(Clone)]
pub(crate) enum Ic0Factor {
IncompleteChol {
cols: Vec<usize>,
col_ptr: Vec<usize>,
row_idx: Vec<usize>,
val: Vec<f64>,
},
Scalar {
cols: Vec<usize>,
inv: Vec<f64>,
},
}
impl std::fmt::Debug for Ic0Factor {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Ic0Factor::IncompleteChol { cols, val, .. } => write!(
f,
"Ic0Factor::IncompleteChol {{ cols.len: {}, nnz: {} }}",
cols.len(),
val.len()
),
Ic0Factor::Scalar { cols, .. } => {
write!(f, "Ic0Factor::Scalar {{ cols.len: {} }}", cols.len())
}
}
}
}
#[derive(Debug, Clone)]
pub struct BlockIncompleteCholeskyPreconditioner {
pub(crate) components: Vec<Ic0Factor>,
}
impl BlockIncompleteCholeskyPreconditioner {
pub fn from_arrow_schur<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
) -> Result<Self, ArrowSchurError> {
let col_groups: Vec<Vec<usize>> = if sys.block_offsets.is_empty() {
vec![(0..sys.k).collect()]
} else {
let graph = BetaCouplingGraph::build(
&sys.block_offsets,
&sys.rows
.iter()
.map(|r| r.htbeta.clone())
.collect::<Vec<_>>(),
);
graph
.component_partition()
.iter()
.map(|comp| {
let mut cols: Vec<usize> = comp
.iter()
.flat_map(|&blk| sys.block_offsets[blk].clone())
.collect();
cols.sort_unstable();
cols.dedup();
cols
})
.collect()
};
let mut components = Vec::with_capacity(col_groups.len());
for cols in &col_groups {
let b = cols.len();
if b == 0 {
continue;
}
if b > IC0_MAX_COMPONENT {
let inv = build_schur_scalar_inv(sys, htt_factors, ridge_beta, backend, cols)?;
components.push(Ic0Factor::Scalar {
cols: cols.clone(),
inv,
});
continue;
}
let mut s_block =
assemble_local_schur_block(sys, htt_factors, ridge_beta, backend, cols);
symmetrize_upper_from_lower(&mut s_block);
match incomplete_cholesky_level0(&s_block) {
Some((col_ptr, row_idx, val)) => components.push(Ic0Factor::IncompleteChol {
cols: cols.clone(),
col_ptr,
row_idx,
val,
}),
None => {
let inv =
build_schur_scalar_inv(sys, htt_factors, ridge_beta, backend, cols)?;
components.push(Ic0Factor::Scalar {
cols: cols.clone(),
inv,
});
}
}
}
Ok(Self { components })
}
pub(crate) fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
let mut out = Array1::<f64>::zeros(r.len());
for comp in &self.components {
match comp {
Ic0Factor::Scalar { cols, inv } => {
for (local, &gi) in cols.iter().enumerate() {
out[gi] = inv[local] * r[gi];
}
}
Ic0Factor::IncompleteChol {
cols,
col_ptr,
row_idx,
val,
} => {
let b = cols.len();
let mut z = vec![0.0f64; b];
for (local, &gi) in cols.iter().enumerate() {
z[local] = r[gi];
}
for j in 0..b {
let dstart = col_ptr[j];
let diag = val[dstart];
z[j] /= diag;
let yj = z[j];
for k in (dstart + 1)..col_ptr[j + 1] {
z[row_idx[k]] -= val[k] * yj;
}
}
for j in (0..b).rev() {
let dstart = col_ptr[j];
let mut acc = z[j];
for k in (dstart + 1)..col_ptr[j + 1] {
acc -= val[k] * z[row_idx[k]];
}
z[j] = acc / val[dstart];
}
for (local, &gi) in cols.iter().enumerate() {
out[gi] = z[local];
}
}
}
}
out
}
}
pub(crate) fn incomplete_cholesky_level0(
a: &Array2<f64>,
) -> Option<(Vec<usize>, Vec<usize>, Vec<f64>)> {
let b = a.nrows();
assert_eq!(a.ncols(), b, "incomplete Cholesky needs a square block");
let mut col_ptr = vec![0usize; b + 1];
let mut row_idx: Vec<usize> = Vec::new();
let mut val: Vec<f64> = Vec::new();
let mut col_pos: Vec<std::collections::HashMap<usize, usize>> =
Vec::with_capacity(b);
for j in 0..b {
let ajj = a[[j, j]];
let scale_j = ajj.abs().max(0.0).sqrt();
let mut map = std::collections::HashMap::new();
map.insert(j, val.len());
row_idx.push(j);
val.push(ajj);
for i in (j + 1)..b {
let aij = a[[i, j]];
let scale_i = a[[i, i]].abs().sqrt();
let thresh = IC0_PATTERN_REL_DROP * scale_i * scale_j;
if aij.abs() > thresh {
map.insert(i, val.len());
row_idx.push(i);
val.push(aij);
}
}
col_pos.push(map);
col_ptr[j + 1] = val.len();
}
for j in 0..b {
let dpos = col_ptr[j];
let mut diag = val[dpos];
for mapk in &col_pos[..j] {
if let Some(&pjk) = mapk.get(&j) {
let ljk = val[pjk];
diag -= ljk * ljk;
}
}
if !diag.is_finite() || diag <= JACOBI_DIAGONAL_PD_FLOOR {
return None;
}
let ljj = diag.sqrt();
val[dpos] = ljj;
for p in (dpos + 1)..col_ptr[j + 1] {
let i = row_idx[p];
let mut s = val[p];
for mapk in &col_pos[..j] {
if let (Some(&pik), Some(&pjk)) = (mapk.get(&i), mapk.get(&j)) {
s -= val[pik] * val[pjk];
}
}
val[p] = s / ljj;
}
}
Some((col_ptr, row_idx, val))
}
#[derive(Debug, Clone, Copy)]
pub struct PrecondLadderRow {
pub iterations: usize,
pub converged: bool,
pub final_relative_residual: f64,
}
pub fn arrow_precond_ladder_iteration_study(
sys: &ArrowSchurSystem,
ridge_beta: f64,
rhs: &Array1<f64>,
pcg: &ArrowPcgOptions,
trust: &ArrowTrustRegionOptions,
) -> Result<Vec<(SchurPreconditionerKind, Option<PrecondLadderRow>)>, ArrowSchurError> {
let backend = CpuBatchedBlockSolver;
let htt_factors = backend.factor_blocks(&sys.rows, 0.0, sys.d, false)?;
let run = |apply: &dyn Fn(&Array1<f64>) -> Array1<f64>| -> Option<PrecondLadderRow> {
let (_sol, diag) = run_pcg_with_preconditioner(
sys,
&htt_factors,
ridge_beta,
rhs,
|r| apply(r),
pcg,
trust,
&backend,
None,
None,
None,
)
.ok()?;
Some(PrecondLadderRow {
iterations: diag.iterations,
converged: matches!(diag.stopping_reason, PcgStopReason::Converged),
final_relative_residual: diag.final_relative_residual,
})
};
let mut out: Vec<(SchurPreconditionerKind, Option<PrecondLadderRow>)> = Vec::with_capacity(6);
let diag_row = {
let mut bare = sys.clone();
bare.set_block_offsets(std::sync::Arc::from([] as [Range<usize>; 0]));
let bare_factors = backend.factor_blocks(&bare.rows, 0.0, bare.d, false)?;
JacobiPreconditioner::from_arrow_schur(&bare, &bare_factors, ridge_beta, &backend, None)
.ok()
.and_then(|p| {
run_pcg_with_preconditioner(
&bare,
&bare_factors,
ridge_beta,
rhs,
|r| p.apply(r),
pcg,
trust,
&backend,
None,
None,
None,
)
.ok()
.map(|(_s, diag)| PrecondLadderRow {
iterations: diag.iterations,
converged: matches!(diag.stopping_reason, PcgStopReason::Converged),
final_relative_residual: diag.final_relative_residual,
})
})
};
out.push((SchurPreconditionerKind::Diagonal, diag_row));
let block_row =
JacobiPreconditioner::from_arrow_schur(sys, &htt_factors, ridge_beta, &backend, None)
.ok()
.and_then(|p| run(&|r| p.apply(r)));
out.push((SchurPreconditionerKind::BetaBlockJacobi, block_row));
let cluster_row =
ClusterJacobiPreconditioner::from_arrow_schur(sys, &htt_factors, ridge_beta, &backend)
.ok()
.and_then(|p| run(&|r| p.apply(r)));
out.push((SchurPreconditionerKind::ClusterJacobi, cluster_row));
let schwarz_row =
AdditiveSchwarzPreconditioner::from_arrow_schur(sys, &htt_factors, ridge_beta, &backend, 1)
.ok()
.and_then(|p| run(&|r| p.apply(r)));
out.push((SchurPreconditionerKind::AdditiveSchwarz { overlap: 1 }, schwarz_row));
let diag_schwarz_row = DiagAssembledSchwarzPreconditioner::from_arrow_schur(
sys,
&htt_factors,
ridge_beta,
&backend,
1,
)
.ok()
.and_then(|p| run(&|r| p.apply(r)));
out.push((
SchurPreconditionerKind::DiagAssembledSchwarz { overlap: 1 },
diag_schwarz_row,
));
let ic0_row =
BlockIncompleteCholeskyPreconditioner::from_arrow_schur(sys, &htt_factors, ridge_beta, &backend)
.ok()
.and_then(|p| run(&|r| p.apply(r)));
out.push((SchurPreconditionerKind::BlockIncompleteCholesky, ic0_row));
Ok(out)
}
pub(crate) fn build_schur_scalar_inv<B: BatchedBlockSolver>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
backend: &B,
cols: &[usize],
) -> Result<Vec<f64>, ArrowSchurError> {
let d = sys.d;
let mut result = Vec::with_capacity(cols.len());
let mut col_vec = Array1::<f64>::zeros(d);
let mut full_diag = Array1::<f64>::zeros(sys.k);
{
let diag_slice = full_diag.as_slice_mut().expect("full_diag contiguous");
sys.penalty_diagonal_add(diag_slice);
}
for &gi in cols {
let mut s = full_diag[gi] + ridge_beta;
for (row_idx, row) in sys.rows.iter().enumerate() {
for c in 0..d {
col_vec[c] = row.htbeta[[c, gi]];
}
let solved = backend.solve_block_vector(htt_factors.factor(row_idx), col_vec.view());
let mut acc = 0.0;
for c in 0..d {
acc += col_vec[c] * solved[c];
}
s -= acc;
}
if !s.is_finite() || s <= JACOBI_DIAGONAL_PD_FLOOR {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"cluster Schur scalar fallback: non-PD diagonal at index {gi}: {s}"
),
});
}
result.push(1.0 / s);
}
Ok(result)
}
pub(crate) fn steihaug_pcg_auto<B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
rhs: &Array1<f64>,
pcg: &ArrowPcgOptions,
trust: &ArrowTrustRegionOptions,
backend: &B,
gpu_matvec: Option<&GpuSchurMatvec>,
metric_weights: Option<&MetricWeights>,
curvature_floor: Option<f64>,
) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurError> {
let resident = if gpu_matvec.is_none() {
SaeResidentReducedSchur::build(sys, htt_factors, backend)
} else {
None
};
let mut effective_ridge = ridge_beta;
let mut x0_diag0: Option<(Array1<f64>, PcgDiagnostics)> = None;
let mut last_curvature_err: Option<ArrowSchurError> = None;
let rhs_scale = metric_norm(rhs.view(), metric_weights).max(1.0);
let ridge_ceiling = ridge_beta.max(SCHUR_CURVATURE_FLOOR_REL_CEILING * rhs_scale);
for _attempt in 0..=SCHUR_CURVATURE_FLOOR_MAX_ATTEMPTS {
let jacobi = match JacobiPreconditioner::from_arrow_schur(
sys,
htt_factors,
effective_ridge,
backend,
resident.as_ref(),
) {
Ok(jacobi) => jacobi,
Err(err @ ArrowSchurError::PcgFailed { .. }) => {
if curvature_floor.is_none() {
return Err(err);
}
let next = if effective_ridge > 0.0 {
effective_ridge * SCHUR_CURVATURE_FLOOR_DIAG_GROWTH
} else {
rhs_scale
};
last_curvature_err = Some(err);
if !next.is_finite() || next > ridge_ceiling {
break;
}
effective_ridge = next;
continue;
}
Err(other) => return Err(other),
};
match run_pcg_with_preconditioner(
sys,
htt_factors,
effective_ridge,
rhs,
|r| jacobi.apply(r),
pcg,
trust,
backend,
gpu_matvec,
metric_weights,
resident.as_ref(),
) {
Ok(result) => {
x0_diag0 = Some(result);
break;
}
Err(ArrowSchurError::UnboundedNegativeCurvature {
curvature,
direction_norm_sq,
}) => {
let Some(relative_floor) = curvature_floor else {
return Err(ArrowSchurError::UnboundedNegativeCurvature {
curvature,
direction_norm_sq,
});
};
let deficit = if direction_norm_sq > 0.0 {
curvature.abs() / direction_norm_sq
} else {
0.0
};
let bump = (deficit * (1.0 + SCHUR_CURVATURE_FLOOR_MARGIN))
.max(relative_floor.max(SCHUR_CURVATURE_FLOOR_REL_FLOOR) * rhs_scale);
let next = (effective_ridge + bump).max(effective_ridge * 2.0);
last_curvature_err = Some(ArrowSchurError::UnboundedNegativeCurvature {
curvature,
direction_norm_sq,
});
if !next.is_finite() || next > ridge_ceiling {
break;
}
effective_ridge = next;
}
Err(other) => return Err(other),
}
}
let (x0, diag0) = match x0_diag0 {
Some(result) => result,
None => {
return Err(last_curvature_err.unwrap_or(ArrowSchurError::PcgFailed {
reason: "unbounded Schur PCG negative curvature unresolved by curvature floor"
.to_string(),
}));
}
};
if sys.k <= PRECOND_ESCALATE_K_THRESHOLD || diag0.stopping_reason != PcgStopReason::MaxIter {
return Ok((x0, diag0));
}
let cluster =
ClusterJacobiPreconditioner::from_arrow_schur(sys, htt_factors, effective_ridge, backend)?;
let (x1, diag1) = run_pcg_with_preconditioner(
sys,
htt_factors,
effective_ridge,
rhs,
|r| cluster.apply(r),
pcg,
trust,
backend,
gpu_matvec,
metric_weights,
resident.as_ref(),
)?;
if diag1.stopping_reason != PcgStopReason::MaxIter {
return Ok((x1, diag1));
}
let schwarz = AdditiveSchwarzPreconditioner::from_arrow_schur(
sys,
htt_factors,
effective_ridge,
backend,
1,
)?;
let (x2, diag2) = run_pcg_with_preconditioner(
sys,
htt_factors,
effective_ridge,
rhs,
|r| schwarz.apply(r),
pcg,
trust,
backend,
gpu_matvec,
metric_weights,
resident.as_ref(),
)?;
if diag2.stopping_reason != PcgStopReason::MaxIter {
return Ok((x2, diag2));
}
let diag_schwarz = DiagAssembledSchwarzPreconditioner::from_arrow_schur(
sys,
htt_factors,
effective_ridge,
backend,
1,
)?;
let (x3, diag3) = run_pcg_with_preconditioner(
sys,
htt_factors,
effective_ridge,
rhs,
|r| diag_schwarz.apply(r),
pcg,
trust,
backend,
gpu_matvec,
metric_weights,
resident.as_ref(),
)?;
if diag3.stopping_reason != PcgStopReason::MaxIter {
return Ok((x3, diag3));
}
let ic0 = BlockIncompleteCholeskyPreconditioner::from_arrow_schur(
sys,
htt_factors,
effective_ridge,
backend,
)?;
let (x4, diag4) = run_pcg_with_preconditioner(
sys,
htt_factors,
effective_ridge,
rhs,
|r| ic0.apply(r),
pcg,
trust,
backend,
gpu_matvec,
metric_weights,
resident.as_ref(),
)?;
if diag4.stopping_reason == PcgStopReason::MaxIter {
return Err(ArrowSchurError::PcgFailed {
reason: format!(
"Schur PCG exhausted all preconditioner tiers (Jacobi, ClusterJacobi, \
AdditiveSchwarz, DiagAssembledSchwarz, BlockIncompleteCholesky) at MaxIter; \
final relative residual = {:e}",
diag4.final_relative_residual
),
});
}
Ok((x4, diag4))
}
pub(crate) fn run_pcg_with_preconditioner<ApplyPrec, B: BatchedBlockSolver + Sync>(
sys: &ArrowSchurSystem,
htt_factors: &ArrowFactorSlab,
ridge_beta: f64,
rhs: &Array1<f64>,
apply_prec: ApplyPrec,
pcg: &ArrowPcgOptions,
trust: &ArrowTrustRegionOptions,
backend: &B,
gpu_matvec: Option<&GpuSchurMatvec>,
metric_weights: Option<&MetricWeights>,
resident: Option<&SaeResidentReducedSchur>,
) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurError>
where
ApplyPrec: FnMut(&Array1<f64>) -> Array1<f64>,
{
let max_iters = pcg.max_iterations.min(trust.max_iterations);
let tol = pcg
.relative_tolerance
.max(trust.steihaug_relative_tolerance);
if let Some(gpu_mv) = gpu_matvec {
let gpu_mv = Arc::clone(gpu_mv);
steihaug_cg(
rhs,
move |p, out| gpu_mv(p, out),
apply_prec,
max_iters,
tol,
trust.radius,
metric_weights,
)
} else {
steihaug_cg(
rhs,
|p, out| schur_matvec(sys, htt_factors, ridge_beta, p, out, backend, resident),
apply_prec,
max_iters,
tol,
trust.radius,
metric_weights,
)
}
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct IdentityPreconditioner;
impl IdentityPreconditioner {
pub(crate) fn apply(&self, r: &Array1<f64>) -> Array1<f64> {
r.clone()
}
}
pub(crate) fn steihaug_dense_system(
schur: &Array2<f64>,
rhs: &Array1<f64>,
preconditioner: &IdentityPreconditioner,
pcg: &ArrowPcgOptions,
trust: &ArrowTrustRegionOptions,
metric_weights: Option<&MetricWeights>,
) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurError> {
steihaug_cg(
rhs,
|p, out| dense_matvec(schur, p, out),
|r| preconditioner.apply(r),
pcg.max_iterations,
pcg.relative_tolerance,
trust.radius,
metric_weights,
)
}
pub(crate) fn steihaug_cg<MatVec, ApplyPrec>(
rhs: &Array1<f64>,
mut matvec: MatVec,
mut apply_preconditioner: ApplyPrec,
max_iterations: usize,
relative_tolerance: f64,
trust_radius: f64,
metric_weights: Option<&MetricWeights>,
) -> Result<(Array1<f64>, PcgDiagnostics), ArrowSchurError>
where
MatVec: FnMut(&Array1<f64>, &mut Array1<f64>),
ApplyPrec: FnMut(&Array1<f64>) -> Array1<f64>,
{
let n = rhs.len();
if let Some(weights) = metric_weights {
assert_eq!(
weights.len(),
n,
"Steihaug-CG metric weight length must match solve dimension"
);
}
let radius = if trust_radius.is_finite() && trust_radius > 0.0 {
trust_radius
} else {
f64::INFINITY
};
let rhs_norm = metric_norm(rhs.view(), metric_weights);
if rhs_norm == 0.0 {
return Ok((Array1::<f64>::zeros(n), PcgDiagnostics::default()));
}
let tol = (relative_tolerance.max(0.0) * rhs_norm).max(PCG_ABSOLUTE_TOLERANCE_FLOOR);
let mut x = Array1::<f64>::zeros(n);
let mut r = rhs.clone();
let mut z = apply_preconditioner(&r);
let mut diag = PcgDiagnostics {
precond_apply_calls: 1,
..PcgDiagnostics::default()
};
let mut p = z.clone();
let mut rz = metric_dot(&r, &z, metric_weights);
if rz <= 0.0 || !rz.is_finite() {
if radius.is_finite() {
diag.final_relative_residual = metric_norm(r.view(), metric_weights) / rhs_norm;
diag.stopping_reason = PcgStopReason::TrustRegion;
return Ok((step_to_trust_boundary(&x, &r, radius, metric_weights), diag));
}
return Err(ArrowSchurError::UnboundedNegativeCurvature {
curvature: rz,
direction_norm_sq: metric_dot(&r, &r, metric_weights),
});
}
if metric_norm(r.view(), metric_weights) <= tol {
diag.final_relative_residual = 0.0;
diag.stopping_reason = PcgStopReason::Converged;
return Ok((x, diag));
}
let mut ap = Array1::<f64>::zeros(n);
let mut candidate = Array1::<f64>::zeros(n);
for _ in 0..max_iterations {
matvec(&p, &mut ap);
diag.matvec_calls += 1;
diag.iterations += 1;
let pap = metric_dot(&p, &ap, metric_weights);
if pap <= 0.0 || !pap.is_finite() {
if radius.is_finite() {
diag.final_relative_residual = metric_norm(r.view(), metric_weights) / rhs_norm;
diag.stopping_reason = PcgStopReason::TrustRegion;
return Ok((step_to_trust_boundary(&x, &p, radius, metric_weights), diag));
}
return Err(ArrowSchurError::UnboundedNegativeCurvature {
curvature: pap,
direction_norm_sq: metric_dot(&p, &p, metric_weights),
});
}
let alpha = rz / pap;
for i in 0..n {
candidate[i] = x[i] + alpha * p[i];
}
if radius.is_finite() && metric_norm(candidate.view(), metric_weights) >= radius {
diag.final_relative_residual = metric_norm(r.view(), metric_weights) / rhs_norm;
diag.stopping_reason = PcgStopReason::TrustRegion;
return Ok((step_to_trust_boundary(&x, &p, radius, metric_weights), diag));
}
x.assign(&candidate);
for i in 0..n {
r[i] -= alpha * ap[i];
}
if metric_norm(r.view(), metric_weights) <= tol {
diag.final_relative_residual = metric_norm(r.view(), metric_weights) / rhs_norm;
diag.stopping_reason = PcgStopReason::Converged;
return Ok((x, diag));
}
z = apply_preconditioner(&r);
diag.precond_apply_calls += 1;
let rz_next = metric_dot(&r, &z, metric_weights);
if rz_next <= 0.0 || !rz_next.is_finite() {
return Err(ArrowSchurError::PcgFailed {
reason: "non-positive or non-finite PCG residual".to_string(),
});
}
let beta = rz_next / rz;
for i in 0..n {
p[i] = z[i] + beta * p[i];
}
rz = rz_next;
}
diag.final_relative_residual = metric_norm(r.view(), metric_weights) / rhs_norm;
diag.stopping_reason = PcgStopReason::MaxIter;
Ok((x, diag))
}
pub(crate) fn step_to_trust_boundary(
x: &Array1<f64>,
p: &Array1<f64>,
radius: f64,
metric_weights: Option<&MetricWeights>,
) -> Array1<f64> {
let pp = metric_dot(p, p, metric_weights);
if pp == 0.0 {
return x.clone();
}
let xp = metric_dot(x, p, metric_weights);
let xx = metric_dot(x, x, metric_weights);
let disc = (xp * xp + pp * (radius * radius - xx)).max(0.0);
let tau = (-xp + disc.sqrt()) / pp;
let mut out = x.clone();
for i in 0..out.len() {
out[i] += tau * p[i];
}
out
}
pub(crate) fn dense_matvec(a: &Array2<f64>, x: &Array1<f64>, out: &mut Array1<f64>) {
let n = a.nrows();
for i in 0..n {
let mut acc = 0.0;
for j in 0..n {
acc += a[[i, j]] * x[j];
}
out[i] = acc;
}
}
pub(crate) fn dot(a: &Array1<f64>, b: &Array1<f64>) -> f64 {
let mut acc = 0.0;
for i in 0..a.len() {
acc += a[i] * b[i];
}
acc
}
pub(crate) fn metric_dot(
a: &Array1<f64>,
b: &Array1<f64>,
metric_weights: Option<&MetricWeights>,
) -> f64 {
assert_eq!(a.len(), b.len());
match metric_weights {
Some(weights) => {
assert_eq!(weights.len(), a.len());
let mut acc = 0.0;
for i in 0..a.len() {
acc += weights[i] * a[i] * b[i];
}
acc
}
None => dot(a, b),
}
}
pub(crate) fn metric_norm(v: ArrayView1<'_, f64>, metric_weights: Option<&MetricWeights>) -> f64 {
let mut acc = 0.0;
match metric_weights {
Some(weights) => {
assert_eq!(weights.len(), v.len());
for i in 0..v.len() {
acc += weights[i] * v[i] * v[i];
}
}
None => {
for x in v.iter() {
acc += x * x;
}
}
}
acc.sqrt()
}
pub(crate) fn symmetrize_upper_from_lower(a: &mut Array2<f64>) {
let n = a.nrows().min(a.ncols());
for i in 0..n {
for j in 0..i {
let v = 0.5 * (a[[i, j]] + a[[j, i]]);
a[[i, j]] = v;
a[[j, i]] = v;
}
}
}
#[derive(Debug, Clone)]
pub enum ArrowSchurError {
PerRowFactorFailed { row: usize, reason: String },
PerRowFactorIllConditioned { row: usize, kappa_estimate: f64 },
SchurFactorFailed { reason: String },
PcgFailed { reason: String },
UnboundedNegativeCurvature {
curvature: f64,
direction_norm_sq: f64,
},
AdaptiveCorrectionFailed { reason: String },
}
impl std::fmt::Display for ArrowSchurError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ArrowSchurError::PerRowFactorFailed { row, reason } => write!(
f,
"arrow-Schur: per-row H_tt^({row}) Cholesky failed: {reason}"
),
ArrowSchurError::PerRowFactorIllConditioned {
row,
kappa_estimate,
} => write!(
f,
"arrow-Schur: per-row H_tt^({row}) Cholesky succeeded but failed \
the safe-inversion guard (kappa_estimate={kappa_estimate:e}); \
Schur reduction would be numerically contaminated"
),
ArrowSchurError::SchurFactorFailed { reason } => {
write!(f, "arrow-Schur: Schur complement Cholesky failed: {reason}")
}
ArrowSchurError::PcgFailed { reason } => {
write!(f, "arrow-Schur: Schur PCG failed: {reason}")
}
ArrowSchurError::UnboundedNegativeCurvature {
curvature,
direction_norm_sq,
} => write!(
f,
"arrow-Schur: unbounded Schur PCG hit negative curvature pᵀSp={curvature:e} \
(‖p‖²={direction_norm_sq:e}); reduced Schur is indefinite (co-collapse), \
retry with a curvature-floor ridge"
),
ArrowSchurError::AdaptiveCorrectionFailed { reason } => {
write!(
f,
"arrow-Schur: adaptive proximal correction failed: {reason}"
)
}
}
}
}
impl std::error::Error for ArrowSchurError {}
pub(crate) fn cholesky_lower(a: &Array2<f64>) -> Result<Array2<f64>, String> {
let n = a.nrows();
if a.ncols() != n {
return Err(format!("cholesky_lower: non-square {}×{}", n, a.ncols()));
}
if let Some((idx, _)) = a.iter().enumerate().find(|(_, v)| !v.is_finite()) {
return Err(format!(
"cholesky_lower: non-finite entry at linear index {idx}"
));
}
let mut maybe_device = a.clone();
if crate::gpu::try_cholesky_lower_inplace(&mut maybe_device).is_some() {
return Ok(maybe_device);
}
let mut l = Array2::<f64>::zeros((n, n));
for i in 0..n {
for j in 0..=i {
let mut sum = a[[i, j]];
for kk in 0..j {
sum -= l[[i, kk]] * l[[j, kk]];
}
if i == j {
if !sum.is_finite() || sum <= 0.0 {
return Err(format!(
"non-PD pivot {sum} at index {i} (matrix is not positive definite)"
));
}
l[[i, j]] = sum.sqrt();
} else {
l[[i, j]] = sum / l[[j, j]];
}
}
}
Ok(l)
}