use crate::dense::factor::{
factor, factor_frontal_blocked_in_place_with_scratch, factor_frontal_diagonal_in_place,
phase_timing, BunchKaufmanParams, FactorScratch, FrontalFactors, ZeroPivotAction,
};
use crate::dense::matrix::SymmetricMatrix;
use crate::error::FeralError;
use crate::inertia::Inertia;
use crate::scaling::{compute_scaling_dense_fast, compute_scaling_with_cache, ScalingStrategy};
use crate::sparse::csc::CscMatrix;
use crate::symbolic::{Supernode, SymbolicFactorization};
use std::sync::{Arc, Mutex, OnceLock};
use std::time::Instant;
fn supernode_trace_enabled() -> bool {
static ENABLED: OnceLock<bool> = OnceLock::new();
*ENABLED.get_or_init(|| {
std::env::var("FERAL_TRACE_SUPERNODE")
.map(|v| !v.is_empty() && v != "0" && v != "off" && v != "false")
.unwrap_or(false)
})
}
pub const CASCADE_BREAK_MIN_N: usize = 4096;
#[derive(Debug, Clone)]
pub struct NumericParams {
pub bk: BunchKaufmanParams,
pub scaling: ScalingStrategy,
pub small_leaf: SmallLeafBatch,
pub profiler: Option<Arc<Mutex<Profiler>>>,
pub parallel_telemetry: Option<Arc<AtomicLockStats>>,
pub fma: bool,
pub allow_delayed_pivots: bool,
pub cascade_break_ratio: Option<f64>,
pub cascade_break_eps: Option<f64>,
pub min_parallel_flops: Option<u64>,
pub sqd_mode: bool,
pub static_pivot_threshold: Option<f64>,
pub warn_partial_singular: bool,
pub pattern_reused_hint: bool,
}
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum SmallLeafBatch {
#[default]
Off,
On,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct SupernodeTiming {
pub snode_idx: usize,
pub nrow: usize,
pub ncol: usize,
pub us: u64,
#[serde(default)]
pub assembly_us: u64,
#[serde(default)]
pub densefactor_us: u64,
#[serde(default)]
pub panelfactor_us: u64,
#[serde(default)]
pub schur_us: u64,
#[serde(default)]
pub scalartail_us: u64,
}
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct PrologueBreakdown {
pub row_map_us: u64,
pub scaling_us: u64,
pub scaling_pivot_order_us: u64,
pub permute_us: u64,
pub permute_from_triplets_us: u64,
pub infnorm_tol_us: u64,
pub symmetric_pattern_us: u64,
pub setup_us: u64,
}
#[derive(Debug, Clone, Default)]
pub struct Profiler {
timings: Vec<SupernodeTiming>,
prologue_us: u64,
prologue_breakdown: PrologueBreakdown,
epilogue_us: u64,
total_us: u64,
}
impl Profiler {
pub fn new() -> Self {
Self::default()
}
pub fn len(&self) -> usize {
self.timings.len()
}
pub fn is_empty(&self) -> bool {
self.timings.is_empty()
}
pub fn timings(&self) -> &[SupernodeTiming] {
&self.timings
}
pub fn report(&self) -> ProfileReport {
const RANGES: &[(&str, usize, usize)] = &[
("<=8", 0, 8),
("9-16", 9, 16),
("17-32", 17, 32),
("33-64", 33, 64),
("65-128", 65, 128),
(">128", 129, usize::MAX),
];
let mut buckets: Vec<BucketStats> = RANGES
.iter()
.map(|&(range, _, _)| BucketStats {
range,
count: 0,
sum_us: 0,
pct_of_total: 0.0,
avg_us: 0.0,
})
.collect();
for t in &self.timings {
for (i, &(_, lo, hi)) in RANGES.iter().enumerate() {
if t.nrow >= lo && t.nrow <= hi {
buckets[i].count += 1;
buckets[i].sum_us += t.us;
break;
}
}
}
let loop_us: u64 = buckets.iter().map(|b| b.sum_us).sum();
let mut warnings: Vec<String> = Vec::new();
let count_sum: usize = buckets.iter().map(|b| b.count).sum();
if count_sum != self.timings.len() {
warnings.push(format!(
"bucket count sum {} != timings len {}",
count_sum,
self.timings.len()
));
}
if self.total_us > 0 && loop_us + self.prologue_us + self.epilogue_us > self.total_us {
warnings.push(format!(
"loop+prologue+epilogue ({}) exceeds total ({})",
loop_us + self.prologue_us + self.epilogue_us,
self.total_us
));
}
for b in &mut buckets {
if loop_us > 0 {
b.pct_of_total = (b.sum_us as f64) * 100.0 / (loop_us as f64);
}
if b.count > 0 {
b.avg_us = (b.sum_us as f64) / (b.count as f64);
}
}
let overhead_pct = if self.total_us > 0 {
((self.prologue_us + self.epilogue_us) as f64) * 100.0 / (self.total_us as f64)
} else {
0.0
};
ProfileReport {
n_supernodes: self.timings.len(),
prologue_us: self.prologue_us,
prologue_breakdown: self.prologue_breakdown.clone(),
epilogue_us: self.epilogue_us,
loop_us,
total_us: self.total_us,
overhead_pct,
buckets,
validation_warnings: warnings,
}
}
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct BucketStats {
pub range: &'static str,
pub count: usize,
pub sum_us: u64,
pub pct_of_total: f64,
pub avg_us: f64,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct ProfileReport {
pub n_supernodes: usize,
pub prologue_us: u64,
pub prologue_breakdown: PrologueBreakdown,
pub epilogue_us: u64,
pub loop_us: u64,
pub total_us: u64,
pub overhead_pct: f64,
pub buckets: Vec<BucketStats>,
pub validation_warnings: Vec<String>,
}
impl Default for NumericParams {
fn default() -> Self {
Self {
bk: BunchKaufmanParams {
pivot_threshold: 1e-8,
on_zero_pivot: ZeroPivotAction::ForceAccept,
..BunchKaufmanParams::default()
},
scaling: ScalingStrategy::default(),
small_leaf: SmallLeafBatch::default(),
profiler: None,
parallel_telemetry: None,
fma: false,
allow_delayed_pivots: true,
cascade_break_ratio: Some(0.5),
cascade_break_eps: Some(1e-10),
min_parallel_flops: None,
sqd_mode: false,
static_pivot_threshold: None,
warn_partial_singular: false,
pattern_reused_hint: false,
}
}
}
impl NumericParams {
pub fn with_bk(bk: BunchKaufmanParams) -> Self {
Self {
bk,
scaling: ScalingStrategy::default(),
small_leaf: SmallLeafBatch::default(),
profiler: None,
parallel_telemetry: None,
fma: false,
allow_delayed_pivots: true,
cascade_break_ratio: Some(0.5),
cascade_break_eps: Some(1e-10),
min_parallel_flops: None,
sqd_mode: false,
static_pivot_threshold: None,
warn_partial_singular: false,
pattern_reused_hint: false,
}
}
}
#[derive(Default, Debug)]
pub struct AtomicLockStats {
pub contrib_wait_ns: std::sync::atomic::AtomicU64,
pub contrib_hold_ns: std::sync::atomic::AtomicU64,
pub node_factors_wait_ns: std::sync::atomic::AtomicU64,
pub node_factors_hold_ns: std::sync::atomic::AtomicU64,
pub factor_body_ns: std::sync::atomic::AtomicU64,
pub task_wall_ns: std::sync::atomic::AtomicU64,
pub ws_lock_wait_ns: std::sync::atomic::AtomicU64,
pub n_tasks: std::sync::atomic::AtomicU64,
pub phase_scaling_ns: std::sync::atomic::AtomicU64,
pub phase_permute_ns: std::sync::atomic::AtomicU64,
pub phase_symmetric_pattern_ns: std::sync::atomic::AtomicU64,
pub phase_tree_setup_ns: std::sync::atomic::AtomicU64,
pub phase_thread_ws_ns: std::sync::atomic::AtomicU64,
pub phase_leaves_ns: std::sync::atomic::AtomicU64,
pub phase_scope_ns: std::sync::atomic::AtomicU64,
pub phase_collect_ns: std::sync::atomic::AtomicU64,
}
#[derive(Default, Debug, Clone, Copy)]
pub struct ParallelLockStats {
pub contrib_wait_ns: u64,
pub contrib_hold_ns: u64,
pub node_factors_wait_ns: u64,
pub node_factors_hold_ns: u64,
pub factor_body_ns: u64,
pub task_wall_ns: u64,
pub ws_lock_wait_ns: u64,
pub n_tasks: u64,
pub phase_scaling_ns: u64,
pub phase_permute_ns: u64,
pub phase_symmetric_pattern_ns: u64,
pub phase_tree_setup_ns: u64,
pub phase_thread_ws_ns: u64,
pub phase_leaves_ns: u64,
pub phase_scope_ns: u64,
pub phase_collect_ns: u64,
}
impl AtomicLockStats {
pub fn snapshot(&self) -> ParallelLockStats {
use std::sync::atomic::Ordering;
ParallelLockStats {
contrib_wait_ns: self.contrib_wait_ns.load(Ordering::Relaxed),
contrib_hold_ns: self.contrib_hold_ns.load(Ordering::Relaxed),
node_factors_wait_ns: self.node_factors_wait_ns.load(Ordering::Relaxed),
node_factors_hold_ns: self.node_factors_hold_ns.load(Ordering::Relaxed),
factor_body_ns: self.factor_body_ns.load(Ordering::Relaxed),
task_wall_ns: self.task_wall_ns.load(Ordering::Relaxed),
ws_lock_wait_ns: self.ws_lock_wait_ns.load(Ordering::Relaxed),
n_tasks: self.n_tasks.load(Ordering::Relaxed),
phase_scaling_ns: self.phase_scaling_ns.load(Ordering::Relaxed),
phase_permute_ns: self.phase_permute_ns.load(Ordering::Relaxed),
phase_symmetric_pattern_ns: self.phase_symmetric_pattern_ns.load(Ordering::Relaxed),
phase_tree_setup_ns: self.phase_tree_setup_ns.load(Ordering::Relaxed),
phase_thread_ws_ns: self.phase_thread_ws_ns.load(Ordering::Relaxed),
phase_leaves_ns: self.phase_leaves_ns.load(Ordering::Relaxed),
phase_scope_ns: self.phase_scope_ns.load(Ordering::Relaxed),
phase_collect_ns: self.phase_collect_ns.load(Ordering::Relaxed),
}
}
}
#[derive(Debug, Clone)]
pub struct SchurBlock {
pub dim: usize,
pub data: Vec<f64>,
}
impl SchurBlock {
#[inline]
pub fn get(&self, i: usize, j: usize) -> f64 {
self.data[j * self.dim + i]
}
pub fn symv(&self, x: &[f64], y: &mut [f64]) -> Result<(), FeralError> {
if x.len() != self.dim || y.len() != self.dim {
return Err(FeralError::DimensionMismatch {
expected: self.dim,
got: x.len().max(y.len()),
});
}
for yi in y.iter_mut() {
*yi = 0.0;
}
for (j, &xj) in x.iter().enumerate().take(self.dim) {
let col = &self.data[j * self.dim..(j + 1) * self.dim];
for (i, &v) in col.iter().enumerate() {
y[i] += v * xj;
}
}
Ok(())
}
pub fn solve(&self, rhs: &[f64]) -> Result<Vec<f64>, FeralError> {
self.solve_with(rhs, &BunchKaufmanParams::default())
}
pub fn solve_with(
&self,
rhs: &[f64],
params: &BunchKaufmanParams,
) -> Result<Vec<f64>, FeralError> {
if rhs.len() != self.dim {
return Err(FeralError::DimensionMismatch {
expected: self.dim,
got: rhs.len(),
});
}
let s_mat = SymmetricMatrix::from_column_major(self.dim, self.data.clone())?;
let (factors, _inertia) = factor(&s_mat, params)?;
crate::dense::solve::solve(&factors, rhs)
}
}
#[derive(Debug)]
pub struct SparseFactors {
pub n: usize,
pub perm: Vec<usize>,
pub perm_inv: Vec<usize>,
pub node_factors: Vec<NodeFactors>,
pub needs_refinement: bool,
pub scaling: Vec<f64>,
pub scaling_info: crate::scaling::ScalingInfo,
pub resolved_method: crate::symbolic::OrderingMethod,
pub resolved_amalgamation: crate::symbolic::AmalgamationStrategy,
pub resolved_preprocess: crate::symbolic::OrderingPreprocess,
}
#[derive(Debug, Clone)]
pub struct LdltExport {
pub perm: Vec<usize>,
pub perm_inv: Vec<usize>,
pub l_indptr: Vec<usize>,
pub l_indices: Vec<usize>,
pub l_values: Vec<f64>,
pub d_diag: Vec<f64>,
pub d_subdiag: Vec<f64>,
}
impl SparseFactors {
pub fn ldlt_export(&self) -> LdltExport {
let n = self.n;
let mut e_of_g = vec![usize::MAX; n];
let mut perm = vec![0usize; n];
let mut perm_inv = vec![0usize; n];
let mut d_diag = vec![0.0f64; n];
let mut d_subdiag = vec![0.0f64; n];
let mut e = 0usize;
for node in &self.node_factors {
let ff = &node.frontal_factors;
for j in 0..ff.nelim {
let g = node.row_indices[ff.perm[j]];
e_of_g[g] = e;
perm[e] = self.perm[g];
perm_inv[self.perm[g]] = e;
d_diag[e] = ff.d_diag[j];
d_subdiag[e] = ff.d_subdiag[j];
e += 1;
}
}
debug_assert_eq!(e, n, "every index eliminated exactly once");
let mut l_indptr = Vec::with_capacity(n + 1);
l_indptr.push(0);
let mut l_indices = Vec::new();
let mut l_values = Vec::new();
let mut col: Vec<(usize, f64)> = Vec::new();
for node in &self.node_factors {
let ff = &node.frontal_factors;
let nrow = ff.nrow;
for j in 0..ff.nelim {
col.clear();
let diag_e = e_of_g[node.row_indices[ff.perm[j]]];
col.push((diag_e, 1.0));
for i in (j + 1)..nrow {
let v = ff.l[j * nrow + i];
if v != 0.0 {
let row_e = e_of_g[node.row_indices[ff.perm[i]]];
col.push((row_e, v));
}
}
col.sort_unstable_by_key(|&(r, _)| r);
for &(r, v) in &col {
l_indices.push(r);
l_values.push(v);
}
l_indptr.push(l_indices.len());
}
}
LdltExport {
perm,
perm_inv,
l_indptr,
l_indices,
l_values,
d_diag,
d_subdiag,
}
}
pub fn summary(&self) -> String {
let mut n_2x2 = 0usize;
let mut n_delayed = 0usize;
let mut nnz_l = 0usize;
let mut inertia = crate::inertia::Inertia::new(0, 0, 0);
for nf in &self.node_factors {
let ff = &nf.frontal_factors;
n_delayed += ff.n_delayed;
let trailing = ff.nrow.saturating_sub(ff.nelim) * ff.nelim;
nnz_l += ff.nelim * (ff.nelim + 1) / 2 + trailing;
let nelim = ff.nelim;
let mut k = 0;
while k < nelim {
let two_by_two = k + 1 < nelim && ff.d_subdiag[k] != 0.0;
if two_by_two {
n_2x2 += 1;
k += 2;
} else {
k += 1;
}
}
inertia.positive += nf.inertia.positive;
inertia.negative += nf.inertia.negative;
inertia.zero += nf.inertia.zero;
}
format!(
"n={} | ord={:?} | amalg={:?} | preproc={:?} | scaling={:?} | n_supernodes={} | nnz_L={} | n_2x2={} | n_delayed={} | inertia=({},{},{})",
self.n,
self.resolved_method,
self.resolved_amalgamation,
self.resolved_preprocess,
self.scaling_info,
self.node_factors.len(),
nnz_l,
n_2x2,
n_delayed,
inertia.positive,
inertia.negative,
inertia.zero,
)
}
pub fn factor_nnz(&self) -> usize {
self.node_factors
.iter()
.map(|nf| {
let nrow = nf.frontal_factors.nrow;
let nelim = nf.frontal_factors.nelim;
let trailing = nrow.saturating_sub(nelim) * nelim;
let eliminated_lower_with_diag = nelim * (nelim + 1) / 2;
eliminated_lower_with_diag + trailing
})
.sum()
}
pub fn min_diagonal(&self) -> Option<f64> {
let mut min_d = f64::INFINITY;
let mut any = false;
for nf in &self.node_factors {
let ff = &nf.frontal_factors;
let nelim = ff.nelim;
let mut k = 0;
while k < nelim {
let two_by_two = k + 1 < nelim && ff.d_subdiag[k] != 0.0;
let eig = if two_by_two {
let a = ff.d_diag[k];
let b = ff.d_subdiag[k];
let c = ff.d_diag[k + 1];
let trace = a + c;
let det = a * c - b * b;
let disc = (trace * trace - 4.0 * det).max(0.0).sqrt();
(trace - disc) * 0.5
} else {
ff.d_diag[k]
};
if eig < min_d {
min_d = eig;
}
any = true;
k += if two_by_two { 2 } else { 1 };
}
}
if any {
Some(min_d)
} else {
None
}
}
pub fn min_pivot_magnitude(&self) -> Option<f64> {
self.pivot_magnitude_extent().map(|(min, _max)| min)
}
pub fn max_pivot_magnitude(&self) -> Option<f64> {
self.pivot_magnitude_extent().map(|(_min, max)| max)
}
pub fn n_tiny(&self) -> usize {
self.node_factors
.iter()
.map(|nf| nf.frontal_factors.n_tiny)
.sum()
}
fn pivot_magnitude_extent(&self) -> Option<(f64, f64)> {
let mut min_mag = f64::INFINITY;
let mut max_mag = 0.0f64;
let mut any = false;
for nf in &self.node_factors {
let ff = &nf.frontal_factors;
let nelim = ff.nelim;
let mut k = 0;
while k < nelim {
let two_by_two = k + 1 < nelim && ff.d_subdiag[k] != 0.0;
let (smaller, larger) = if two_by_two {
let a = ff.d_diag[k];
let b = ff.d_subdiag[k];
let c = ff.d_diag[k + 1];
let trace = a + c;
let det = a * c - b * b;
let disc = (trace * trace - 4.0 * det).max(0.0).sqrt();
let larger = (trace.abs() + disc) * 0.5;
let smaller = if larger > 0.0 {
det.abs() / larger
} else {
0.0
};
(smaller, larger)
} else {
let m = ff.d_diag[k].abs();
(m, m)
};
if smaller < min_mag {
min_mag = smaller;
}
if larger > max_mag {
max_mag = larger;
}
any = true;
k += if two_by_two { 2 } else { 1 };
}
}
if any {
Some((min_mag, max_mag))
} else {
None
}
}
}
#[derive(Debug)]
pub struct NodeFactors {
pub first_col: usize,
pub ncol: usize,
pub nelim: usize,
pub n_delayed_in: usize,
pub nrow: usize,
pub row_indices: Vec<usize>,
pub frontal_factors: FrontalFactors,
pub inertia: Inertia,
}
#[derive(Debug, Default)]
pub struct FactorWorkspace {
row_map: Vec<usize>,
frontal_values: Vec<f64>,
build_delayed: Vec<usize>,
build_trailing: Vec<usize>,
build_seen: Vec<bool>,
dense_values: Vec<f64>,
local_contribs: Vec<Option<ContribBlock>>,
pub factor_scratch: FactorScratch,
pub(crate) permute_cache: Option<PermuteCache>,
}
#[derive(Debug, Default)]
pub(crate) struct PermuteCache {
input_n: usize,
input_nnz: usize,
input_col_ptr: Vec<usize>,
input_row_idx: Vec<usize>,
input_perm_inv: Vec<usize>,
permuted_col_ptr: Vec<usize>,
permuted_row_idx: Vec<usize>,
value_map: Vec<usize>,
}
impl FactorWorkspace {
pub fn new() -> Self {
Self::default()
}
}
#[inline]
pub fn should_use_dense_fast_path(n: usize, nnz_lower: usize) -> bool {
const N_TINY: usize = 16;
const N_MAX: usize = 128;
const RHO_NUM: usize = 1;
const RHO_DEN: usize = 4;
if n == 0 {
return false;
}
if n <= N_TINY {
return true;
}
if n > N_MAX {
return false;
}
let lower_cells = n * (n + 1) / 2;
nnz_lower * RHO_DEN >= lower_cells * RHO_NUM
}
pub fn dense_fast_factor(
matrix: &CscMatrix,
params: &NumericParams,
) -> Result<(SparseFactors, Inertia), FeralError> {
let mut ws = FactorWorkspace::new();
dense_fast_factor_with_workspace(matrix, params, &mut ws)
}
pub fn dense_fast_factor_with_workspace(
matrix: &CscMatrix,
params: &NumericParams,
ws: &mut FactorWorkspace,
) -> Result<(SparseFactors, Inertia), FeralError> {
let n = matrix.n;
if n == 0 {
return Err(FeralError::InvalidInput(
"dense_fast_factor: matrix dimension is zero".to_string(),
));
}
let dense_buf = std::mem::take(&mut ws.dense_values);
let mut sym = matrix.to_dense_into(dense_buf);
let (scaling, scaling_info) = compute_scaling_dense_fast(matrix, &sym, ¶ms.scaling)?;
if params.warn_partial_singular {
if let crate::scaling::ScalingInfo::PartialSingular { n_unmatched } = &scaling_info {
eprintln!(
"warning: MC64 matching left {} of {} variables unmatched; \
scaling is identity on those rows/columns",
n_unmatched, n
);
}
}
for (j, &s_j) in scaling.iter().enumerate() {
let col = j * n;
for (i, &s_i) in scaling.iter().enumerate().skip(j) {
sym.data[col + i] *= s_i * s_j;
}
}
let local_params =
apply_post_scaling_overrides(params, scaled_matrix_infnorm_dense(&sym.data, n), n);
let params: &NumericParams = local_params.as_ref().unwrap_or(params);
let ff = if params.sqd_mode {
factor_frontal_diagonal_in_place(&mut sym, n, ¶ms.bk)?
} else {
factor_frontal_blocked_in_place_with_scratch(
&mut sym,
n,
false,
¶ms.bk,
&mut ws.factor_scratch,
)?
};
ws.dense_values = sym.data;
let inertia = ff.inertia.clone();
let needs_refinement = ff.needs_refinement;
let perm: Vec<usize> = (0..n).collect();
let perm_inv: Vec<usize> = (0..n).collect();
let row_indices: Vec<usize> = (0..n).collect();
let node = NodeFactors {
first_col: 0,
ncol: n,
nelim: ff.nelim,
n_delayed_in: 0,
nrow: n,
row_indices,
frontal_factors: ff,
inertia: inertia.clone(),
};
Ok((
SparseFactors {
n,
perm,
perm_inv,
node_factors: vec![node],
needs_refinement,
scaling,
scaling_info,
resolved_method: crate::symbolic::OrderingMethod::Amd,
resolved_amalgamation: crate::symbolic::AmalgamationStrategy::Adjacency,
resolved_preprocess: crate::symbolic::OrderingPreprocess::None,
},
inertia,
))
}
pub fn factorize_multifrontal_supernodal(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
) -> Result<(SparseFactors, Inertia), FeralError> {
let mut ws = FactorWorkspace::new();
factorize_multifrontal_supernodal_with_workspace(matrix, symbolic, params, &mut ws)
}
pub fn factorize_multifrontal(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
) -> Result<(SparseFactors, Inertia), FeralError> {
let mut ws = FactorWorkspace::new();
factorize_multifrontal_with_workspace(matrix, symbolic, params, &mut ws)
}
pub fn factorize_multifrontal_with_schur(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
) -> Result<(SparseFactors, Inertia, SchurBlock), FeralError> {
let n_schur = symbolic.is_schur_tail.ok_or_else(|| {
FeralError::InvalidInput(
"factorize_multifrontal_with_schur requires symbolic produced by \
symbolic_factorize_with_schur (is_schur_tail is None)"
.to_string(),
)
})?;
if n_schur == 0 {
return Err(FeralError::InvalidInput(
"is_schur_tail = Some(0); use factorize_multifrontal instead".to_string(),
));
}
let n = symbolic.n;
let n_snodes = symbolic.supernodes.len();
let mut nvschur_per_snode = vec![0usize; n_snodes];
let schur_lo = n - n_schur;
for (s, snode) in symbolic.supernodes.iter().enumerate() {
let col_lo = snode.first_col;
let col_hi = col_lo + snode.ncol();
if col_hi <= schur_lo || col_lo >= n {
continue;
}
let lo = col_lo.max(schur_lo);
let hi = col_hi.min(n);
nvschur_per_snode[s] = hi - lo;
}
let last_snode = n_snodes
.checked_sub(1)
.ok_or_else(|| FeralError::InvalidInput("symbolic has zero supernodes".to_string()))?;
let last = &symbolic.supernodes[last_snode];
if last.first_col + last.ncol() != n {
return Err(FeralError::InvalidInput(
"Schur path expects last supernode to end at n-1".to_string(),
));
}
if nvschur_per_snode[last_snode] != n_schur {
return Err(FeralError::InvalidInput(format!(
"F3.2b scope: Schur tail must lie in a single root supernode \
(last snode covers {} of {} Schur columns); see \
dev/research/schur-complement.md F3.3",
nvschur_per_snode[last_snode], n_schur
)));
}
for &k in &nvschur_per_snode[..last_snode] {
debug_assert_eq!(k, 0, "nvschur > 0 outside last snode violates F3.2b scope");
}
let mut ws = FactorWorkspace::new();
factorize_multifrontal_with_schur_inner(matrix, symbolic, params, &mut ws, &nvschur_per_snode)
}
fn factorize_multifrontal_with_schur_inner(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
ws: &mut FactorWorkspace,
nvschur_per_snode: &[usize],
) -> Result<(SparseFactors, Inertia, SchurBlock), FeralError> {
let n = symbolic.n;
let n_snodes = symbolic.supernodes.len();
ws.row_map.clear();
ws.row_map.resize(n, usize::MAX);
let (scaling_user, scaling_info) = crate::scaling::compute_scaling(matrix, ¶ms.scaling)?;
let scaling_pivot_order: Vec<f64> =
symbolic.perm.iter().map(|&old| scaling_user[old]).collect();
let (permuted, _) = permute_csc_values_with_cache(
matrix,
&symbolic.perm,
&symbolic.perm_inv,
false,
params.pattern_reused_hint,
&mut ws.permute_cache,
)?;
let full_pattern = &symbolic.permuted_pattern;
let mut is_root = vec![true; n_snodes];
for snode in &symbolic.supernodes {
for &child_idx in &snode.children {
if child_idx < n_snodes {
is_root[child_idx] = false;
}
}
}
let mut contrib_blocks: Vec<Option<ContribBlock>> = (0..n_snodes).map(|_| None).collect();
let mut node_factors: Vec<NodeFactors> = Vec::with_capacity(n_snodes);
let mut total_inertia = Inertia {
positive: 0,
negative: 0,
zero: 0,
};
let mut needs_refinement = false;
for (snode_idx, &nvschur) in nvschur_per_snode.iter().enumerate() {
let node = factor_one_supernode(
snode_idx,
symbolic,
&permuted,
full_pattern,
&scaling_pivot_order,
&is_root,
params,
ws,
&mut contrib_blocks,
nvschur,
)?;
total_inertia.positive += node.inertia.positive;
total_inertia.negative += node.inertia.negative;
total_inertia.zero += node.inertia.zero;
if node.frontal_factors.needs_refinement {
needs_refinement = true;
}
node_factors.push(node);
}
let n_schur = nvschur_per_snode[n_snodes - 1];
debug_assert!(n_schur > 0);
let contrib = contrib_blocks[n_snodes - 1].take().ok_or_else(|| {
FeralError::InvalidInput(
"Schur path: root supernode produced no contribution block".to_string(),
)
})?;
if contrib.dim < n_schur {
return Err(FeralError::InvalidInput(format!(
"Schur extraction: root contrib dim {} < n_schur {}",
contrib.dim, n_schur
)));
}
let mut out = vec![0.0f64; n_schur * n_schur];
for j in 0..n_schur {
for i in 0..n_schur {
let val = if i >= j {
contrib.data[j * contrib.dim + i]
} else {
contrib.data[i * contrib.dim + j]
};
out[j * n_schur + i] = val;
}
}
let factors = SparseFactors {
n,
perm: symbolic.perm.clone(),
perm_inv: symbolic.perm_inv.clone(),
node_factors,
needs_refinement,
scaling: scaling_user,
scaling_info,
resolved_method: symbolic.resolved_method,
resolved_amalgamation: symbolic.resolved_amalgamation,
resolved_preprocess: symbolic.resolved_preprocess,
};
let schur = SchurBlock {
dim: n_schur,
data: out,
};
Ok((factors, total_inertia, schur))
}
pub fn factorize_multifrontal_with_workspace(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
ws: &mut FactorWorkspace,
) -> Result<(SparseFactors, Inertia), FeralError> {
if should_use_dense_fast_path(matrix.n, matrix.row_idx.len()) {
return dense_fast_factor_with_workspace(matrix, params, ws);
}
factorize_multifrontal_supernodal_with_workspace(matrix, symbolic, params, ws)
}
pub fn factorize_multifrontal_supernodal_with_workspace(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
ws: &mut FactorWorkspace,
) -> Result<(SparseFactors, Inertia), FeralError> {
let t_total = params.profiler.as_ref().map(|_| Instant::now());
let t_prologue = params.profiler.as_ref().map(|_| Instant::now());
let profiling = params.profiler.is_some();
let mut bd = PrologueBreakdown::default();
let tic = || profiling.then(Instant::now);
let toc = |t: Option<Instant>| t.map(|t| t.elapsed().as_micros() as u64).unwrap_or(0);
let n = symbolic.n;
let n_snodes = symbolic.supernodes.len();
let t_phase = tic();
ws.row_map.clear();
ws.row_map.resize(n, usize::MAX);
bd.row_map_us = toc(t_phase);
let t_phase = tic();
let (scaling_user, scaling_info) =
compute_scaling_with_cache(matrix, ¶ms.scaling, symbolic.cached_mc64.as_ref())?;
bd.scaling_us = toc(t_phase);
if params.warn_partial_singular {
if let crate::scaling::ScalingInfo::PartialSingular { n_unmatched } = &scaling_info {
eprintln!(
"warning: MC64 matching left {} of {} variables unmatched; \
scaling is identity on those rows/columns",
n_unmatched, n
);
}
}
let t_phase = tic();
let scaling_pivot_order: Vec<f64> =
symbolic.perm.iter().map(|&old| scaling_user[old]).collect();
bd.scaling_pivot_order_us = toc(t_phase);
debug_assert_eq!(scaling_pivot_order.len(), n);
let t_phase = tic();
let (permuted, from_triplets_us) = permute_csc_values_with_cache(
matrix,
&symbolic.perm,
&symbolic.perm_inv,
profiling,
params.pattern_reused_hint,
&mut ws.permute_cache,
)?;
bd.permute_us = toc(t_phase);
bd.permute_from_triplets_us = from_triplets_us;
let t_phase = tic();
let local_params = apply_post_scaling_overrides(
params,
scaled_matrix_infnorm(&permuted, &scaling_pivot_order),
n,
);
let params: &NumericParams = local_params.as_ref().unwrap_or(params);
bd.infnorm_tol_us = toc(t_phase);
let t_phase = tic();
let full_pattern = &symbolic.permuted_pattern;
bd.symmetric_pattern_us = toc(t_phase);
let t_phase = tic();
let mut is_root = vec![true; n_snodes];
for snode in &symbolic.supernodes {
for &child_idx in &snode.children {
if child_idx < n_snodes {
is_root[child_idx] = false;
}
}
}
let mut contrib_blocks: Vec<Option<ContribBlock>> = (0..n_snodes).map(|_| None).collect();
let mut node_factors: Vec<NodeFactors> = Vec::with_capacity(n_snodes);
let mut total_inertia = Inertia {
positive: 0,
negative: 0,
zero: 0,
};
let mut needs_refinement = false;
bd.setup_us = toc(t_phase);
let use_small_leaf =
params.small_leaf == SmallLeafBatch::On && !symbolic.small_leaf_groups.is_empty();
let mut snode_idx = 0usize;
let prologue_us = t_prologue.map(|t| t.elapsed().as_micros() as u64);
while snode_idx < n_snodes {
if use_small_leaf {
if let Some(gid) = symbolic.snode_group[snode_idx] {
let group = &symbolic.small_leaf_groups[gid];
debug_assert_eq!(
group.members.first(),
Some(&snode_idx),
"group members must start at current snode_idx"
);
for (i, &m) in group.members.iter().enumerate() {
let t_snode = params.profiler.as_ref().map(|_| Instant::now());
let node = factor_one_small_leaf(
m,
&group.member_rows[i],
symbolic,
&permuted,
&scaling_pivot_order,
&is_root,
params,
ws,
&mut contrib_blocks,
)?;
if let (Some(arc), Some(t)) = (params.profiler.as_ref(), t_snode) {
let snode = &symbolic.supernodes[m];
let timing = SupernodeTiming {
snode_idx: m,
nrow: snode.nrow,
ncol: snode.ncol,
us: t.elapsed().as_micros() as u64,
assembly_us: 0,
densefactor_us: 0,
panelfactor_us: 0,
schur_us: 0,
scalartail_us: 0,
};
if let Ok(mut prof) = arc.lock() {
prof.timings.push(timing);
}
}
total_inertia.positive += node.inertia.positive;
total_inertia.negative += node.inertia.negative;
total_inertia.zero += node.inertia.zero;
if node.frontal_factors.needs_refinement {
needs_refinement = true;
}
node_factors.push(node);
}
snode_idx += group.members.len();
continue;
}
}
let t_snode = params.profiler.as_ref().map(|_| Instant::now());
let phase_before = phase_timing::snapshot();
let node = factor_one_supernode(
snode_idx,
symbolic,
&permuted,
full_pattern,
&scaling_pivot_order,
&is_root,
params,
ws,
&mut contrib_blocks,
0, )?;
if let (Some(arc), Some(t)) = (params.profiler.as_ref(), t_snode) {
let snode = &symbolic.supernodes[snode_idx];
let phase_after = phase_timing::snapshot();
let timing = SupernodeTiming {
snode_idx,
nrow: snode.nrow,
ncol: snode.ncol,
us: t.elapsed().as_micros() as u64,
assembly_us: (phase_after.0 - phase_before.0) / 1000,
densefactor_us: (phase_after.1 - phase_before.1) / 1000,
panelfactor_us: (phase_after.2 - phase_before.2) / 1000,
schur_us: (phase_after.3 - phase_before.3) / 1000,
scalartail_us: (phase_after.4 - phase_before.4) / 1000,
};
if let Ok(mut prof) = arc.lock() {
prof.timings.push(timing);
}
}
total_inertia.positive += node.inertia.positive;
total_inertia.negative += node.inertia.negative;
total_inertia.zero += node.inertia.zero;
if node.frontal_factors.needs_refinement {
needs_refinement = true;
}
node_factors.push(node);
snode_idx += 1;
}
let t_epilogue = params.profiler.as_ref().map(|_| Instant::now());
let result = Ok((
SparseFactors {
n,
perm: symbolic.perm.clone(),
perm_inv: symbolic.perm_inv.clone(),
node_factors,
needs_refinement,
scaling: scaling_user,
scaling_info,
resolved_method: symbolic.resolved_method,
resolved_amalgamation: symbolic.resolved_amalgamation,
resolved_preprocess: symbolic.resolved_preprocess,
},
total_inertia,
));
if let Some(arc) = params.profiler.as_ref() {
if let Ok(mut prof) = arc.lock() {
prof.prologue_us = prologue_us.unwrap_or(0);
prof.prologue_breakdown = bd;
prof.epilogue_us = t_epilogue
.map(|t| t.elapsed().as_micros() as u64)
.unwrap_or(0);
prof.total_us = t_total.map(|t| t.elapsed().as_micros() as u64).unwrap_or(0);
}
}
result
}
#[allow(clippy::too_many_arguments)]
fn factor_one_supernode(
snode_idx: usize,
symbolic: &SymbolicFactorization,
permuted: &CscMatrix,
full_pattern: &crate::sparse::csc::CscPattern,
scaling_pivot_order: &[f64],
is_root: &[bool],
params: &NumericParams,
ws: &mut FactorWorkspace,
contrib_blocks: &mut [Option<ContribBlock>],
nvschur: usize,
) -> Result<NodeFactors, FeralError> {
let snode = &symbolic.supernodes[snode_idx];
let own_ncol = snode.ncol();
let nrow = snode.nrow;
if nrow == 0 || own_ncol == 0 {
return Ok(NodeFactors {
first_col: snode.first_col,
ncol: 0,
nelim: 0,
n_delayed_in: 0,
nrow: 0,
row_indices: Vec::new(),
frontal_factors: FrontalFactors {
nrow: 0,
ncol: 0,
nelim: 0,
l: Vec::new(),
d_diag: Vec::new(),
d_subdiag: Vec::new(),
perm: Vec::new(),
perm_inv: Vec::new(),
contrib: Vec::new(),
contrib_dim: 0,
n_delayed: 0,
inertia: Inertia {
positive: 0,
negative: 0,
zero: 0,
},
needs_refinement: false,
n_rook_rescues: 0,
n_tiny: 0,
zero_tol: params.bk.zero_tol,
zero_tol_2x2: params.bk.zero_tol_2x2,
},
inertia: Inertia {
positive: 0,
negative: 0,
zero: 0,
},
});
}
let n_delayed_in: usize = snode
.children
.iter()
.filter_map(|&c| contrib_blocks[c].as_ref())
.map(|c| c.n_delayed)
.sum();
let expanded_ncol = own_ncol + n_delayed_in;
let budget_exceeded =
snode.delayed_capacity != usize::MAX && n_delayed_in > snode.delayed_capacity;
let cb_armed = params.cascade_break_ratio.is_some();
if budget_exceeded && !cb_armed && !is_root[snode_idx] {
return Err(FeralError::DelayBudgetExceeded {
supernode: snode_idx,
required: n_delayed_in,
capacity: snode.delayed_capacity,
});
}
let _pt_asm = phase_timing::start();
let _pt_br = phase_timing::start();
let mut row_indices = build_row_indices(
snode,
full_pattern,
contrib_blocks,
&mut ws.build_delayed,
&mut ws.build_trailing,
&mut ws.build_seen,
);
phase_timing::stop(&phase_timing::BUILDROW_NS, _pt_br);
let actual_nrow = row_indices.len();
debug_assert!(
actual_nrow >= expanded_ncol,
"row_indices ({}) must cover the expanded fully-summed block ({})",
actual_nrow,
expanded_ncol
);
#[cfg(debug_assertions)]
{
let first_col = snode.first_col;
let own_last = first_col + own_ncol;
for (pos, &r) in row_indices.iter().enumerate().skip(expanded_ncol) {
debug_assert!(
r >= own_last,
"trailing row {} at position {} of supernode (first_col={}, own_ncol={}, expanded_ncol={}) is < first_col+own_ncol={}",
r, pos, first_col, own_ncol, expanded_ncol, own_last
);
}
}
let own_col_offset = if nvschur > 0 && n_delayed_in > 0 {
let mut swapped = Vec::with_capacity(actual_nrow);
swapped.extend_from_slice(&row_indices[own_ncol..expanded_ncol]);
swapped.extend_from_slice(&row_indices[..own_ncol]);
swapped.extend_from_slice(&row_indices[expanded_ncol..]);
row_indices = swapped;
n_delayed_in
} else {
0
};
for (local, &global) in row_indices.iter().enumerate() {
ws.row_map[global] = local;
}
let scaling = scaling_pivot_order;
let frontal_buf = std::mem::take(&mut ws.frontal_values);
let mut frontal = SymmetricMatrix::from_pooled_buf(actual_nrow, frontal_buf);
let _pt_sc = phase_timing::start();
for (k_local, &gj) in row_indices[own_col_offset..own_col_offset + own_ncol]
.iter()
.enumerate()
{
let local_j = own_col_offset + k_local;
let s_j = scaling[gj];
for k in permuted.col_ptr[gj]..permuted.col_ptr[gj + 1] {
let gi = permuted.row_idx[k];
let local_i = ws.row_map[gi];
if local_i != usize::MAX {
let val = permuted.values[k] * scaling[gi] * s_j;
frontal.set(local_i, local_j, val);
}
}
}
phase_timing::stop(&phase_timing::SCATTER_NS, _pt_sc);
let _pt_ea = phase_timing::start();
for &child_idx in &snode.children {
if let Some(mut contrib) = contrib_blocks[child_idx].take() {
extend_add(&contrib, &ws.row_map, &mut frontal);
ws.factor_scratch.contrib_pool = Some(std::mem::take(&mut contrib.data));
}
}
phase_timing::stop(&phase_timing::EXTENDADD_NS, _pt_ea);
phase_timing::stop(&phase_timing::ASSEMBLY_NS, _pt_asm);
debug_assert!(
nvschur == 0 || is_root[snode_idx],
"nvschur > 0 only valid at root supernodes (Schur tail invariant)"
);
debug_assert!(nvschur <= expanded_ncol);
let cascade_break = match params.cascade_break_ratio {
Some(r) if !is_root[snode_idx] && params.allow_delayed_pivots && expanded_ncol > 0 => {
if snode.delayed_capacity != usize::MAX {
budget_exceeded
} else {
symbolic.n >= CASCADE_BREAK_MIN_N
&& (n_delayed_in as f64) / (expanded_ncol as f64) >= r
}
}
_ => false,
};
let may_delay = !is_root[snode_idx] && params.allow_delayed_pivots && !cascade_break;
let local_bk;
let bk_ref: &BunchKaufmanParams = if cascade_break {
let on_zero = match params.cascade_break_eps {
Some(eps) => ZeroPivotAction::PerturbToEps { abs_floor: eps },
None => ZeroPivotAction::ForceAccept,
};
local_bk = BunchKaufmanParams {
on_zero_pivot: on_zero,
..params.bk.clone()
};
&local_bk
} else {
¶ms.bk
};
let eliminable = expanded_ncol - nvschur;
let trace = supernode_trace_enabled();
let t_sn = trace.then(Instant::now);
let _pt_df = phase_timing::start();
let mut ff = if params.sqd_mode {
factor_frontal_diagonal_in_place(&mut frontal, eliminable, ¶ms.bk)?
} else {
factor_frontal_blocked_in_place_with_scratch(
&mut frontal,
eliminable,
may_delay,
bk_ref,
&mut ws.factor_scratch,
)?
};
phase_timing::stop(&phase_timing::DENSEFACTOR_NS, _pt_df);
if let Some(t0) = t_sn {
let ms = t0.elapsed().as_secs_f64() * 1e3;
eprintln!(
"[sn-trace] sn={snode_idx} nrow={actual_nrow} exp_ncol={expanded_ncol} \
elim={eliminable} n_del_in={n_delayed_in} may_del={may_delay} cb={cascade_break} \
nelim={} n_del_out={} rook_rescues={} pos={} neg={} zero={} ms={ms:.3}",
ff.nelim,
ff.n_delayed,
ff.n_rook_rescues,
ff.inertia.positive,
ff.inertia.negative,
ff.inertia.zero,
);
}
ws.frontal_values = frontal.data;
let node_inertia = ff.inertia.clone();
let node_nelim = ff.nelim;
let node_n_delayed = ff.n_delayed;
if ff.contrib_dim > 0 {
let cdim = ff.contrib_dim;
let mut contrib_row_indices = Vec::with_capacity(cdim);
for cj in 0..cdim {
contrib_row_indices.push(row_indices[ff.perm[node_nelim + cj]]);
}
let contrib_data = std::mem::take(&mut ff.contrib);
contrib_blocks[snode_idx] = Some(ContribBlock {
row_indices: contrib_row_indices,
data: contrib_data,
dim: cdim,
n_delayed: node_n_delayed,
});
}
for &global in &row_indices {
ws.row_map[global] = usize::MAX;
}
Ok(NodeFactors {
first_col: snode.first_col,
ncol: expanded_ncol,
nelim: node_nelim,
n_delayed_in,
nrow: actual_nrow,
row_indices,
frontal_factors: ff,
inertia: node_inertia,
})
}
#[allow(clippy::too_many_arguments)]
fn factor_one_small_leaf(
snode_idx: usize,
precomputed_rows: &[usize],
symbolic: &SymbolicFactorization,
permuted: &CscMatrix,
scaling_pivot_order: &[f64],
is_root: &[bool],
params: &NumericParams,
ws: &mut FactorWorkspace,
contrib_blocks: &mut [Option<ContribBlock>],
) -> Result<NodeFactors, FeralError> {
let snode = &symbolic.supernodes[snode_idx];
debug_assert!(
snode.children.is_empty(),
"factor_one_small_leaf called on non-leaf supernode {}",
snode_idx
);
let own_ncol = snode.ncol();
let nrow = snode.nrow;
if nrow == 0 || own_ncol == 0 {
return Ok(NodeFactors {
first_col: snode.first_col,
ncol: 0,
nelim: 0,
n_delayed_in: 0,
nrow: 0,
row_indices: Vec::new(),
frontal_factors: FrontalFactors {
nrow: 0,
ncol: 0,
nelim: 0,
l: Vec::new(),
d_diag: Vec::new(),
d_subdiag: Vec::new(),
perm: Vec::new(),
perm_inv: Vec::new(),
contrib: Vec::new(),
contrib_dim: 0,
n_delayed: 0,
inertia: Inertia {
positive: 0,
negative: 0,
zero: 0,
},
needs_refinement: false,
n_rook_rescues: 0,
n_tiny: 0,
zero_tol: params.bk.zero_tol,
zero_tol_2x2: params.bk.zero_tol_2x2,
},
inertia: Inertia {
positive: 0,
negative: 0,
zero: 0,
},
});
}
let row_indices = precomputed_rows.to_vec();
let actual_nrow = row_indices.len();
let expanded_ncol = own_ncol;
debug_assert!(actual_nrow >= expanded_ncol);
for (local, &global) in row_indices.iter().enumerate() {
ws.row_map[global] = local;
}
let scaling = scaling_pivot_order;
let frontal_buf = std::mem::take(&mut ws.frontal_values);
let mut frontal = SymmetricMatrix::from_pooled_buf(actual_nrow, frontal_buf);
for (local_j, &gj) in row_indices[..own_ncol].iter().enumerate() {
let s_j = scaling[gj];
for k in permuted.col_ptr[gj]..permuted.col_ptr[gj + 1] {
let gi = permuted.row_idx[k];
let local_i = ws.row_map[gi];
if local_i != usize::MAX {
let val = permuted.values[k] * scaling[gi] * s_j;
frontal.set(local_i, local_j, val);
}
}
}
let may_delay = !is_root[snode_idx] && params.allow_delayed_pivots;
let mut ff = if params.sqd_mode {
factor_frontal_diagonal_in_place(&mut frontal, expanded_ncol, ¶ms.bk)?
} else {
factor_frontal_blocked_in_place_with_scratch(
&mut frontal,
expanded_ncol,
may_delay,
¶ms.bk,
&mut ws.factor_scratch,
)?
};
ws.frontal_values = frontal.data;
let node_inertia = ff.inertia.clone();
let node_nelim = ff.nelim;
let node_n_delayed = ff.n_delayed;
if ff.contrib_dim > 0 {
let cdim = ff.contrib_dim;
let mut contrib_row_indices = Vec::with_capacity(cdim);
for cj in 0..cdim {
contrib_row_indices.push(row_indices[ff.perm[node_nelim + cj]]);
}
let contrib_data = std::mem::take(&mut ff.contrib);
contrib_blocks[snode_idx] = Some(ContribBlock {
row_indices: contrib_row_indices,
data: contrib_data,
dim: cdim,
n_delayed: node_n_delayed,
});
}
for &global in &row_indices {
ws.row_map[global] = usize::MAX;
}
Ok(NodeFactors {
first_col: snode.first_col,
ncol: expanded_ncol,
nelim: node_nelim,
n_delayed_in: 0,
nrow: actual_nrow,
row_indices,
frontal_factors: ff,
inertia: node_inertia,
})
}
pub const N_PAR_MIN: usize = 32;
pub const PAR_MIN_FLOPS: u64 = 10_000_000;
pub fn estimate_assembly_flops(supernodes: &[Supernode]) -> u64 {
supernodes
.iter()
.map(|s| {
let ncol = s.ncol as u64;
let nrow = s.nrow as u64;
ncol.saturating_mul(nrow).saturating_mul(nrow)
})
.fold(0u64, |acc, x| acc.saturating_add(x))
}
pub fn should_parallelize_assembly(symbolic: &SymbolicFactorization) -> bool {
should_parallelize_assembly_with_threshold(symbolic, PAR_MIN_FLOPS)
}
pub fn should_parallelize_assembly_with_threshold(
symbolic: &SymbolicFactorization,
min_flops: u64,
) -> bool {
if symbolic.supernodes.len() < N_PAR_MIN {
return false;
}
if !symbolic.supernodes.iter().any(|s| s.children.len() >= 2) {
return false;
}
estimate_assembly_flops(&symbolic.supernodes) >= min_flops
}
pub fn factorize_multifrontal_parallel_with_workspace(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
ws: &mut FactorWorkspace,
) -> Result<(SparseFactors, Inertia), FeralError> {
if should_use_dense_fast_path(matrix.n, matrix.row_idx.len()) {
return dense_fast_factor_with_workspace(matrix, params, ws);
}
let threshold = params.min_parallel_flops.unwrap_or(PAR_MIN_FLOPS);
if should_parallelize_assembly_with_threshold(symbolic, threshold) {
return factorize_multifrontal_supernodal_parallel(matrix, symbolic, params);
}
factorize_multifrontal_supernodal_with_workspace(matrix, symbolic, params, ws)
}
pub fn factorize_multifrontal_parallel(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
) -> Result<(SparseFactors, Inertia), FeralError> {
let mut ws = FactorWorkspace::new();
factorize_multifrontal_parallel_with_workspace(matrix, symbolic, params, &mut ws)
}
pub fn factorize_multifrontal_supernodal_parallel(
matrix: &CscMatrix,
symbolic: &SymbolicFactorization,
params: &NumericParams,
) -> Result<(SparseFactors, Inertia), FeralError> {
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Mutex;
let intrafront_on = !matches!(
std::env::var("FERAL_INTRAFRONT").as_deref(),
Ok("0") | Ok("off") | Ok("false") | Ok("no")
);
let params = &{
let mut p = params.clone();
p.bk.intrafront_parallel = intrafront_on;
p
};
let n = symbolic.n;
let n_snodes = symbolic.supernodes.len();
let telemetry = params.parallel_telemetry.as_deref();
let t_phase = telemetry.map(|_| std::time::Instant::now());
let (scaling_user, scaling_info) =
compute_scaling_with_cache(matrix, ¶ms.scaling, symbolic.cached_mc64.as_ref())?;
if params.warn_partial_singular {
if let crate::scaling::ScalingInfo::PartialSingular { n_unmatched } = &scaling_info {
eprintln!(
"warning: MC64 matching left {} of {} variables unmatched; \
scaling is identity on those rows/columns",
n_unmatched, n
);
}
}
let scaling_pivot_order: Vec<f64> =
symbolic.perm.iter().map(|&old| scaling_user[old]).collect();
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_scaling_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let t_phase = telemetry.map(|_| std::time::Instant::now());
let (permuted, _) = permute_csc_values(matrix, &symbolic.perm, &symbolic.perm_inv, false)?;
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_permute_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let local_params = apply_post_scaling_overrides(
params,
scaled_matrix_infnorm(&permuted, &scaling_pivot_order),
n,
);
let params: &NumericParams = local_params.as_ref().unwrap_or(params);
let t_phase = telemetry.map(|_| std::time::Instant::now());
let full_pattern = &symbolic.permuted_pattern;
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_symmetric_pattern_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let t_phase = telemetry.map(|_| std::time::Instant::now());
let mut is_root = vec![true; n_snodes];
for snode in &symbolic.supernodes {
for &child_idx in &snode.children {
if child_idx < n_snodes {
is_root[child_idx] = false;
}
}
}
let mut parents: Vec<Option<usize>> = vec![None; n_snodes];
for (i, snode) in symbolic.supernodes.iter().enumerate() {
for &c in &snode.children {
if c < n_snodes {
parents[c] = Some(i);
}
}
}
let pending: Vec<AtomicUsize> = symbolic
.supernodes
.iter()
.map(|s| {
let cnt = s.children.iter().filter(|&&c| c < n_snodes).count();
AtomicUsize::new(cnt)
})
.collect();
let contrib_blocks: Mutex<Vec<Option<ContribBlock>>> =
Mutex::new((0..n_snodes).map(|_| None).collect());
let node_factors_out: Mutex<Vec<Option<NodeFactors>>> =
Mutex::new((0..n_snodes).map(|_| None).collect());
let first_error: Mutex<Option<FeralError>> = Mutex::new(None);
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_tree_setup_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let t_phase = telemetry.map(|_| std::time::Instant::now());
let num_threads = rayon::current_num_threads().max(1);
let thread_ws: Vec<Mutex<FactorWorkspace>> = (0..num_threads + 1)
.map(|_| {
let mut w = FactorWorkspace::new();
w.row_map.resize(n, usize::MAX);
w.build_seen.resize(n, false);
w.local_contribs.resize_with(n_snodes, || None);
Mutex::new(w)
})
.collect();
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_thread_ws_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let t_phase = telemetry.map(|_| std::time::Instant::now());
let leaves: Vec<usize> = symbolic
.supernodes
.iter()
.enumerate()
.filter_map(|(i, s)| {
if s.children.iter().all(|&c| c >= n_snodes) {
Some(i)
} else {
None
}
})
.collect();
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_leaves_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let t_phase = telemetry.map(|_| std::time::Instant::now());
rayon::scope(|scope| {
for &leaf_idx in &leaves {
run_parallel_task(
scope,
leaf_idx,
symbolic,
&permuted,
full_pattern,
&scaling_pivot_order,
&is_root,
params,
&parents,
&pending,
&contrib_blocks,
&node_factors_out,
&first_error,
&thread_ws,
);
}
});
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_scope_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let t_phase = telemetry.map(|_| std::time::Instant::now());
let err_opt = match first_error.into_inner() {
Ok(v) => v,
Err(p) => p.into_inner(),
};
if let Some(err) = err_opt {
return Err(err);
}
let nodes_vec = match node_factors_out.into_inner() {
Ok(v) => v,
Err(p) => p.into_inner(),
};
let mut final_nodes: Vec<NodeFactors> = Vec::with_capacity(n_snodes);
let mut total_inertia = Inertia {
positive: 0,
negative: 0,
zero: 0,
};
let mut needs_refinement = false;
for opt in nodes_vec.into_iter() {
let node = match opt {
Some(n) => n,
None => {
return Err(FeralError::InvalidInput(
"parallel driver: supernode was not processed (graph stall)".to_string(),
));
}
};
total_inertia.positive += node.inertia.positive;
total_inertia.negative += node.inertia.negative;
total_inertia.zero += node.inertia.zero;
if node.frontal_factors.needs_refinement {
needs_refinement = true;
}
final_nodes.push(node);
}
if let (Some(t), Some(start)) = (telemetry, t_phase) {
t.phase_collect_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
Ok((
SparseFactors {
n,
perm: symbolic.perm.clone(),
perm_inv: symbolic.perm_inv.clone(),
node_factors: final_nodes,
needs_refinement,
scaling: scaling_user,
scaling_info,
resolved_method: symbolic.resolved_method,
resolved_amalgamation: symbolic.resolved_amalgamation,
resolved_preprocess: symbolic.resolved_preprocess,
},
total_inertia,
))
}
#[allow(clippy::too_many_arguments)]
fn run_parallel_task<'a>(
scope: &rayon::Scope<'a>,
snode_idx: usize,
symbolic: &'a SymbolicFactorization,
permuted: &'a CscMatrix,
full_pattern: &'a crate::sparse::csc::CscPattern,
scaling_pivot_order: &'a [f64],
is_root: &'a [bool],
params: &'a NumericParams,
parents: &'a [Option<usize>],
pending: &'a [std::sync::atomic::AtomicUsize],
contrib_blocks: &'a std::sync::Mutex<Vec<Option<ContribBlock>>>,
node_factors_out: &'a std::sync::Mutex<Vec<Option<NodeFactors>>>,
first_error: &'a std::sync::Mutex<Option<FeralError>>,
thread_ws: &'a [std::sync::Mutex<FactorWorkspace>],
) {
use std::sync::atomic::Ordering;
scope.spawn(move |s| {
let telemetry = params.parallel_telemetry.as_deref();
let t_task_start = telemetry.map(|_| std::time::Instant::now());
if let Some(t) = telemetry {
t.n_tasks.fetch_add(1, Ordering::Relaxed);
}
{
let err_guard = match first_error.lock() {
Ok(g) => g,
Err(p) => p.into_inner(),
};
if err_guard.is_some() {
if let (Some(t), Some(start)) = (telemetry, t_task_start) {
t.task_wall_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
return;
}
}
let snode = &symbolic.supernodes[snode_idx];
let n_snodes = symbolic.supernodes.len();
let thread_idx = rayon::current_thread_index().unwrap_or(thread_ws.len() - 1);
let thread_idx = thread_idx.min(thread_ws.len() - 1);
let ws_mtx = &thread_ws[thread_idx];
let mut prof_us: Option<u64> = None;
let (result, own_contrib) = {
let t_ws_wait = telemetry.map(|_| std::time::Instant::now());
let mut ws_guard = match ws_mtx.lock() {
Ok(g) => g,
Err(p) => p.into_inner(),
};
if let (Some(t), Some(start)) = (telemetry, t_ws_wait) {
t.ws_lock_wait_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let mut local_contribs = std::mem::take(&mut ws_guard.local_contribs);
{
let t_wait_start = telemetry.map(|_| std::time::Instant::now());
let mut shared = match contrib_blocks.lock() {
Ok(g) => g,
Err(p) => p.into_inner(),
};
if let (Some(t), Some(start)) = (telemetry, t_wait_start) {
t.contrib_wait_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let hold_start = telemetry.map(|_| std::time::Instant::now());
for &c in &snode.children {
if c < n_snodes {
local_contribs[c] = shared[c].take();
}
}
if let (Some(t), Some(start)) = (telemetry, hold_start) {
t.contrib_hold_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
}
let factor_start = telemetry.map(|_| std::time::Instant::now());
let prof_start = params.profiler.as_ref().map(|_| std::time::Instant::now());
let res = factor_one_supernode(
snode_idx,
symbolic,
permuted,
full_pattern,
scaling_pivot_order,
is_root,
params,
&mut ws_guard,
&mut local_contribs,
0, );
if let (Some(t), Some(start)) = (telemetry, factor_start) {
t.factor_body_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
if let Some(start) = prof_start {
prof_us = Some(start.elapsed().as_micros() as u64);
}
let own = local_contribs[snode_idx].take();
ws_guard.local_contribs = local_contribs;
(res, own)
};
match result {
Ok(node) => {
if let (Some(arc), Some(us)) = (params.profiler.as_ref(), prof_us) {
let timing = SupernodeTiming {
snode_idx,
nrow: snode.nrow,
ncol: snode.ncol,
us,
assembly_us: 0,
densefactor_us: 0,
panelfactor_us: 0,
schur_us: 0,
scalartail_us: 0,
};
if let Ok(mut prof) = arc.lock() {
prof.timings.push(timing);
}
}
{
let t_wait_start = telemetry.map(|_| std::time::Instant::now());
let mut shared = match contrib_blocks.lock() {
Ok(g) => g,
Err(p) => p.into_inner(),
};
if let (Some(t), Some(start)) = (telemetry, t_wait_start) {
t.contrib_wait_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let hold_start = telemetry.map(|_| std::time::Instant::now());
shared[snode_idx] = own_contrib;
if let (Some(t), Some(start)) = (telemetry, hold_start) {
t.contrib_hold_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
}
{
let t_wait_start = telemetry.map(|_| std::time::Instant::now());
let mut nf = match node_factors_out.lock() {
Ok(g) => g,
Err(p) => p.into_inner(),
};
if let (Some(t), Some(start)) = (telemetry, t_wait_start) {
t.node_factors_wait_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
let hold_start = telemetry.map(|_| std::time::Instant::now());
nf[snode_idx] = Some(node);
if let (Some(t), Some(start)) = (telemetry, hold_start) {
t.node_factors_hold_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
}
if let Some(parent_idx) = parents[snode_idx] {
let prev = pending[parent_idx].fetch_sub(1, Ordering::AcqRel);
if prev == 1 {
run_parallel_task(
s,
parent_idx,
symbolic,
permuted,
full_pattern,
scaling_pivot_order,
is_root,
params,
parents,
pending,
contrib_blocks,
node_factors_out,
first_error,
thread_ws,
);
}
}
}
Err(e) => {
let mut err_guard = match first_error.lock() {
Ok(g) => g,
Err(p) => p.into_inner(),
};
if err_guard.is_none() {
*err_guard = Some(e);
}
}
}
if let (Some(t), Some(start)) = (telemetry, t_task_start) {
t.task_wall_ns
.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
}
});
}
fn scaled_matrix_infnorm(permuted: &CscMatrix, scaling: &[f64]) -> f64 {
let n = permuted.n;
let mut row_sum = vec![0.0_f64; n];
for j in 0..n {
let s_j = scaling[j];
for k in permuted.col_ptr[j]..permuted.col_ptr[j + 1] {
let i = permuted.row_idx[k];
let m = (permuted.values[k] * scaling[i] * s_j).abs();
row_sum[i] += m;
if i != j {
row_sum[j] += m;
}
}
}
row_sum.into_iter().fold(0.0_f64, f64::max)
}
fn scaled_matrix_infnorm_dense(data: &[f64], n: usize) -> f64 {
let mut row_sum = vec![0.0_f64; n];
for j in 0..n {
for i in j..n {
let m = data[j * n + i].abs();
row_sum[i] += m;
if i != j {
row_sum[j] += m;
}
}
}
row_sum.into_iter().fold(0.0_f64, f64::max)
}
#[inline]
fn null_pivot_floor(scaled_infnorm: f64, n: usize) -> f64 {
(n as f64).sqrt() * f64::EPSILON * scaled_infnorm
}
fn apply_post_scaling_overrides(
params: &NumericParams,
scaled_infnorm: f64,
n: usize,
) -> Option<NumericParams> {
let static_floor = params
.static_pivot_threshold
.filter(|t| *t > 0.0)
.map(|t| t * scaled_infnorm)
.filter(|f| f.is_finite() && *f > 0.0);
let null_floor = if matches!(params.bk.on_zero_pivot, ZeroPivotAction::Fail) {
None
} else {
let floor = null_pivot_floor(scaled_infnorm, n);
(floor > params.bk.null_pivot_tol).then_some(floor)
};
if static_floor.is_none() && null_floor.is_none() {
return None;
}
let mut local = params.clone();
if let Some(sf) = static_floor {
local.bk.static_pivot_floor = sf;
}
if let Some(floor) = null_floor {
local.bk.null_pivot_tol = floor;
local.bk.null_pivot_tol_2x2 = (floor * scaled_infnorm).max(floor * floor);
}
Some(local)
}
fn permute_csc_values(
matrix: &CscMatrix,
_perm: &[usize],
perm_inv: &[usize],
profile: bool,
) -> Result<(CscMatrix, u64), FeralError> {
let n = matrix.n;
let mut triplets: Vec<(usize, usize, f64)> = Vec::with_capacity(matrix.nnz());
for old_j in 0..n {
let new_j = perm_inv[old_j];
for k in matrix.col_ptr[old_j]..matrix.col_ptr[old_j + 1] {
let old_i = matrix.row_idx[k];
let new_i = perm_inv[old_i];
let val = matrix.values[k];
if new_i >= new_j {
triplets.push((new_i, new_j, val));
} else {
triplets.push((new_j, new_i, val));
}
}
}
let rows: Vec<usize> = triplets.iter().map(|t| t.0).collect();
let cols: Vec<usize> = triplets.iter().map(|t| t.1).collect();
let vals: Vec<f64> = triplets.iter().map(|t| t.2).collect();
let t_ft = profile.then(Instant::now);
let permuted = CscMatrix::from_triplets(n, &rows, &cols, &vals)?;
let from_triplets_us = t_ft.map(|t| t.elapsed().as_micros() as u64).unwrap_or(0);
Ok((permuted, from_triplets_us))
}
fn permute_csc_values_with_cache(
matrix: &CscMatrix,
perm: &[usize],
perm_inv: &[usize],
profile: bool,
pattern_reused_hint: bool,
cache: &mut Option<PermuteCache>,
) -> Result<(CscMatrix, u64), FeralError> {
let n = matrix.n;
let nnz = matrix.nnz();
if pattern_reused_hint {
if let Some(c) = cache.as_ref() {
if c.input_n == n
&& c.input_nnz == nnz
&& c.value_map.len() == nnz
&& c.input_col_ptr == matrix.col_ptr
&& c.input_row_idx == matrix.row_idx
&& c.input_perm_inv.as_slice() == perm_inv
{
let permuted_nnz = c.permuted_row_idx.len();
let mut values = vec![0.0f64; permuted_nnz];
for k in 0..nnz {
let dest = c.value_map[k];
debug_assert!(dest < permuted_nnz);
values[dest] += matrix.values[k];
}
let permuted = CscMatrix {
n,
col_ptr: c.permuted_col_ptr.clone(),
row_idx: c.permuted_row_idx.clone(),
values,
};
return Ok((permuted, 0));
}
}
}
let (permuted, from_triplets_us) = permute_csc_values(matrix, perm, perm_inv, profile)?;
if pattern_reused_hint {
match build_permute_value_map(matrix, perm_inv, &permuted) {
Ok(value_map) => {
*cache = Some(PermuteCache {
input_n: n,
input_nnz: nnz,
input_col_ptr: matrix.col_ptr.clone(),
input_row_idx: matrix.row_idx.clone(),
input_perm_inv: perm_inv.to_vec(),
permuted_col_ptr: permuted.col_ptr.clone(),
permuted_row_idx: permuted.row_idx.clone(),
value_map,
});
}
Err(_) => {
*cache = None;
}
}
} else {
*cache = None;
}
Ok((permuted, from_triplets_us))
}
fn build_permute_value_map(
matrix: &CscMatrix,
perm_inv: &[usize],
permuted: &CscMatrix,
) -> Result<Vec<usize>, FeralError> {
let n = matrix.n;
let nnz = matrix.nnz();
let mut value_map = vec![0usize; nnz];
for old_j in 0..n {
let new_j = perm_inv[old_j];
#[allow(clippy::needless_range_loop)]
for k in matrix.col_ptr[old_j]..matrix.col_ptr[old_j + 1] {
let old_i = matrix.row_idx[k];
let new_i = perm_inv[old_i];
let (lr, lc) = if new_i >= new_j {
(new_i, new_j)
} else {
(new_j, new_i)
};
let col_start = permuted.col_ptr[lc];
let col_end = permuted.col_ptr[lc + 1];
let dest_off = permuted.row_idx[col_start..col_end]
.binary_search(&lr)
.map_err(|_| {
FeralError::InvalidInput(format!(
"permute cache build: row {} not found in permuted column {} \
(col range {}..{}). Should not occur on valid CSC input.",
lr, lc, col_start, col_end
))
})?;
value_map[k] = col_start + dest_off;
}
}
Ok(value_map)
}
fn build_row_indices(
snode: &crate::symbolic::supernode::Supernode,
full_pattern: &crate::sparse::csc::CscPattern,
contrib_blocks: &[Option<ContribBlock>],
build_delayed: &mut Vec<usize>,
build_trailing: &mut Vec<usize>,
build_seen: &mut Vec<bool>,
) -> Vec<usize> {
let own_ncol = snode.ncol();
let first_col = snode.first_col;
let n = full_pattern.n;
if build_seen.len() < n {
build_seen.resize(n, false);
}
build_delayed.clear();
for &child_idx in &snode.children {
if let Some(contrib) = &contrib_blocks[child_idx] {
build_delayed.extend_from_slice(&contrib.row_indices[..contrib.n_delayed]);
}
}
for seen in build_seen.iter_mut().skip(first_col).take(own_ncol) {
*seen = true;
}
for &c in build_delayed.iter() {
build_seen[c] = true;
}
let own_last = first_col + own_ncol;
build_trailing.clear();
for j in first_col..first_col + own_ncol {
for k in full_pattern.col_ptr[j]..full_pattern.col_ptr[j + 1] {
let r = full_pattern.row_idx[k];
if r < own_last {
continue;
}
if !build_seen[r] {
build_seen[r] = true;
build_trailing.push(r);
}
}
}
for &child_idx in &snode.children {
if let Some(contrib) = &contrib_blocks[child_idx] {
for &r in &contrib.row_indices[contrib.n_delayed..] {
if r < own_last {
continue;
}
if !build_seen[r] {
build_seen[r] = true;
build_trailing.push(r);
}
}
}
}
build_trailing.sort_unstable();
let total = own_ncol + build_delayed.len() + build_trailing.len();
let mut result = Vec::with_capacity(total);
result.extend(first_col..first_col + own_ncol);
result.extend_from_slice(build_delayed);
result.extend_from_slice(build_trailing);
for seen in build_seen.iter_mut().skip(first_col).take(own_ncol) {
*seen = false;
}
for &c in build_delayed.iter() {
build_seen[c] = false;
}
for &r in build_trailing.iter() {
build_seen[r] = false;
}
result
}
#[derive(Debug)]
struct ContribBlock {
row_indices: Vec<usize>,
data: Vec<f64>,
dim: usize,
n_delayed: usize,
}
fn extend_add(contrib: &ContribBlock, parent_row_map: &[usize], frontal: &mut SymmetricMatrix) {
let cdim = contrib.dim;
let fn_ = frontal.n;
let f_data: &mut [f64] = frontal.data.as_mut_slice();
for cj in 0..cdim {
let parent_j = parent_row_map[contrib.row_indices[cj]];
if parent_j == usize::MAX {
continue;
}
for ci in cj..cdim {
let parent_i = parent_row_map[contrib.row_indices[ci]];
if parent_i == usize::MAX {
continue;
}
let val = contrib.data[cj * cdim + ci];
if val == 0.0 {
continue;
}
let (row, col) = if parent_i >= parent_j {
(parent_i, parent_j)
} else {
(parent_j, parent_i)
};
f_data[col * fn_ + row] += val;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::dense::factor::ZeroPivotAction;
use crate::symbolic::{symbolic_factorize, SupernodeParams};
fn make_params() -> NumericParams {
NumericParams::with_bk(BunchKaufmanParams {
on_zero_pivot: ZeroPivotAction::ForceAccept,
..BunchKaufmanParams::default()
})
}
#[test]
fn test_factorize_diagonal() {
let m = CscMatrix::from_triplets(3, &[0, 1, 2], &[0, 1, 2], &[2.0, 3.0, 5.0]).unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let (factors, inertia) = factorize_multifrontal(&m, &sym, &make_params()).unwrap();
assert_eq!(inertia.positive, 3);
assert_eq!(inertia.negative, 0);
assert_eq!(inertia.zero, 0);
assert_eq!(factors.n, 3);
}
#[test]
fn test_summary_one_liner() {
let n = 32usize;
let mut rows: Vec<usize> = Vec::new();
let mut cols: Vec<usize> = Vec::new();
let mut vals: Vec<f64> = Vec::new();
for i in 0..n {
rows.push(i);
cols.push(i);
vals.push(2.0);
if i + 1 < n {
rows.push(i + 1);
cols.push(i);
vals.push(-1.0);
}
}
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let (factors, inertia) = factorize_multifrontal(&m, &sym, &make_params()).unwrap();
let s = factors.summary();
assert!(s.contains("ord="), "summary missing ord field: {}", s);
assert!(s.contains("amalg="), "summary missing amalg field: {}", s);
assert!(
s.contains("preproc="),
"summary missing preproc field: {}",
s
);
assert!(
s.contains("scaling="),
"summary missing scaling field: {}",
s
);
let nnz_l = factors.factor_nnz();
assert!(nnz_l > 0, "tridiagonal factor_nnz must be > 0");
assert!(
s.contains(&format!("nnz_L={}", nnz_l)),
"summary nnz_L mismatch: got {}, want nnz_L={}",
s,
nnz_l
);
let expected = format!(
"inertia=({},{},{})",
inertia.positive, inertia.negative, inertia.zero
);
assert!(
s.contains(&expected),
"summary inertia mismatch: got {}, want substring {}",
s,
expected
);
assert_ne!(
factors.resolved_amalgamation,
crate::symbolic::AmalgamationStrategy::Auto
);
assert_ne!(
factors.resolved_method,
crate::symbolic::OrderingMethod::Auto
);
assert_ne!(
factors.resolved_preprocess,
crate::symbolic::OrderingPreprocess::Auto
);
assert_eq!(factors.resolved_method, sym.resolved_method);
assert_eq!(factors.resolved_amalgamation, sym.resolved_amalgamation);
assert_eq!(factors.resolved_preprocess, sym.resolved_preprocess);
}
#[test]
fn test_factorize_tridiagonal() {
let m = CscMatrix::from_triplets(
3,
&[0, 1, 1, 2, 2],
&[0, 0, 1, 1, 2],
&[2.0, -1.0, 2.0, -1.0, 2.0],
)
.unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let (factors, inertia) = factorize_multifrontal(&m, &sym, &make_params()).unwrap();
assert_eq!(inertia.positive, 3);
assert_eq!(inertia.negative, 0);
assert_eq!(inertia.zero, 0);
assert_eq!(factors.n, 3);
}
#[test]
fn test_factorize_matches_dense() {
let m = CscMatrix::from_triplets(
3,
&[0, 1, 1, 2, 2],
&[0, 0, 1, 1, 2],
&[2.0, -1.0, 3.0, -1.0, 4.0],
)
.unwrap();
let dense_mat = m.to_dense();
let params = make_params();
let (_, dense_inertia) = factor(&dense_mat, ¶ms.bk).unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let (_, sparse_inertia) = factorize_multifrontal(&m, &sym, ¶ms).unwrap();
assert_eq!(sparse_inertia, dense_inertia);
}
#[test]
fn test_factorize_kkt() {
let m = CscMatrix::from_triplets(
3,
&[0, 1, 2, 2, 2],
&[0, 1, 0, 1, 2],
&[2.0, 3.0, 1.0, 1.0, -1e-8],
)
.unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let params = make_params();
let (_, inertia) = factorize_multifrontal(&m, &sym, ¶ms).unwrap();
assert_eq!(inertia.positive, 2);
assert_eq!(inertia.negative, 1);
assert_eq!(inertia.zero, 0);
}
#[test]
fn test_factorize_indefinite() {
let m = CscMatrix::from_triplets(2, &[0, 1, 1], &[0, 0, 1], &[1.0, 2.0, 1.0]).unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let params = make_params();
let (_, inertia) = factorize_multifrontal(&m, &sym, ¶ms).unwrap();
assert_eq!(inertia.positive, 1);
assert_eq!(inertia.negative, 1);
assert_eq!(inertia.zero, 0);
}
#[test]
fn factorize_multifrontal_with_two_strategies_on_one_symbolic() {
use crate::scaling::ScalingStrategy;
let m = CscMatrix::from_triplets(
3,
&[0, 2, 1, 2, 2],
&[0, 0, 1, 1, 2],
&[2.0, -1.0, 2.0, -1.0, 0.0],
)
.unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let infnorm = NumericParams {
bk: BunchKaufmanParams {
on_zero_pivot: ZeroPivotAction::ForceAccept,
..BunchKaufmanParams::default()
},
scaling: ScalingStrategy::InfNorm,
small_leaf: Default::default(),
profiler: None,
parallel_telemetry: None,
fma: false,
allow_delayed_pivots: true,
cascade_break_ratio: None,
cascade_break_eps: None,
min_parallel_flops: None,
sqd_mode: false,
static_pivot_threshold: None,
warn_partial_singular: false,
pattern_reused_hint: false,
};
let identity = NumericParams {
bk: infnorm.bk.clone(),
scaling: ScalingStrategy::Identity,
small_leaf: Default::default(),
profiler: None,
parallel_telemetry: None,
fma: false,
allow_delayed_pivots: true,
cascade_break_ratio: None,
cascade_break_eps: None,
min_parallel_flops: None,
sqd_mode: false,
static_pivot_threshold: None,
warn_partial_singular: false,
pattern_reused_hint: false,
};
let (_, i_inf) = factorize_multifrontal(&m, &sym, &infnorm).unwrap();
let (_, i_id) = factorize_multifrontal(&m, &sym, &identity).unwrap();
assert_eq!(i_inf.positive, 2);
assert_eq!(i_inf.negative, 1);
assert_eq!(i_id.positive, 2);
assert_eq!(i_id.negative, 1);
}
#[test]
fn dense_fast_factor_auto_matches_explicit_infnorm_bitwise() {
use crate::scaling::ScalingStrategy;
let n = 24usize;
let mut rows = Vec::new();
let mut cols = Vec::new();
let mut vals = Vec::new();
for j in 0..n {
rows.push(j);
cols.push(j);
vals.push(10.0 * (j as f64 + 1.0));
for i in (j + 1)..n {
rows.push(i);
cols.push(j);
vals.push(1.0 + 0.1 * (i - j) as f64);
}
}
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
assert!(
should_use_dense_fast_path(m.n, m.row_idx.len()),
"test setup: dense fast-path gate must fire"
);
let auto = NumericParams {
bk: BunchKaufmanParams {
on_zero_pivot: ZeroPivotAction::ForceAccept,
..BunchKaufmanParams::default()
},
scaling: ScalingStrategy::Auto,
small_leaf: Default::default(),
profiler: None,
parallel_telemetry: None,
fma: false,
allow_delayed_pivots: true,
cascade_break_ratio: None,
cascade_break_eps: None,
min_parallel_flops: None,
sqd_mode: false,
static_pivot_threshold: None,
warn_partial_singular: false,
pattern_reused_hint: false,
};
let infnorm = NumericParams {
scaling: ScalingStrategy::InfNorm,
..auto.clone()
};
let (f_auto, i_auto) = dense_fast_factor(&m, &auto).unwrap();
let (f_inf, i_inf) = dense_fast_factor(&m, &infnorm).unwrap();
assert_eq!(i_auto, i_inf, "inertia diverged under Auto vs InfNorm");
assert_eq!(
f_auto.node_factors.len(),
f_inf.node_factors.len(),
"node count diverged",
);
for (k, (na, ni)) in f_auto
.node_factors
.iter()
.zip(f_inf.node_factors.iter())
.enumerate()
{
assert_eq!(
na.frontal_factors.d_diag.len(),
ni.frontal_factors.d_diag.len(),
"node {} D-diag length diverged",
k,
);
for (j, (da, di)) in na
.frontal_factors
.d_diag
.iter()
.zip(ni.frontal_factors.d_diag.iter())
.enumerate()
{
assert_eq!(
da.to_bits(),
di.to_bits(),
"node {} D_diag[{}] bits differ: auto={} infnorm={}",
k,
j,
da,
di,
);
}
}
}
fn small_kkt_6x6_for_schur() -> CscMatrix {
let mut rows = Vec::new();
let mut cols = Vec::new();
let mut vals = Vec::new();
for i in 0..4 {
rows.push(i);
cols.push(i);
vals.push((i + 1) as f64);
}
rows.push(4);
cols.push(0);
vals.push(0.5);
rows.push(4);
cols.push(2);
vals.push(0.7);
rows.push(5);
cols.push(1);
vals.push(0.3);
rows.push(5);
cols.push(3);
vals.push(0.9);
rows.push(4);
cols.push(4);
vals.push(1.5);
rows.push(5);
cols.push(4);
vals.push(0.2);
rows.push(5);
cols.push(5);
vals.push(2.5);
CscMatrix::from_triplets(6, &rows, &cols, &vals).unwrap()
}
fn hand_computed_schur_2x2() -> [[f64; 2]; 2] {
let s00 = 1.5 - (0.25 + 0.49 / 3.0);
let s11 = 2.5 - (0.045 + 0.2025);
let s01 = 0.2;
[[s00, s01], [s01, s11]]
}
#[test]
fn schur_block_matches_hand_computed_for_small_kkt() {
let m = small_kkt_6x6_for_schur();
let params = crate::symbolic::SupernodeParams::default();
let sym = crate::symbolic::symbolic_factorize_with_schur(&m, ¶ms, &[4, 5]).unwrap();
let nparams = NumericParams {
scaling: crate::scaling::ScalingStrategy::Identity,
..NumericParams::default()
};
let (_factors, _inertia, schur) =
factorize_multifrontal_with_schur(&m, &sym, &nparams).unwrap();
assert_eq!(schur.dim, 2);
let expected = hand_computed_schur_2x2();
let tol = 1e-12;
for i in 0..2 {
for j in 0..2 {
let got = schur.get(i, j);
let want = expected[i][j];
assert!(
(got - want).abs() < tol,
"S({},{}) got {} want {} diff {}",
i,
j,
got,
want,
(got - want).abs()
);
}
}
}
#[test]
fn schur_block_full_square_storage() {
let m = small_kkt_6x6_for_schur();
let params = crate::symbolic::SupernodeParams::default();
let sym = crate::symbolic::symbolic_factorize_with_schur(&m, ¶ms, &[4, 5]).unwrap();
let nparams = NumericParams {
scaling: crate::scaling::ScalingStrategy::Identity,
..NumericParams::default()
};
let (_, _, schur) = factorize_multifrontal_with_schur(&m, &sym, &nparams).unwrap();
for i in 0..schur.dim {
for j in 0..schur.dim {
assert!((schur.get(i, j) - schur.get(j, i)).abs() < 1e-15);
}
}
}
#[test]
fn schur_user_order_preserved_when_reversed() {
let m = small_kkt_6x6_for_schur();
let params = crate::symbolic::SupernodeParams::default();
let sym = crate::symbolic::symbolic_factorize_with_schur(&m, ¶ms, &[5, 4]).unwrap();
let nparams = NumericParams {
scaling: crate::scaling::ScalingStrategy::Identity,
..NumericParams::default()
};
let (_, _, schur) = factorize_multifrontal_with_schur(&m, &sym, &nparams).unwrap();
let hand = hand_computed_schur_2x2();
let map = [1usize, 0usize];
let tol = 1e-12;
for i in 0..2 {
for j in 0..2 {
let got = schur.get(i, j);
let want = hand[map[i]][map[j]];
assert!(
(got - want).abs() < tol,
"reversed S({},{}) got {} want {}",
i,
j,
got,
want
);
}
}
}
#[test]
fn schur_rejects_symbolic_without_schur_tail() {
let m = small_kkt_6x6_for_schur();
let params = crate::symbolic::SupernodeParams::default();
let sym = crate::symbolic::symbolic_factorize(&m, ¶ms).unwrap(); assert_eq!(sym.is_schur_tail, None);
let nparams = NumericParams::default();
let r = factorize_multifrontal_with_schur(&m, &sym, &nparams);
assert!(matches!(r, Err(FeralError::InvalidInput(_))));
}
#[test]
fn schur_multi_supernode_tail_matches_oracle() {
let half = 25usize;
let k_each = 40usize;
let n = 2 * half + 2 * k_each;
let mut rows: Vec<usize> = Vec::new();
let mut cols: Vec<usize> = Vec::new();
let mut vals: Vec<f64> = Vec::new();
for i in 0..n {
rows.push(i);
cols.push(i);
vals.push(2.0 + i as f64);
}
for i in 0..half {
for s in 0..k_each {
let j = 2 * half + s;
rows.push(j);
cols.push(i);
vals.push(0.1);
}
}
for i in half..2 * half {
for s in 0..k_each {
let j = 2 * half + k_each + s;
rows.push(j);
cols.push(i);
vals.push(0.1);
}
}
for s in 0..k_each {
for t in 0..s {
rows.push(2 * half + s);
cols.push(2 * half + t);
vals.push(0.05);
rows.push(2 * half + k_each + s);
cols.push(2 * half + k_each + t);
vals.push(0.05);
}
}
for s in (2 * half + 1)..n {
rows.push(s);
cols.push(s - 1);
vals.push(0.03);
}
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
let schur: Vec<usize> = (2 * half..n).collect();
let n_schur = schur.len();
let params = crate::symbolic::SupernodeParams::default();
let sym = crate::symbolic::symbolic_factorize_with_schur(&m, ¶ms, &schur).unwrap();
let nparams = NumericParams {
scaling: crate::scaling::ScalingStrategy::Identity,
..NumericParams::default()
};
let (_, _, schur_block) = factorize_multifrontal_with_schur(&m, &sym, &nparams).unwrap();
assert_eq!(schur_block.dim, n_schur);
let mut is_schur = vec![false; n];
for &i in &schur {
is_schur[i] = true;
}
let f_indices: Vec<usize> = (0..n).filter(|i| !is_schur[*i]).collect();
let nf = f_indices.len();
let mut f_inv = vec![usize::MAX; n];
for (k, &i) in f_indices.iter().enumerate() {
f_inv[i] = k;
}
let mut a = vec![0.0f64; n * n];
for j in 0..n {
for k in m.col_ptr[j]..m.col_ptr[j + 1] {
let i = m.row_idx[k];
a[j * n + i] = m.values[k];
if i != j {
a[i * n + j] = m.values[k];
}
}
}
let mut tr = (Vec::new(), Vec::new(), Vec::new());
for j in 0..n {
if is_schur[j] {
continue;
}
for k in m.col_ptr[j]..m.col_ptr[j + 1] {
let i = m.row_idx[k];
if !is_schur[i] {
tr.0.push(f_inv[i]);
tr.1.push(f_inv[j]);
tr.2.push(m.values[k]);
}
}
}
let a_ff = CscMatrix::from_triplets(nf, &tr.0, &tr.1, &tr.2).unwrap();
let sym_ff = crate::symbolic::symbolic_factorize(&a_ff, ¶ms).unwrap();
let (factors_ff, _) = factorize_multifrontal(&a_ff, &sym_ff, &nparams).unwrap();
let mut s_ref = vec![0.0f64; n_schur * n_schur];
for (si, &i) in schur.iter().enumerate() {
for (sj, &j) in schur.iter().enumerate() {
s_ref[sj * n_schur + si] = a[j * n + i];
}
}
for (sj, &j) in schur.iter().enumerate() {
let mut rhs = vec![0.0f64; nf];
for &fi in &f_indices {
rhs[f_inv[fi]] = a[j * n + fi];
}
let x = crate::numeric::solve::solve_sparse(&factors_ff, &rhs).unwrap();
for (si, &i) in schur.iter().enumerate() {
let mut acc = 0.0;
for &fi in &f_indices {
acc += a[i * n + fi] * x[f_inv[fi]];
}
s_ref[sj * n_schur + si] -= acc;
}
}
let mut max_rel = 0.0f64;
for sj in 0..n_schur {
for si in 0..n_schur {
let want = s_ref[sj * n_schur + si];
let got = schur_block.get(si, sj);
let denom = want.abs().max(1e-14);
let rel = (got - want).abs() / denom;
if rel > max_rel {
max_rel = rel;
}
}
}
assert!(
max_rel < 1e-10,
"Schur block max relative error {} exceeds 1e-10",
max_rel
);
}
#[test]
fn schur_block_solve_small_explicit() {
let dim = 3;
let s = vec![
4.0, -1.0, 0.0, -1.0, 3.0, -1.0, 0.0, -1.0, 2.0, ];
let block = SchurBlock {
dim,
data: s.clone(),
};
let rhs = vec![2.0, 2.0, 4.0];
let x = block.solve(&rhs).expect("solve must succeed");
let want = [1.0, 2.0, 3.0];
for i in 0..dim {
assert!(
(x[i] - want[i]).abs() < 1e-12,
"x[{i}] = {} != {} (rel diff {:.3e})",
x[i],
want[i],
(x[i] - want[i]).abs()
);
}
let mut rhs_check = vec![0.0; dim];
block.symv(&x, &mut rhs_check).expect("symv");
for i in 0..dim {
assert!((rhs_check[i] - rhs[i]).abs() < 1e-12);
}
}
#[test]
fn schur_block_solve_roundtrip_after_factorize() {
let entries = [
(0, 0, 2.0),
(1, 1, 3.0),
(2, 2, 4.0),
(3, 0, 1.0),
(3, 1, 1.0),
(3, 2, 1.0),
];
let n = 4;
let mut rows = Vec::new();
let mut cols = Vec::new();
let mut vals = Vec::new();
for &(r, c, v) in &entries {
rows.push(r);
cols.push(c);
vals.push(v);
}
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
let schur_indices = vec![3];
let snode = crate::symbolic::SupernodeParams::default();
let sym =
crate::symbolic::symbolic_factorize_with_schur(&m, &snode, &schur_indices).unwrap();
let nparams = NumericParams {
scaling: crate::scaling::ScalingStrategy::Identity,
..NumericParams::default()
};
let (_, _, schur_block) = factorize_multifrontal_with_schur(&m, &sym, &nparams).unwrap();
let want_s = -(1.0 / 2.0 + 1.0 / 3.0 + 1.0 / 4.0);
assert!((schur_block.get(0, 0) - want_s).abs() < 1e-12);
let x_target = vec![1.5_f64];
let mut rhs_s = vec![0.0; 1];
schur_block.symv(&x_target, &mut rhs_s).unwrap();
let x_solved = schur_block.solve(&rhs_s).unwrap();
assert!((x_solved[0] - x_target[0]).abs() < 1e-12);
}
#[test]
fn schur_block_solve_dim_mismatch() {
let block = SchurBlock {
dim: 2,
data: vec![1.0, 0.0, 0.0, 1.0],
};
let rhs = vec![1.0, 2.0, 3.0];
let r = block.solve(&rhs);
assert!(matches!(r, Err(FeralError::DimensionMismatch { .. })));
}
fn issue5_mss1_inertia_sweep_with(
zero_tol: f64,
pivot_threshold: f64,
) -> Option<Vec<(f64, Inertia)>> {
let path = std::path::Path::new("data/matrices/kkt/MSS1/MSS1_0000.mtx");
let mtx = match crate::io::mtx::read_mtx(path) {
Ok(m) => m,
Err(_) => return None, };
let n = mtx.n;
let n_x = 90usize;
assert_eq!(n, 163, "MSS1_0000 expected n=163");
let mut bk = BunchKaufmanParams {
on_zero_pivot: ZeroPivotAction::ForceAccept,
zero_tol,
zero_tol_2x2: zero_tol * zero_tol,
..BunchKaufmanParams::default()
};
bk.pivot_threshold = pivot_threshold;
let params = NumericParams {
bk,
scaling: ScalingStrategy::Identity,
small_leaf: SmallLeafBatch::default(),
profiler: None,
parallel_telemetry: None,
fma: false,
allow_delayed_pivots: true,
cascade_break_ratio: None,
cascade_break_eps: None,
min_parallel_flops: None,
sqd_mode: false,
static_pivot_threshold: None,
warn_partial_singular: false,
pattern_reused_hint: false,
};
let deltas = [0.0, 1e-4, 1e-2, 1.0, 1e2, 1e4, 1e6, 1e8, 1e10, 1e12];
let mut results = Vec::with_capacity(deltas.len());
for &dw in &deltas {
let mut rows: Vec<usize> = Vec::with_capacity(mtx.entries.len() + n_x);
let mut cols: Vec<usize> = Vec::with_capacity(mtx.entries.len() + n_x);
let mut vals: Vec<f64> = Vec::with_capacity(mtx.entries.len() + n_x);
for &(r, c, v) in &mtx.entries {
if r >= n_x && c >= n_x && r == c {
continue;
}
rows.push(r);
cols.push(c);
vals.push(v);
}
for i in 0..n_x {
rows.push(i);
cols.push(i);
vals.push(dw);
}
let csc = CscMatrix::from_triplets(n, &rows, &cols, &vals)
.expect("MSS1_0000 + δ_w·I CSC build");
let sym = symbolic_factorize(&csc, &SupernodeParams::default())
.expect("symbolic factorize MSS1_0000");
let (_factors, inertia) = factorize_multifrontal(&csc, &sym, ¶ms)
.expect("numeric factorize MSS1_0000 + δ_w·I");
results.push((dw, inertia));
}
Some(results)
}
fn issue5_mss1_inertia_sweep() -> Option<Vec<(f64, Inertia)>> {
issue5_mss1_inertia_sweep_with(1e-10, 0.0)
}
#[test]
fn issue_5_mss1_iter0_inertia_wanders_under_delta_w_sweep() {
let Some(results) = issue5_mss1_inertia_sweep() else {
return;
};
for (dw, inertia) in &results {
eprintln!(
"δ_w = {:>8.0e} → inertia(+{:3}, -{:3}, 0:{:>3})",
dw, inertia.positive, inertia.negative, inertia.zero
);
}
let positives: Vec<usize> = results.iter().map(|(_, i)| i.positive).collect();
let any_decrease = positives.windows(2).any(|w| w[1] < w[0]);
assert!(
any_decrease,
"issue #5 regression: expected non-monotone positive count, got monotone: \
{:?}. If this assertion fails, the in-kernel pivot behavior has changed; \
revisit the disposition in dev/research/issue-5-mss1-inertia-monotonicity.md \
§9 and flip the assertion to monotone non-decreasing.",
positives
);
}
#[test]
fn issue_5_mss1_zero_tol_sweep_diagnostic() {
let tols = [1e-10, 1e-8, 1e-6, 1e-4, 1e-2];
for &tol in &tols {
let Some(results) = issue5_mss1_inertia_sweep_with(tol, 0.0) else {
return;
};
let positives: Vec<usize> = results.iter().map(|(_, i)| i.positive).collect();
let negatives: Vec<usize> = results.iter().map(|(_, i)| i.negative).collect();
let zeros: Vec<usize> = results.iter().map(|(_, i)| i.zero).collect();
let monotone = positives.windows(2).all(|w| w[1] >= w[0]);
eprintln!(
"zero_tol = {:>5.0e}: monotone(+) = {}\n +: {:?}\n -: {:?}\n 0: {:?}",
tol, monotone, positives, negatives, zeros
);
}
}
#[test]
fn issue_5_mss1_pivot_threshold_sweep_diagnostic() {
let us = [0.0, 1e-10, 1e-8, 1e-6, 1e-4, 1e-2, 1e-1, 0.5];
for &u in &us {
let Some(results) = issue5_mss1_inertia_sweep_with(1e-10, u) else {
return;
};
let positives: Vec<usize> = results.iter().map(|(_, i)| i.positive).collect();
let negatives: Vec<usize> = results.iter().map(|(_, i)| i.negative).collect();
let zeros: Vec<usize> = results.iter().map(|(_, i)| i.zero).collect();
let monotone = positives.windows(2).all(|w| w[1] >= w[0]);
eprintln!(
"pivot_threshold = {:>5.0e}: monotone(+) = {}\n +: {:?}\n -: {:?}\n 0: {:?}",
u, monotone, positives, negatives, zeros
);
}
}
fn make_supernodes(specs: &[(usize, usize, usize)]) -> Vec<Supernode> {
specs
.iter()
.map(|&(ncol, nrow, n_children)| Supernode {
first_col: 0,
ncol,
nrow,
row_indices: Vec::new(),
children: vec![0; n_children],
delayed_capacity: usize::MAX,
})
.collect()
}
fn stub_symbolic(n: usize, snodes: Vec<Supernode>) -> SymbolicFactorization {
SymbolicFactorization {
n,
perm: Vec::new(),
perm_inv: Vec::new(),
supernodes: snodes,
factor_nnz_estimate: 0,
factor_slack: 1.2,
contrib_sizes: Vec::new(),
peak_contrib_bytes: 0,
etree: crate::ordering::elimination_tree::EliminationTree {
parent: Vec::new(),
n: 0,
},
permuted_pattern: crate::sparse::csc::CscPattern {
n: 0,
col_ptr: vec![0],
row_idx: Vec::new(),
},
col_counts: Vec::new(),
small_leaf_groups: Vec::new(),
snode_group: Vec::new(),
cached_mc64: None,
resolved_method: crate::symbolic::OrderingMethod::Amd,
resolved_amalgamation: crate::symbolic::AmalgamationStrategy::Adjacency,
resolved_preprocess: crate::symbolic::OrderingPreprocess::None,
is_schur_tail: None,
}
}
#[test]
fn estimate_assembly_flops_empty_tree_is_zero() {
assert_eq!(estimate_assembly_flops(&[]), 0);
}
#[test]
fn estimate_assembly_flops_sums_ncol_times_nrow_squared() {
let snodes = make_supernodes(&[(2, 4, 0), (3, 5, 0)]);
assert_eq!(estimate_assembly_flops(&snodes), 107);
}
#[test]
fn estimate_assembly_flops_saturates_on_pathological_input() {
let snodes = make_supernodes(&[(1, 1usize << 32, 0); 4]);
assert_eq!(estimate_assembly_flops(&snodes), u64::MAX);
}
#[test]
fn should_parallelize_assembly_rejects_tridiagonal_chain() {
let n = 64usize;
let mut rows: Vec<usize> = Vec::new();
let mut cols: Vec<usize> = Vec::new();
let mut vals: Vec<f64> = Vec::new();
for i in 0..n {
rows.push(i);
cols.push(i);
vals.push(2.0);
if i + 1 < n {
rows.push(i + 1);
cols.push(i);
vals.push(-1.0);
}
}
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
assert!(
!should_parallelize_assembly(&sym),
"tridiagonal n=64 has a chain etree (or trivially small \
supernode count); parallel driver must not fire"
);
}
#[test]
fn should_parallelize_assembly_rejects_low_flop_multi_child_tree() {
let mut specs = vec![(2usize, 4usize, 2usize); 64];
for (i, s) in specs.iter_mut().enumerate() {
if i % 2 == 1 {
s.2 = 0;
}
}
let snodes = make_supernodes(&specs);
assert!(snodes.len() >= N_PAR_MIN);
assert!(snodes.iter().any(|s| s.children.len() >= 2));
assert!(estimate_assembly_flops(&snodes) < PAR_MIN_FLOPS);
let sym = stub_symbolic(128, snodes);
assert!(
!should_parallelize_assembly(&sym),
"tree with {} supernodes and {} flops < PAR_MIN_FLOPS = {} \
must dispatch sequentially per issue #19",
sym.supernodes.len(),
estimate_assembly_flops(&sym.supernodes),
PAR_MIN_FLOPS,
);
}
#[test]
fn should_parallelize_assembly_accepts_high_flop_multi_child_tree() {
let mut specs = vec![(64usize, 256usize, 2usize); 64];
for (i, s) in specs.iter_mut().enumerate() {
if i % 2 == 1 {
s.2 = 0;
}
}
let snodes = make_supernodes(&specs);
assert!(estimate_assembly_flops(&snodes) >= PAR_MIN_FLOPS);
let sym = stub_symbolic(64 * 256, snodes);
assert!(
should_parallelize_assembly(&sym),
"tree with structural gates passed and {} flops >= {} \
must dispatch parallel",
estimate_assembly_flops(&sym.supernodes),
PAR_MIN_FLOPS,
);
}
#[test]
fn permute_csc_values_profile_flag_gates_timing_not_values() {
let m = CscMatrix::from_triplets(3, &[0, 1, 2, 1], &[0, 1, 2, 0], &[4.0, 5.0, 6.0, 2.0])
.unwrap();
let perm = [2usize, 1, 0];
let perm_inv = [2usize, 1, 0];
let (off, t_off) = permute_csc_values(&m, &perm, &perm_inv, false).unwrap();
let (on, t_on) = permute_csc_values(&m, &perm, &perm_inv, true).unwrap();
assert_eq!(off.n, on.n);
assert_eq!(off.col_ptr, on.col_ptr);
assert_eq!(off.row_idx, on.row_idx);
assert_eq!(off.values, on.values);
assert_eq!(t_off, 0, "profile=false must report zero timing");
let _ = t_on;
}
#[test]
fn prologue_breakdown_subphases_sum_within_prologue() {
let n = 64usize;
let mut rows: Vec<usize> = Vec::new();
let mut cols: Vec<usize> = Vec::new();
let mut vals: Vec<f64> = Vec::new();
for i in 0..n {
rows.push(i);
cols.push(i);
vals.push(2.0);
if i + 1 < n {
rows.push(i + 1);
cols.push(i);
vals.push(-1.0);
}
}
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
let sym = symbolic_factorize(&m, &SupernodeParams::default()).unwrap();
let prof = Arc::new(Mutex::new(Profiler::new()));
let mut params = make_params();
params.profiler = Some(prof.clone());
let (_factors, inertia) = factorize_multifrontal(&m, &sym, ¶ms).unwrap();
assert_eq!(
(inertia.positive, inertia.negative, inertia.zero),
(64, 0, 0),
"SPD tridiagonal must factor with all-positive inertia"
);
let report = match prof.lock() {
Ok(p) => p.report(),
Err(_) => panic!("profiler mutex poisoned"),
};
let bd = &report.prologue_breakdown;
assert!(
bd.permute_from_triplets_us <= bd.permute_us,
"from_triplets ({} us) cannot exceed permute total ({} us)",
bd.permute_from_triplets_us,
bd.permute_us
);
let subphase_sum = bd.row_map_us
+ bd.scaling_us
+ bd.scaling_pivot_order_us
+ bd.permute_us
+ bd.infnorm_tol_us
+ bd.symmetric_pattern_us
+ bd.setup_us;
assert!(
subphase_sum <= report.prologue_us,
"prologue sub-phases sum to {} us, exceeding prologue {} us",
subphase_sum,
report.prologue_us
);
}
#[test]
fn permute_cache_not_built_for_one_shot_caller() {
let n = 4usize;
let rows = vec![0usize, 1, 1, 2, 2, 3, 3];
let cols = vec![0usize, 0, 1, 1, 2, 2, 3];
let vals = vec![4.0f64, -1.0, 4.0, -1.0, 4.0, -1.0, 4.0];
let m = CscMatrix::from_triplets(n, &rows, &cols, &vals).unwrap();
let perm: Vec<usize> = (0..n).collect();
let perm_inv: Vec<usize> = (0..n).collect();
let mut cache: Option<PermuteCache> = None;
let (permuted, _us) =
permute_csc_values_with_cache(&m, &perm, &perm_inv, false, false, &mut cache).unwrap();
assert_eq!(permuted.n, m.n);
assert_eq!(permuted.col_ptr, m.col_ptr);
assert_eq!(permuted.row_idx, m.row_idx);
assert_eq!(permuted.values, m.values);
assert!(
cache.is_none(),
"one-shot caller (hint=false) should not build the value-map cache"
);
let mut warm_cache: Option<PermuteCache> = None;
let (_permuted2, _us2) =
permute_csc_values_with_cache(&m, &perm, &perm_inv, true, true, &mut warm_cache)
.unwrap();
assert!(
warm_cache.is_some(),
"warm-reuse caller (hint=true) should build the value-map cache on its cold call"
);
}
#[test]
fn permute_cache_rejects_stale_pattern_same_n_nnz() {
let n = 4usize;
let perm: Vec<usize> = (0..n).collect();
let perm_inv: Vec<usize> = (0..n).collect();
let a = CscMatrix::from_triplets(
n,
&[0, 1, 1, 2, 2, 3, 3],
&[0, 0, 1, 1, 2, 2, 3],
&[4.0, -1.0, 4.0, -1.0, 4.0, -1.0, 4.0],
)
.unwrap();
let b = CscMatrix::from_triplets(
n,
&[0, 1, 2, 2, 3, 3, 3],
&[0, 1, 0, 2, 1, 3, 0],
&[4.0, 4.0, -1.0, 4.0, -1.0, 4.0, -2.0],
)
.unwrap();
assert_eq!(a.nnz(), b.nnz(), "patterns must share nnz for the probe");
let mut cache: Option<PermuteCache> = None;
permute_csc_values_with_cache(&a, &perm, &perm_inv, false, true, &mut cache).unwrap();
assert!(cache.is_some());
permute_csc_values_with_cache(&b, &perm, &perm_inv, false, false, &mut cache).unwrap();
let (warm_b, _) =
permute_csc_values_with_cache(&b, &perm, &perm_inv, false, true, &mut cache).unwrap();
let (canon_b, _) = permute_csc_values(&b, &perm, &perm_inv, false).unwrap();
assert_eq!(warm_b.col_ptr, canon_b.col_ptr, "stale-pattern col_ptr");
assert_eq!(warm_b.row_idx, canon_b.row_idx, "stale-pattern row_idx");
assert_eq!(warm_b.values, canon_b.values, "stale-pattern values");
}
#[test]
fn permute_cache_rejects_stale_permutation() {
let n = 4usize;
let m = CscMatrix::from_triplets(
n,
&[0, 1, 2, 3, 1, 2, 3],
&[0, 0, 0, 0, 1, 2, 3],
&[10.0, 2.0, 3.0, 4.0, 20.0, 30.0, 40.0],
)
.unwrap();
let perm1: Vec<usize> = (0..n).collect();
let perm1_inv: Vec<usize> = (0..n).collect();
let mut cache: Option<PermuteCache> = None;
permute_csc_values_with_cache(&m, &perm1, &perm1_inv, false, true, &mut cache).unwrap();
assert!(cache.is_some());
let perm2: Vec<usize> = vec![3, 2, 1, 0];
let mut perm2_inv = vec![0usize; n];
for (i, &p) in perm2.iter().enumerate() {
perm2_inv[p] = i;
}
let (warm, _) =
permute_csc_values_with_cache(&m, &perm2, &perm2_inv, false, true, &mut cache).unwrap();
let (canon, _) = permute_csc_values(&m, &perm2, &perm2_inv, false).unwrap();
assert_eq!(warm.col_ptr, canon.col_ptr, "stale-perm col_ptr");
assert_eq!(warm.row_idx, canon.row_idx, "stale-perm row_idx");
assert_eq!(warm.values, canon.values, "stale-perm values");
}
}