use faer::sparse::SparseColMat;
use faer::sparse::linalg::cholesky::SymbolicCholeskyRaw;
use faer::{Mat, MatMut, Par};
use super::diagonal::{MixedDiagonal, PivotEntry};
use super::factor::{
AptpFactorResult, AptpOptions, aptp_factor_in_place, compute_contribution_gemm,
compute_contribution_gemm_rect, extract_contribution_rect, extract_front_factors_rect,
tpp_factor_rect,
};
use super::pivot::{Block2x2, PivotType};
use super::symbolic::AptpSymbolic;
use crate::error::SparseError;
#[cfg(feature = "diagnostic")]
use crate::profiling::{FinishedSession, ProfileSession};
/// Sentinel stored in `global_to_local` maps for global indices that are not
/// present in the frontal matrix currently being assembled.
const NOT_IN_FRONT: usize = usize::MAX;
/// Scratch buffers reused across supernodes during numeric factorization,
/// avoiding a fresh allocation per front.
pub(crate) struct FactorizationWorkspace {
    // Dense frontal matrix storage; capacity is the largest front handled so far.
    frontal_data: Mat<f64>,
    // Global index of each local frontal row, in local order.
    frontal_row_indices: Vec<usize>,
    // Global indices of columns delayed from child contribution blocks.
    delayed_cols_buf: Vec<usize>,
    // Maps a global index to its local row in the current front,
    // or `NOT_IN_FRONT` when absent. Reset after each supernode.
    global_to_local: Vec<usize>,
    // Receives the Schur-complement (contribution) block for the parent.
    contrib_buffer: Mat<f64>,
    // Scratch matrix handed to the contribution GEMM kernels.
    ld_workspace: Mat<f64>,
}
impl Default for FactorizationWorkspace {
fn default() -> Self {
Self::empty()
}
}
impl FactorizationWorkspace {
    /// Allocates a workspace able to hold a `max_front` x `max_front` frontal
    /// matrix and a global-to-local map of length `n`.
    ///
    /// With `max_front == 0` all matrix/vector buffers are empty, but the
    /// global-to-local map is still fully sized to `n`.
    pub(crate) fn new(max_front: usize, n: usize) -> Self {
        if max_front == 0 {
            return Self {
                frontal_data: Mat::zeros(0, 0),
                frontal_row_indices: Vec::new(),
                delayed_cols_buf: Vec::new(),
                global_to_local: vec![NOT_IN_FRONT; n],
                contrib_buffer: Mat::zeros(0, 0),
                ld_workspace: Mat::new(),
            };
        }
        Self {
            frontal_data: Mat::zeros(max_front, max_front),
            frontal_row_indices: Vec::with_capacity(max_front),
            delayed_cols_buf: Vec::with_capacity(max_front),
            global_to_local: vec![NOT_IN_FRONT; n],
            contrib_buffer: Mat::new(),
            ld_workspace: Mat::new(),
        }
    }
    /// Zero-capacity workspace; everything (including the global-to-local
    /// map) grows lazily when first needed.
    const fn empty() -> Self {
        Self {
            frontal_data: Mat::new(),
            frontal_row_indices: Vec::new(),
            delayed_cols_buf: Vec::new(),
            global_to_local: Vec::new(),
            contrib_buffer: Mat::new(),
            ld_workspace: Mat::new(),
        }
    }
    /// Zeroes the lower triangle (rows `j..m` of each column `j`) of the
    /// leading `m` x `m` part of `frontal_data`.
    ///
    /// # Panics
    /// Panics if `m` exceeds the allocated frontal capacity.
    fn zero_frontal(&mut self, m: usize) {
        assert!(
            m <= self.frontal_data.nrows(),
            "front size {} exceeds workspace capacity {}",
            m,
            self.frontal_data.nrows()
        );
        for j in 0..m {
            // Only the lower triangle is cleared; the upper triangle is
            // never read by the assembly/kernel code in this file.
            self.frontal_data.col_as_slice_mut(j)[j..m].fill(0.0);
        }
    }
    /// Grows the global-to-local map to at least length `n`, initializing any
    /// new entries to `NOT_IN_FRONT`.
    fn ensure_g2l(&mut self, n: usize) {
        if self.global_to_local.len() < n {
            self.global_to_local.resize(n, NOT_IN_FRONT);
        }
    }
}
/// Fronts smaller than this are factored with `Par::Seq` even when the caller
/// requested parallelism (per-front parallel overhead would dominate).
pub(crate) const INTRA_NODE_THRESHOLD: usize = 256;
/// Largest front dimension over all supernodes, where a supernode's front
/// dimension is the total length of its owned column ranges plus the length
/// of its row pattern. Returns 0 for an empty slice.
fn estimate_max_front_size(supernodes: &[SupernodeInfo]) -> usize {
    let mut max_front = 0usize;
    for sn in supernodes {
        let owned: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let front = owned + sn.pattern.len();
        if front > max_front {
            max_front = front;
        }
    }
    max_front
}
/// Precomputed scatter maps used on the no-delayed-pivots fast path of
/// frontal assembly.
pub(crate) struct AssemblyMaps {
    // Flattened 4-tuples per scattered original entry:
    // (source value index, destination linear index `col * m + row`,
    //  global row index, permuted column index) — the last two are used
    // only to look up symmetric scaling factors.
    pub amap_entries: Vec<u32>,
    // Per-supernode ranges into `amap_entries`, counted in 4-tuples.
    pub amap_offsets: Vec<usize>,
    // For each child pattern row, its local row index in the parent front.
    pub ea_map: Vec<u32>,
    // Per-child ranges into `ea_map`.
    pub ea_offsets: Vec<usize>,
    // For each supernode, the index of its first child range in `ea_offsets`.
    pub ea_snode_child_begin: Vec<usize>,
}
/// Builds the scatter maps consumed by the fast assembly path.
///
/// Pass 1 (`amap_*`): for every supernode, records a 4-tuple for each
/// original-matrix entry that lands in that supernode's front. Destination
/// indices are linearized as `col * m + row` for the front size `m` computed
/// here, so these maps are only valid when the front contains no delayed
/// columns (matching the `ndelay_in == 0` checks at the call sites).
///
/// Pass 2 (`ea_*`): for every child of every supernode, records the
/// parent-local row of each child pattern row, for mapped extend-add.
fn build_assembly_maps(
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    n: usize,
) -> AssemblyMaps {
    let n_supernodes = supernodes.len();
    let symbolic = matrix.symbolic();
    let col_ptrs = symbolic.col_ptr();
    let row_indices_csc = symbolic.row_idx();
    let mut amap_entries = Vec::new();
    let mut amap_offsets = vec![0usize; n_supernodes + 1];
    // Scratch global-to-local map, reset after each supernode.
    let mut g2l = vec![NOT_IN_FRONT; n];
    for (s, sn) in supernodes.iter().enumerate() {
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        // Front row order: owned columns first, then the row pattern.
        let mut frontal_rows: Vec<usize> = Vec::with_capacity(sn_ncols + sn.pattern.len());
        for range in &sn.owned_ranges {
            frontal_rows.extend(range.clone());
        }
        frontal_rows.extend_from_slice(&sn.pattern);
        let m = frontal_rows.len();
        for (i, &global) in frontal_rows.iter().enumerate() {
            g2l[global] = i;
        }
        let total_owned = sn_ncols;
        for range in &sn.owned_ranges {
            for pj in range.clone() {
                let orig_col = perm_fwd[pj];
                let local_col = g2l[pj];
                let start = col_ptrs[orig_col];
                let end = col_ptrs[orig_col + 1];
                // `.take(end).skip(start)` walks exactly the CSC entries of
                // `orig_col` (indices `start..end`) while keeping `idx` absolute.
                for (idx, &orig_row) in row_indices_csc.iter().enumerate().take(end).skip(start) {
                    if orig_row < orig_col {
                        let perm_row = perm_inv[orig_row];
                        let local_peer = g2l[perm_row];
                        // Strict-upper entry whose permuted row is another of
                        // this supernode's owned columns: its symmetric mirror
                        // is scattered when that column is processed, so skip
                        // the duplicate here.
                        if local_peer != NOT_IN_FRONT && local_peer < total_owned {
                            continue;
                        }
                    }
                    let global_row = perm_inv[orig_row];
                    let local_row = g2l[global_row];
                    if local_row == NOT_IN_FRONT {
                        continue;
                    }
                    // Mirror into the lower triangle of the front.
                    let (dest_row, dest_col) = if local_row >= local_col {
                        (local_row, local_col)
                    } else {
                        (local_col, local_row)
                    };
                    let dest_linear = dest_col * m + dest_row;
                    amap_entries.push(idx as u32);
                    amap_entries.push(dest_linear as u32);
                    // Global row / permuted column kept for scaling lookups.
                    amap_entries.push(global_row as u32);
                    amap_entries.push(pj as u32);
                }
            }
        }
        // Reset the scratch map for the next supernode.
        for &global in &frontal_rows {
            g2l[global] = NOT_IN_FRONT;
        }
        // Offsets are cumulative counts of 4-tuples, not of raw u32s.
        let entry_end = amap_entries.len() / 4;
        amap_offsets[s + 1] = entry_end;
    }
    let mut ea_map = Vec::new();
    let mut ea_offsets = vec![0usize];
    let mut ea_snode_child_begin = vec![0usize; n_supernodes + 1];
    for (s, sn) in supernodes.iter().enumerate() {
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let mut parent_rows: Vec<usize> = Vec::with_capacity(sn_ncols + sn.pattern.len());
        for range in &sn.owned_ranges {
            parent_rows.extend(range.clone());
        }
        parent_rows.extend_from_slice(&sn.pattern);
        for (i, &global) in parent_rows.iter().enumerate() {
            g2l[global] = i;
        }
        for &c in &children[s] {
            let child_sn = &supernodes[c];
            // Every child pattern row must appear somewhere in the parent
            // front; the symbolic analysis guarantees this.
            for &child_row in &child_sn.pattern {
                let parent_local = g2l[child_row];
                debug_assert!(
                    parent_local != NOT_IN_FRONT,
                    "child pattern row {} not in parent supernode {}",
                    child_row,
                    s
                );
                ea_map.push(parent_local as u32);
            }
            ea_offsets.push(ea_map.len());
        }
        for &global in &parent_rows {
            g2l[global] = NOT_IN_FRONT;
        }
        ea_snode_child_begin[s + 1] = ea_offsets.len() - 1;
    }
    AssemblyMaps {
        amap_entries,
        amap_offsets,
        ea_map,
        ea_offsets,
        ea_snode_child_begin,
    }
}
/// Static description of one supernode in the assembly tree.
#[derive(Clone)]
pub(crate) struct SupernodeInfo {
    // First column of the supernode's column block.
    pub col_begin: usize,
    // One past the last column of the column block.
    pub col_end: usize,
    // Global indices of the front rows below the column block.
    pub pattern: Vec<usize>,
    // Parent in the assembly tree; `None` for root supernodes.
    pub parent: Option<usize>,
    // Column ranges owned by this supernode (can be several after amalgamation).
    pub owned_ranges: Vec<std::ops::Range<usize>>,
    // True when this node is handled by the small-leaf-subtree path.
    pub in_small_leaf: bool,
}
/// A subtree of small supernodes factored together sequentially with shared
/// scratch buffers (see `factor_small_leaf_subtree`).
pub(crate) struct SmallLeafSubtree {
    // Root supernode of the subtree.
    #[allow(dead_code)]
    pub root: usize,
    // All supernode ids in the subtree, in factorization order.
    pub nodes: Vec<usize>,
    // Largest front size within the subtree; sizes the shared buffers.
    pub max_front_size: usize,
    // Parent supernode of the subtree's root, `None` if the root is a tree root.
    pub parent_of_root: Option<usize>,
}
/// Borrowed view of a frontal matrix during assembly.
pub(crate) struct FrontalMatrix<'a> {
    // Dense front storage (only the lower triangle is populated).
    pub data: MatMut<'a, f64>,
    // Global index of each local front row.
    #[allow(dead_code)]
    pub row_indices: &'a [usize],
    // Number of leading fully-summed columns (owned + delayed).
    pub num_fully_summed: usize,
}
/// Schur-complement block passed from a child front to its parent.
pub(crate) struct ContributionBlock {
    // Dense contribution values (lower triangle used by extend-add).
    pub data: Mat<f64>,
    // Global index of each contribution row; the first `num_delayed`
    // entries are delayed pivot columns.
    pub row_indices: Vec<usize>,
    // Number of delayed pivots carried to the parent.
    pub num_delayed: usize,
}
/// Factor blocks extracted from one supernode's front after elimination.
#[derive(Debug)]
pub struct FrontFactors {
    // Factor block over the eliminated columns.
    pub(crate) l11: Mat<f64>,
    // Diagonal factor holding mixed 1x1 / 2x2 pivots.
    pub(crate) d11: MixedDiagonal,
    // Off-diagonal factor rows below the eliminated block.
    pub(crate) l21: Mat<f64>,
    // Pivot permutation applied locally within this front.
    pub(crate) local_perm: Vec<usize>,
    // Number of columns actually eliminated in this front.
    pub(crate) num_eliminated: usize,
    // Global column indices covered by this front.
    pub(crate) col_indices: Vec<usize>,
    // Global row indices covered by this front.
    pub(crate) row_indices: Vec<usize>,
}
impl FrontFactors {
    /// Factor block over the eliminated columns.
    pub fn l11(&self) -> &Mat<f64> {
        &self.l11
    }
    /// Diagonal factor with mixed 1x1 / 2x2 pivots.
    pub fn d11(&self) -> &MixedDiagonal {
        &self.d11
    }
    /// Off-diagonal factor rows below the eliminated block.
    pub fn l21(&self) -> &Mat<f64> {
        &self.l21
    }
    /// Local pivot permutation applied within this front.
    pub fn local_perm(&self) -> &[usize] {
        &self.local_perm
    }
    /// Number of columns eliminated in this front.
    pub fn num_eliminated(&self) -> usize {
        self.num_eliminated
    }
    /// Global column indices covered by this front.
    pub fn col_indices(&self) -> &[usize] {
        &self.col_indices
    }
    /// Global row indices covered by this front.
    pub fn row_indices(&self) -> &[usize] {
        &self.row_indices
    }
}
/// Pivot counts and (on diagnostic builds) phase timings for one supernode.
#[derive(Debug, Clone)]
pub struct PerSupernodeStats {
    pub snode_id: usize,
    // Front dimension `m`.
    pub front_size: usize,
    // Fully-summed columns `k` (owned + delayed-in).
    pub num_fully_summed: usize,
    // Columns actually eliminated (`<= num_fully_summed`).
    pub num_eliminated: usize,
    // Pivots delayed to the parent (`num_fully_summed - num_eliminated`).
    pub num_delayed: usize,
    pub num_1x1: usize,
    pub num_2x2: usize,
    // Largest factor entry reported by the kernel for this front.
    pub max_l_entry: f64,
    #[cfg(feature = "diagnostic")]
    pub assembly_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub kernel_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub extraction_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub zero_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub g2l_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub scatter_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub extend_add_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub extract_factors_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub extract_contrib_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub contrib_gemm_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub g2l_reset_time: std::time::Duration,
}
/// Aggregate statistics for a whole numeric factorization
/// (sums/maxima over the per-supernode stats).
#[derive(Debug, Clone)]
pub struct FactorizationStats {
    pub total_1x1_pivots: usize,
    pub total_2x2_pivots: usize,
    pub total_delayed: usize,
    // Exactly-zero 1x1 pivots, plus unresolved fully-summed columns at roots.
    pub zero_pivots: usize,
    pub max_front_size: usize,
    pub supernodes_before_amalgamation: usize,
    pub supernodes_after_amalgamation: usize,
    pub merges_performed: usize,
    #[cfg(feature = "diagnostic")]
    pub total_assembly_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_kernel_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_extraction_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_zero_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_g2l_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_scatter_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_extend_add_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_extract_factors_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_extract_contrib_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_contrib_gemm_time: std::time::Duration,
    #[cfg(feature = "diagnostic")]
    pub total_g2l_reset_time: std::time::Duration,
    // Small-leaf-subtree path usage counters.
    pub small_leaf_subtrees: usize,
    pub small_leaf_nodes: usize,
}
/// Snapshot of an assembled (not yet factored) frontal matrix, produced by
/// `AptpNumeric::export_assembled_frontal` for diagnostics.
#[cfg(feature = "diagnostic")]
pub struct ExportedFrontal {
    // Dense assembled front (lower triangle populated).
    pub data: Mat<f64>,
    pub front_size: usize,
    pub num_fully_summed: usize,
    // Global index of each front row.
    pub row_indices: Vec<usize>,
}
/// Result of a numeric factorization: the per-front factors plus statistics.
pub struct AptpNumeric {
    // One entry per supernode, indexed by supernode id.
    front_factors: Vec<FrontFactors>,
    stats: FactorizationStats,
    per_supernode_stats: Vec<PerSupernodeStats>,
    // Dimension of the factored matrix.
    n: usize,
    #[cfg(feature = "diagnostic")]
    profile_session: Option<FinishedSession>,
}
impl std::fmt::Debug for AptpNumeric {
    /// Compact debug form: dimension, aggregate stats, and the count of
    /// stored front factors (the factor data itself is omitted).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("AptpNumeric");
        builder.field("n", &self.n);
        builder.field("stats", &self.stats);
        builder.field("front_factors_count", &self.front_factors.len());
        builder.finish()
    }
}
impl AptpNumeric {
    /// Extracted factor blocks, one per supernode.
    pub fn front_factors(&self) -> &[FrontFactors] {
        &self.front_factors
    }
    /// Aggregate factorization statistics.
    pub fn stats(&self) -> &FactorizationStats {
        &self.stats
    }
    /// Per-supernode statistics, indexed by supernode id.
    pub fn per_supernode_stats(&self) -> &[PerSupernodeStats] {
        &self.per_supernode_stats
    }
    /// Dimension of the factored matrix.
    pub fn n(&self) -> usize {
        self.n
    }
    /// Profiling session recorded during `factor` (diagnostic builds only).
    #[cfg(feature = "diagnostic")]
    pub fn profile_session(&self) -> Option<&FinishedSession> {
        self.profile_session.as_ref()
    }
    /// Numeric multifrontal factorization of `matrix`, driven by the
    /// precomputed symbolic analysis.
    ///
    /// Pipeline: validate dimensions; build supernodes and amalgamate them
    /// (controlled by `options.nemin`); classify small leaf subtrees; build
    /// assembly scatter maps; factor the tree level by level via
    /// `factor_tree_levelset`; finally gather per-supernode results and
    /// aggregate statistics (including zero-pivot accounting).
    ///
    /// `scaling`, when provided, is applied symmetrically while assembling
    /// original matrix entries.
    ///
    /// # Errors
    /// `SparseError::DimensionMismatch` when `matrix` is not square of the
    /// symbolic dimension; kernel errors are propagated.
    pub fn factor(
        symbolic: &AptpSymbolic,
        matrix: &SparseColMat<usize, f64>,
        options: &AptpOptions,
        scaling: Option<&[f64]>,
    ) -> Result<Self, SparseError> {
        let nemin = options.nemin;
        let n = symbolic.nrows();
        if matrix.nrows() != n || matrix.ncols() != n {
            return Err(SparseError::DimensionMismatch {
                expected: (n, n),
                got: (matrix.nrows(), matrix.ncols()),
                context: "Matrix dimensions must match symbolic analysis".to_string(),
            });
        }
        let supernodes = build_supernode_info(symbolic);
        let n_supernodes_before = supernodes.len();
        // Merge small supernodes to reduce tree overhead.
        let mut supernodes = super::amalgamation::amalgamate(supernodes, nemin);
        let n_supernodes = supernodes.len();
        let merges_performed = n_supernodes_before - n_supernodes;
        let children = build_children_map(&supernodes);
        let small_leaf_subtrees =
            classify_small_leaf_subtrees(&mut supernodes, &children, options.small_leaf_threshold);
        let (perm_fwd, perm_inv) = symbolic.perm_vecs();
        let assembly_maps =
            build_assembly_maps(&supernodes, &children, matrix, &perm_fwd, &perm_inv, n);
        #[cfg(feature = "diagnostic")]
        let session = ProfileSession::new();
        #[cfg(feature = "diagnostic")]
        let _factor_loop_guard = session.enter_section("factor_loop");
        let all_node_results = factor_tree_levelset(
            &supernodes,
            &children,
            matrix,
            &perm_fwd,
            &perm_inv,
            options,
            scaling,
            n,
            &assembly_maps,
            &small_leaf_subtrees,
        )?;
        #[cfg(feature = "diagnostic")]
        drop(_factor_loop_guard);
        // Results arrive tagged with their supernode id; scatter them back
        // into supernode order.
        let mut front_factors_vec: Vec<Option<FrontFactors>> =
            (0..n_supernodes).map(|_| None).collect();
        let mut per_sn_stats_vec: Vec<Option<PerSupernodeStats>> =
            (0..n_supernodes).map(|_| None).collect();
        for (idx, ff, stats) in all_node_results {
            front_factors_vec[idx] = Some(ff);
            per_sn_stats_vec[idx] = Some(stats);
        }
        // Every supernode must have been factored exactly once; a gap here
        // is an internal invariant violation, hence the panic.
        let front_factors: Vec<FrontFactors> = front_factors_vec
            .into_iter()
            .enumerate()
            .map(|(i, opt)| opt.unwrap_or_else(|| panic!("supernode {} not factored", i)))
            .collect();
        let per_sn_stats: Vec<PerSupernodeStats> = per_sn_stats_vec
            .into_iter()
            .enumerate()
            .map(|(i, opt)| opt.unwrap_or_else(|| panic!("supernode {} missing stats", i)))
            .collect();
        let mut stats = FactorizationStats {
            total_1x1_pivots: 0,
            total_2x2_pivots: 0,
            total_delayed: 0,
            zero_pivots: 0,
            max_front_size: 0,
            supernodes_before_amalgamation: n_supernodes_before,
            supernodes_after_amalgamation: n_supernodes,
            merges_performed,
            #[cfg(feature = "diagnostic")]
            total_assembly_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_kernel_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extraction_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_zero_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_g2l_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_scatter_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extend_add_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extract_factors_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extract_contrib_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_contrib_gemm_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_g2l_reset_time: std::time::Duration::ZERO,
            small_leaf_subtrees: small_leaf_subtrees.len(),
            small_leaf_nodes: small_leaf_subtrees.iter().map(|s| s.nodes.len()).sum(),
        };
        // Accumulate per-supernode counters (and timings on diagnostic builds).
        for sn_stat in &per_sn_stats {
            stats.total_1x1_pivots += sn_stat.num_1x1;
            stats.total_2x2_pivots += sn_stat.num_2x2;
            stats.total_delayed += sn_stat.num_delayed;
            if sn_stat.front_size > stats.max_front_size {
                stats.max_front_size = sn_stat.front_size;
            }
            #[cfg(feature = "diagnostic")]
            {
                stats.total_assembly_time += sn_stat.assembly_time;
                stats.total_kernel_time += sn_stat.kernel_time;
                stats.total_extraction_time += sn_stat.extraction_time;
                stats.total_zero_time += sn_stat.zero_time;
                stats.total_g2l_time += sn_stat.g2l_time;
                stats.total_scatter_time += sn_stat.scatter_time;
                stats.total_extend_add_time += sn_stat.extend_add_time;
                stats.total_extract_factors_time += sn_stat.extract_factors_time;
                stats.total_extract_contrib_time += sn_stat.extract_contrib_time;
                stats.total_contrib_gemm_time += sn_stat.contrib_gemm_time;
                stats.total_g2l_reset_time += sn_stat.g2l_reset_time;
            }
        }
        // Count 1x1 pivots that are exactly zero (intentional exact f64
        // comparison: only a true hard zero counts).
        for ff in &front_factors {
            for (_, entry) in ff.d11().iter_pivots() {
                if let PivotEntry::OneByOne(val) = entry {
                    if val == 0.0 {
                        stats.zero_pivots += 1;
                    }
                }
            }
        }
        // At a root there is no parent to delay pivots to; any fully-summed
        // columns left uneliminated are counted as zero pivots.
        for (s, sn) in supernodes.iter().enumerate() {
            if sn.parent.is_none() {
                let sn_stat = &per_sn_stats[s];
                let m = sn_stat.front_size;
                let ne = sn_stat.num_eliminated;
                if ne < m {
                    let k = sn_stat.num_fully_summed;
                    let n_unresolved = k - ne;
                    stats.zero_pivots += n_unresolved;
                }
            }
        }
        #[cfg(feature = "diagnostic")]
        let finished_session = session.finish();
        Ok(AptpNumeric {
            front_factors,
            stats,
            per_supernode_stats: per_sn_stats,
            n,
            #[cfg(feature = "diagnostic")]
            profile_session: Some(finished_session),
        })
    }
    /// Diagnostic helper: replays the factorization up to `target_snode`
    /// (or, when `None`, the supernode with the largest approximate front)
    /// and returns that front as assembled, *before* its kernel runs.
    ///
    /// NOTE(review): unlike `factor`, this path does not amalgamate and
    /// iterates supernodes `0..=target`, which assumes supernode ids form a
    /// postorder of the assembly tree — confirm against `build_supernode_info`.
    #[cfg(feature = "diagnostic")]
    pub fn export_assembled_frontal(
        symbolic: &AptpSymbolic,
        matrix: &SparseColMat<usize, f64>,
        options: &AptpOptions,
        scaling: Option<&[f64]>,
        target_snode: Option<usize>,
    ) -> Result<ExportedFrontal, SparseError> {
        let n = symbolic.nrows();
        if matrix.nrows() != n || matrix.ncols() != n {
            return Err(SparseError::DimensionMismatch {
                expected: (n, n),
                got: (matrix.nrows(), matrix.ncols()),
                context: "Matrix dimensions must match symbolic analysis".to_string(),
            });
        }
        let supernodes = build_supernode_info(symbolic);
        let n_supernodes = supernodes.len();
        let children = build_children_map(&supernodes);
        let (perm_fwd, perm_inv) = symbolic.perm_vecs();
        let target = match target_snode {
            Some(t) => {
                if t >= n_supernodes {
                    return Err(SparseError::AnalysisFailure {
                        reason: format!(
                            "Target supernode {} out of range (n_supernodes={})",
                            t, n_supernodes
                        ),
                    });
                }
                t
            }
            None => {
                // Pick the supernode with the largest approximate front size.
                let mut best = 0;
                let mut best_m = 0;
                for (s, sn) in supernodes.iter().enumerate() {
                    let sn_ncols = sn.col_end - sn.col_begin;
                    let m_approx = sn_ncols + sn.pattern.len();
                    if m_approx > best_m {
                        best_m = m_approx;
                        best = s;
                    }
                }
                best
            }
        };
        let mut global_to_local = vec![NOT_IN_FRONT; n];
        let mut contributions: Vec<Option<ContributionBlock>> =
            (0..n_supernodes).map(|_| None).collect();
        for s in 0..=target {
            let sn = &supernodes[s];
            // Delayed columns carried in from child contribution blocks.
            let mut delayed_cols: Vec<usize> = Vec::new();
            for &c in &children[s] {
                if let Some(ref cb) = contributions[c] {
                    delayed_cols.extend_from_slice(&cb.row_indices[..cb.num_delayed]);
                }
            }
            let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
            let k = sn_ncols + delayed_cols.len();
            // Front rows: owned columns, then delayed columns, then pattern.
            let mut frontal_rows: Vec<usize> = Vec::with_capacity(k + sn.pattern.len());
            for range in &sn.owned_ranges {
                frontal_rows.extend(range.clone());
            }
            frontal_rows.extend_from_slice(&delayed_cols);
            frontal_rows.extend_from_slice(&sn.pattern);
            let m = frontal_rows.len();
            for (i, &global) in frontal_rows.iter().enumerate() {
                global_to_local[global] = i;
            }
            let mut frontal_data = Mat::zeros(m, m);
            {
                let mut frontal = FrontalMatrix {
                    data: frontal_data.as_mut(),
                    row_indices: &frontal_rows,
                    num_fully_summed: k,
                };
                scatter_original_entries_multi(
                    &mut frontal,
                    matrix,
                    &perm_fwd,
                    &perm_inv,
                    &global_to_local,
                    &sn.owned_ranges,
                    scaling,
                );
                for &c in &children[s] {
                    if let Some(cb) = contributions[c].take() {
                        let _ = extend_add(&mut frontal, cb, &global_to_local);
                    }
                }
            }
            if s == target {
                // Found the requested front: return it pre-factorization.
                for &global in &frontal_rows {
                    global_to_local[global] = NOT_IN_FRONT;
                }
                return Ok(ExportedFrontal {
                    data: frontal_data,
                    front_size: m,
                    num_fully_summed: k,
                    row_indices: frontal_rows,
                });
            }
            // Not the target: factor this front so its contribution block can
            // feed its parent on a later iteration.
            let result = aptp_factor_in_place(frontal_data.as_mut(), k, options)?;
            let ne = result.num_eliminated;
            if sn.parent.is_some() && ne < m {
                let nfs = m - k;
                let mut cb = Mat::zeros(m, m);
                if nfs > 0 {
                    let mut ld_ws = Mat::zeros(m, ne.max(1));
                    compute_contribution_gemm(
                        &frontal_data,
                        k,
                        ne,
                        m,
                        &result.d,
                        &mut cb,
                        &mut ld_ws,
                        Par::Seq,
                    );
                }
                contributions[s] = Some(extract_contribution(
                    &frontal_data,
                    m,
                    k,
                    &frontal_rows,
                    &result,
                    cb,
                ));
            }
            for &global in &frontal_rows {
                global_to_local[global] = NOT_IN_FRONT;
            }
        }
        unreachable!("Target supernode {} not reached in postorder loop", target)
    }
}
/// Output of factoring one supernode.
struct SupernodeResult {
    // Extracted factor blocks for this front.
    ff: FrontFactors,
    // Schur-complement block for the parent, if this node has one and
    // not everything was eliminated.
    contribution: Option<ContributionBlock>,
    // Pivot counts and (diagnostic) timings for this supernode.
    stats: PerSupernodeStats,
}
/// Assembles and factors a single supernode's front using the shared
/// `workspace` buffers.
///
/// Phases: gather delayed columns from `child_contributions`; build the
/// front row list (owned columns, delayed columns, pattern rows); zero the
/// front; scatter original entries — via the precomputed `assembly_maps`
/// fast path when there are no delayed columns, otherwise via the general
/// scatter; extend-add child contributions; run the pivoted kernel; then
/// extract the factor blocks and (when a parent exists and not everything
/// was eliminated) the contribution block.
#[allow(clippy::too_many_arguments)]
fn factor_single_supernode(
    s: usize,
    sn: &SupernodeInfo,
    child_contributions: Vec<Option<ContributionBlock>>,
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    workspace: &mut FactorizationWorkspace,
    assembly_maps: &AssemblyMaps,
) -> Result<SupernodeResult, SparseError> {
    // Collect pivots delayed by the children; they become extra
    // fully-summed columns of this front.
    workspace.delayed_cols_buf.clear();
    for cb in child_contributions.iter().flatten() {
        workspace
            .delayed_cols_buf
            .extend_from_slice(&cb.row_indices[..cb.num_delayed]);
    }
    let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
    let k = sn_ncols + workspace.delayed_cols_buf.len();
    // Front row order: owned columns, then delayed columns, then pattern.
    workspace.frontal_row_indices.clear();
    workspace.frontal_row_indices.reserve(k + sn.pattern.len());
    for range in &sn.owned_ranges {
        workspace.frontal_row_indices.extend(range.clone());
    }
    workspace
        .frontal_row_indices
        .extend_from_slice(&workspace.delayed_cols_buf);
    workspace.frontal_row_indices.extend_from_slice(&sn.pattern);
    let m = workspace.frontal_row_indices.len();
    // Grow the workspace if delayed pivots made this front larger than
    // the preallocated capacity.
    if m > workspace.frontal_data.nrows() {
        workspace.frontal_data = Mat::zeros(m, m);
    }
    #[cfg(feature = "diagnostic")]
    let zero_start = std::time::Instant::now();
    workspace.zero_frontal(m);
    #[cfg(feature = "diagnostic")]
    let zero_time = zero_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let g2l_start = std::time::Instant::now();
    for (i, &global) in workspace.frontal_row_indices.iter().enumerate() {
        workspace.global_to_local[global] = i;
    }
    #[cfg(feature = "diagnostic")]
    let g2l_time = g2l_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let assembly_start = std::time::Instant::now();
    let ndelay_in = workspace.delayed_cols_buf.len();
    let mut frontal = FrontalMatrix {
        data: workspace.frontal_data.as_mut().submatrix_mut(0, 0, m, m),
        row_indices: &workspace.frontal_row_indices,
        num_fully_summed: k,
    };
    #[cfg(feature = "diagnostic")]
    let scatter_start = std::time::Instant::now();
    if ndelay_in == 0 {
        // Fast path: the precomputed assembly map is valid only when the
        // front layout matches the one used to build it (no delayed columns).
        let amap_start = assembly_maps.amap_offsets[s];
        let amap_end = assembly_maps.amap_offsets[s + 1];
        let values = matrix.val();
        // Offsets count 4-tuples; scale to raw u32 positions.
        let amap = &assembly_maps.amap_entries[amap_start * 4..amap_end * 4];
        if let Some(sc) = scaling {
            for entry in amap.chunks_exact(4) {
                let src_idx = entry[0] as usize;
                let dest_linear = entry[1] as usize;
                let scale_row = entry[2] as usize;
                let scale_col = entry[3] as usize;
                // Symmetric scaling: D * A * D.
                let val = values[src_idx] * sc[scale_row] * sc[scale_col];
                let dest_col = dest_linear / m;
                let dest_row = dest_linear % m;
                frontal.data[(dest_row, dest_col)] += val;
            }
        } else {
            for entry in amap.chunks_exact(4) {
                let src_idx = entry[0] as usize;
                let dest_linear = entry[1] as usize;
                let dest_col = dest_linear / m;
                let dest_row = dest_linear % m;
                frontal.data[(dest_row, dest_col)] += values[src_idx];
            }
        }
    } else {
        // General path: delayed columns shift the layout, so scatter via
        // the global-to-local map instead.
        scatter_original_entries_multi(
            &mut frontal,
            matrix,
            perm_fwd,
            perm_inv,
            &workspace.global_to_local,
            &sn.owned_ranges,
            scaling,
        );
    }
    #[cfg(feature = "diagnostic")]
    let scatter_time = scatter_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let ea_start_time = std::time::Instant::now();
    let ea_children_begin = assembly_maps.ea_snode_child_begin[s];
    let mut ea_child_idx = ea_children_begin;
    for opt_cb in child_contributions {
        if let Some(cb) = opt_cb {
            let recycled = if cb.num_delayed > 0 || ndelay_in > 0 {
                // Delayed pivots invalidate the precomputed row map.
                extend_add(&mut frontal, cb, &workspace.global_to_local)
            } else {
                let ea_start = assembly_maps.ea_offsets[ea_child_idx];
                let ea_end = assembly_maps.ea_offsets[ea_child_idx + 1];
                let ea_row_map = &assembly_maps.ea_map[ea_start..ea_end];
                extend_add_mapped(&mut frontal, cb, ea_row_map)
            };
            // Keep the largest returned child buffer for reuse as this
            // node's contribution buffer.
            if recycled.nrows() >= workspace.contrib_buffer.nrows() {
                workspace.contrib_buffer = recycled;
            }
        }
        // Advance even for `None` children so the map index stays aligned
        // with this supernode's child list.
        ea_child_idx += 1;
    }
    #[cfg(feature = "diagnostic")]
    let extend_add_time = ea_start_time.elapsed();
    #[cfg(feature = "diagnostic")]
    let assembly_time = assembly_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let kernel_start = std::time::Instant::now();
    // Small fronts are factored sequentially regardless of `options.par`.
    let effective_par = if m < INTRA_NODE_THRESHOLD {
        Par::Seq
    } else {
        options.par
    };
    let per_sn_options = AptpOptions {
        par: effective_par,
        ..options.clone()
    };
    let result = aptp_factor_in_place(
        workspace.frontal_data.as_mut().submatrix_mut(0, 0, m, m),
        k,
        &per_sn_options,
    )?;
    let ne = result.num_eliminated;
    #[cfg(feature = "diagnostic")]
    let kernel_time = kernel_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let contrib_gemm_start = std::time::Instant::now();
    let nfs = m - k;
    // Contribution block is needed only when a parent exists and some rows
    // remain uneliminated; the GEMM runs only if there are pattern rows.
    if sn.parent.is_some() && ne < m && nfs > 0 {
        if workspace.contrib_buffer.nrows() < m || workspace.contrib_buffer.ncols() < m {
            workspace.contrib_buffer = Mat::zeros(m, m);
        }
        compute_contribution_gemm(
            &workspace.frontal_data,
            k,
            ne,
            m,
            &result.d,
            &mut workspace.contrib_buffer,
            &mut workspace.ld_workspace,
            effective_par,
        );
    }
    #[cfg(feature = "diagnostic")]
    let contrib_gemm_time = contrib_gemm_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let extraction_start = std::time::Instant::now();
    #[cfg(feature = "diagnostic")]
    let extract_factors_start = std::time::Instant::now();
    let ff = extract_front_factors(
        &workspace.frontal_data,
        m,
        k,
        &workspace.frontal_row_indices,
        &result,
    );
    #[cfg(feature = "diagnostic")]
    let extract_factors_time = extract_factors_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let extract_contrib_start = std::time::Instant::now();
    let contribution = if sn.parent.is_some() && ne < m {
        // Hand ownership of the contribution buffer to the block; the
        // workspace gets a fresh (empty) one.
        Some(extract_contribution(
            &workspace.frontal_data,
            m,
            k,
            &workspace.frontal_row_indices,
            &result,
            std::mem::replace(&mut workspace.contrib_buffer, Mat::new()),
        ))
    } else {
        None
    };
    #[cfg(feature = "diagnostic")]
    let extract_contrib_time = extract_contrib_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let extraction_time = extraction_start.elapsed();
    let stats = PerSupernodeStats {
        snode_id: s,
        front_size: m,
        num_fully_summed: k,
        num_eliminated: ne,
        num_delayed: k - ne,
        num_1x1: result.stats.num_1x1,
        num_2x2: result.stats.num_2x2,
        max_l_entry: result.stats.max_l_entry,
        #[cfg(feature = "diagnostic")]
        assembly_time,
        #[cfg(feature = "diagnostic")]
        kernel_time,
        #[cfg(feature = "diagnostic")]
        extraction_time,
        #[cfg(feature = "diagnostic")]
        zero_time,
        #[cfg(feature = "diagnostic")]
        g2l_time,
        #[cfg(feature = "diagnostic")]
        scatter_time,
        #[cfg(feature = "diagnostic")]
        extend_add_time,
        #[cfg(feature = "diagnostic")]
        extract_factors_time,
        #[cfg(feature = "diagnostic")]
        extract_contrib_time,
        #[cfg(feature = "diagnostic")]
        contrib_gemm_time,
        // Filled in below once the reset has been timed.
        #[cfg(feature = "diagnostic")]
        g2l_reset_time: std::time::Duration::ZERO,
    };
    #[cfg(feature = "diagnostic")]
    let g2l_reset_start = std::time::Instant::now();
    // Restore the shared global-to-local map for the next supernode.
    for &global in &workspace.frontal_row_indices[..m] {
        workspace.global_to_local[global] = NOT_IN_FRONT;
    }
    #[cfg(feature = "diagnostic")]
    let g2l_reset_time = g2l_reset_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let stats = {
        let mut s = stats;
        s.g2l_reset_time = g2l_reset_time;
        s
    };
    Ok(SupernodeResult {
        ff,
        contribution,
        stats,
    })
}
/// Sequentially factors every node of a small leaf subtree, sharing one set
/// of scratch buffers sized for the subtree's maximum front.
///
/// Unlike the general path, the front is stored *rectangularly*: `l_storage`
/// holds the `m x k` fully-summed columns, while the trailing `nfs x nfs`
/// (pattern-row) block is accumulated separately in `contrib_buffer`. The
/// rectangular kernel (`tpp_factor_rect`) and the `*_rect` extraction
/// helpers operate on this split layout. Contribution blocks flow between
/// subtree nodes through the `contributions` slots.
///
/// Returns `(supernode id, factors, stats)` per node, in subtree order.
#[allow(clippy::too_many_arguments)]
fn factor_small_leaf_subtree(
    subtree: &SmallLeafSubtree,
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    n: usize,
    contributions: &mut [Option<ContributionBlock>],
    assembly_maps: &AssemblyMaps,
) -> Result<Vec<(usize, FrontFactors, PerSupernodeStats)>, SparseError> {
    let max_front = subtree.max_front_size;
    let mut results = Vec::with_capacity(subtree.nodes.len());
    // Shared buffers, reused across every node of the subtree.
    let mut l_storage = Mat::<f64>::zeros(max_front, max_front);
    let mut global_to_local = vec![NOT_IN_FRONT; n];
    let mut contrib_buffer = Mat::<f64>::new();
    let mut ld_workspace = Mat::<f64>::new();
    let mut frontal_row_indices = Vec::<usize>::with_capacity(max_front);
    let mut delayed_cols_buf = Vec::<usize>::new();
    let symbolic_matrix = matrix.symbolic();
    let col_ptrs = symbolic_matrix.col_ptr();
    let row_indices_csc = symbolic_matrix.row_idx();
    let values = matrix.val();
    for &node_id in &subtree.nodes {
        let sn = &supernodes[node_id];
        // Delayed pivots from children become extra fully-summed columns.
        delayed_cols_buf.clear();
        for &c in &children[node_id] {
            if let Some(ref cb) = contributions[c] {
                delayed_cols_buf.extend_from_slice(&cb.row_indices[..cb.num_delayed]);
            }
        }
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let k = sn_ncols + delayed_cols_buf.len();
        // Front row order: owned columns, delayed columns, pattern rows.
        frontal_row_indices.clear();
        frontal_row_indices.reserve(k + sn.pattern.len());
        for range in &sn.owned_ranges {
            frontal_row_indices.extend(range.clone());
        }
        frontal_row_indices.extend_from_slice(&delayed_cols_buf);
        frontal_row_indices.extend_from_slice(&sn.pattern);
        let m = frontal_row_indices.len();
        #[cfg(feature = "diagnostic")]
        let zero_start = std::time::Instant::now();
        // Only `k` columns of rectangular storage are needed per node.
        if m > l_storage.nrows() || k > l_storage.ncols() {
            l_storage = Mat::zeros(m, k.max(l_storage.ncols()));
        }
        for j in 0..k {
            l_storage.col_as_slice_mut(j)[0..m].fill(0.0);
        }
        #[cfg(feature = "diagnostic")]
        let zero_time = zero_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let g2l_start = std::time::Instant::now();
        for (i, &global) in frontal_row_indices.iter().enumerate() {
            global_to_local[global] = i;
        }
        #[cfg(feature = "diagnostic")]
        let g2l_time = g2l_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let assembly_start = std::time::Instant::now();
        #[cfg(feature = "diagnostic")]
        let scatter_start = std::time::Instant::now();
        let ndelay_in = delayed_cols_buf.len();
        let total_owned = sn_ncols;
        let nfs = m - k;
        // The trailing block is only materialized when it will feed a parent.
        let has_nfs = nfs > 0 && sn.parent.is_some();
        if has_nfs {
            if contrib_buffer.nrows() < nfs || contrib_buffer.ncols() < nfs {
                contrib_buffer = Mat::zeros(
                    nfs.max(contrib_buffer.nrows()),
                    nfs.max(contrib_buffer.ncols()),
                );
            }
            for j in 0..nfs {
                contrib_buffer.col_as_slice_mut(j)[0..nfs].fill(0.0);
            }
        }
        if ndelay_in == 0 {
            // Fast path via precomputed map; entries landing in the trailing
            // block are routed to `contrib_buffer` instead of `l_storage`.
            let amap_start = assembly_maps.amap_offsets[node_id];
            let amap_end = assembly_maps.amap_offsets[node_id + 1];
            let amap = &assembly_maps.amap_entries[amap_start * 4..amap_end * 4];
            if let Some(sc) = scaling {
                for entry in amap.chunks_exact(4) {
                    let src_idx = entry[0] as usize;
                    let dest_linear = entry[1] as usize;
                    let scale_row = entry[2] as usize;
                    let scale_col = entry[3] as usize;
                    // Symmetric scaling: D * A * D.
                    let val = values[src_idx] * sc[scale_row] * sc[scale_col];
                    let amap_col = dest_linear / m;
                    let amap_row = dest_linear % m;
                    if amap_col < k {
                        l_storage[(amap_row, amap_col)] += val;
                    } else if has_nfs && amap_row >= k {
                        contrib_buffer[(amap_row - k, amap_col - k)] += val;
                    }
                }
            } else {
                for entry in amap.chunks_exact(4) {
                    let src_idx = entry[0] as usize;
                    let dest_linear = entry[1] as usize;
                    let amap_col = dest_linear / m;
                    let amap_row = dest_linear % m;
                    if amap_col < k {
                        l_storage[(amap_row, amap_col)] += values[src_idx];
                    } else if has_nfs && amap_row >= k {
                        contrib_buffer[(amap_row - k, amap_col - k)] += values[src_idx];
                    }
                }
            }
        } else {
            // General scatter, mirroring `build_assembly_maps` but writing
            // directly into the split rectangular layout.
            for range in &sn.owned_ranges {
                for pj in range.clone() {
                    let orig_col = perm_fwd[pj];
                    let local_col = global_to_local[pj];
                    let start = col_ptrs[orig_col];
                    let end = col_ptrs[orig_col + 1];
                    for idx in start..end {
                        let orig_row = row_indices_csc[idx];
                        if orig_row < orig_col {
                            let perm_row = perm_inv[orig_row];
                            let local_peer = global_to_local[perm_row];
                            // Mirror of an owned-column pair: scattered when
                            // the other column is processed.
                            if local_peer != NOT_IN_FRONT && local_peer < total_owned {
                                continue;
                            }
                        }
                        let global_row = perm_inv[orig_row];
                        let local_row = global_to_local[global_row];
                        if local_row == NOT_IN_FRONT {
                            continue;
                        }
                        // Rows in [total_owned, k) are delayed columns; their
                        // values arrive via the child contribution blocks.
                        if local_row >= total_owned && local_row < k {
                            continue;
                        }
                        let mut val = values[idx];
                        if let Some(s) = scaling {
                            // perm_inv[orig_col] == pj, so this matches the
                            // fast path's symmetric scaling.
                            val *= s[perm_inv[orig_row]] * s[perm_inv[orig_col]];
                        }
                        let (r, c) = if local_row >= local_col {
                            (local_row, local_col)
                        } else {
                            (local_col, local_row)
                        };
                        if c < k {
                            l_storage[(r, c)] += val;
                        } else if has_nfs && r >= k {
                            contrib_buffer[(r - k, c - k)] += val;
                        }
                    }
                }
            }
        }
        #[cfg(feature = "diagnostic")]
        let scatter_time = scatter_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let extend_add_start = std::time::Instant::now();
        // Extend-add child contributions into the split layout.
        for &c in &children[node_id] {
            if let Some(cb) = contributions[c].take() {
                let cb_size = cb.row_indices.len();
                for i in 0..cb_size {
                    let gi = cb.row_indices[i];
                    let li = global_to_local[gi];
                    debug_assert!(li != NOT_IN_FRONT, "child row {} not in parent", gi);
                    // Child block is lower-triangular: columns 0..=i.
                    for j in 0..=i {
                        let gj = cb.row_indices[j];
                        let lj = global_to_local[gj];
                        debug_assert!(lj != NOT_IN_FRONT, "child col {} not in parent", gj);
                        let val = cb.data[(i, j)];
                        if val != 0.0 {
                            // Keep the destination in the lower triangle.
                            let (li, lj) = if li >= lj { (li, lj) } else { (lj, li) };
                            if lj < k {
                                l_storage[(li, lj)] += val;
                            } else if has_nfs {
                                contrib_buffer[(li - k, lj - k)] += val;
                            }
                        }
                    }
                }
                // With no trailing block of our own, recycle the largest
                // child buffer for this node's eventual contribution.
                if !has_nfs && cb.data.nrows() >= contrib_buffer.nrows() {
                    contrib_buffer = cb.data;
                }
            }
        }
        #[cfg(feature = "diagnostic")]
        let extend_add_time = extend_add_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let assembly_time = assembly_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let kernel_start = std::time::Instant::now();
        // Rectangular pivoted kernel over the m x k fully-summed columns.
        let result = tpp_factor_rect(l_storage.as_mut().submatrix_mut(0, 0, m, k), k, options)?;
        let ne = result.num_eliminated;
        #[cfg(feature = "diagnostic")]
        let kernel_time = kernel_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let mut contrib_gemm_time = std::time::Duration::ZERO;
        #[cfg(feature = "diagnostic")]
        let mut extract_contrib_time = std::time::Duration::ZERO;
        let contribution = if sn.parent.is_some() && ne < m {
            if contrib_buffer.nrows() < m || contrib_buffer.ncols() < m {
                // Grow the buffer, preserving the already-accumulated
                // lower-triangular nfs x nfs block.
                let new_size = m.max(contrib_buffer.nrows());
                let mut new_buf = Mat::zeros(new_size, new_size);
                if nfs > 0 {
                    for j in 0..nfs {
                        let col_len = nfs - j;
                        let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
                        new_buf.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
                    }
                }
                contrib_buffer = new_buf;
            }
            #[cfg(feature = "diagnostic")]
            let gemm_start = std::time::Instant::now();
            if nfs > 0 && ne > 0 {
                compute_contribution_gemm_rect(
                    &l_storage,
                    k,
                    ne,
                    m,
                    &result.d,
                    &mut contrib_buffer,
                    &mut ld_workspace,
                    Par::Seq,
                );
            }
            #[cfg(feature = "diagnostic")]
            {
                contrib_gemm_time = gemm_start.elapsed();
            }
            #[cfg(feature = "diagnostic")]
            let extract_contrib_start = std::time::Instant::now();
            // Buffer ownership moves into the contribution block.
            let cb = extract_contribution_rect(
                &l_storage,
                m,
                k,
                &frontal_row_indices,
                &result,
                std::mem::replace(&mut contrib_buffer, Mat::new()),
            );
            #[cfg(feature = "diagnostic")]
            {
                extract_contrib_time = extract_contrib_start.elapsed();
            }
            Some(cb)
        } else {
            None
        };
        #[cfg(feature = "diagnostic")]
        let extract_factors_start = std::time::Instant::now();
        let ff = extract_front_factors_rect(&l_storage, m, k, &frontal_row_indices, &result);
        #[cfg(feature = "diagnostic")]
        let extract_factors_time = extract_factors_start.elapsed();
        let stats = PerSupernodeStats {
            snode_id: node_id,
            front_size: m,
            num_fully_summed: k,
            num_eliminated: ne,
            num_delayed: k - ne,
            num_1x1: result.stats.num_1x1,
            num_2x2: result.stats.num_2x2,
            max_l_entry: result.stats.max_l_entry,
            #[cfg(feature = "diagnostic")]
            assembly_time,
            #[cfg(feature = "diagnostic")]
            kernel_time,
            #[cfg(feature = "diagnostic")]
            extraction_time: extract_factors_time + extract_contrib_time,
            #[cfg(feature = "diagnostic")]
            zero_time,
            #[cfg(feature = "diagnostic")]
            g2l_time,
            #[cfg(feature = "diagnostic")]
            scatter_time,
            #[cfg(feature = "diagnostic")]
            extend_add_time,
            #[cfg(feature = "diagnostic")]
            extract_factors_time,
            #[cfg(feature = "diagnostic")]
            extract_contrib_time,
            #[cfg(feature = "diagnostic")]
            contrib_gemm_time,
            // Filled in below once the reset has been timed.
            #[cfg(feature = "diagnostic")]
            g2l_reset_time: std::time::Duration::ZERO,
        };
        #[cfg(feature = "diagnostic")]
        let g2l_reset_start = std::time::Instant::now();
        // Restore the shared global-to-local map for the next node.
        for &global in &frontal_row_indices[..m] {
            global_to_local[global] = NOT_IN_FRONT;
        }
        #[cfg(feature = "diagnostic")]
        let stats = {
            let mut s = stats;
            s.g2l_reset_time = g2l_reset_start.elapsed();
            s
        };
        contributions[node_id] = contribution;
        results.push((node_id, ff, stats));
    }
    Ok(results)
}
/// Factors the supernodal elimination tree with level-set scheduling.
///
/// Small leaf subtrees are factored first as sequential units; the remaining
/// supernodes are then processed in batches of "ready" nodes (all children
/// already factored). Each batch runs sequentially or on rayon depending on
/// `options.par`. Returns `(supernode_id, factors, stats)` triples in
/// completion order.
#[allow(clippy::too_many_arguments)]
fn factor_tree_levelset(
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    n: usize,
    assembly_maps: &AssemblyMaps,
    small_leaf_subtrees: &[SmallLeafSubtree],
) -> Result<Vec<(usize, FrontFactors, PerSupernodeStats)>, SparseError> {
    let n_supernodes = supernodes.len();
    // Contribution block of each factored supernode, held until its parent
    // takes it during assembly.
    let mut contributions: Vec<Option<ContributionBlock>> =
        (0..n_supernodes).map(|_| None).collect();
    // Countdown of unfactored children per node; a node is ready at zero.
    let mut remaining_children: Vec<usize> = children.iter().map(|c| c.len()).collect();
    let mut all_results: Vec<(usize, FrontFactors, PerSupernodeStats)> =
        Vec::with_capacity(n_supernodes);
    // Phase 1: each small leaf subtree is factored bottom-up in one call.
    for subtree in small_leaf_subtrees {
        let subtree_results = factor_small_leaf_subtree(
            subtree,
            supernodes,
            children,
            matrix,
            perm_fwd,
            perm_inv,
            options,
            scaling,
            n,
            &mut contributions,
            assembly_maps,
        )?;
        all_results.extend(subtree_results);
        // The subtree root counts as one finished child of its parent.
        if let Some(parent) = subtree.parent_of_root {
            remaining_children[parent] -= 1;
        }
    }
    let is_parallel = !matches!(options.par, Par::Seq);
    let max_front = estimate_max_front_size(supernodes);
    // Sequential runs use one preallocated workspace; parallel runs draw
    // lazily-grown workspaces from a mutex-guarded pool instead.
    let mut seq_workspace = if !is_parallel {
        FactorizationWorkspace::new(max_front, n)
    } else {
        FactorizationWorkspace::empty()
    };
    let workspace_pool: std::sync::Mutex<Vec<FactorizationWorkspace>> =
        std::sync::Mutex::new(Vec::new());
    // Initial ready set: childless supernodes not already handled in phase 1.
    let mut ready: Vec<usize> = (0..n_supernodes)
        .filter(|&s| remaining_children[s] == 0 && !supernodes[s].in_small_leaf)
        .collect();
    while !ready.is_empty() {
        // Detach each ready node's child contributions up front so the batch
        // can run without borrowing `contributions`.
        let mut batch_inputs: Vec<(usize, Vec<Option<ContributionBlock>>)> =
            Vec::with_capacity(ready.len());
        for &s in &ready {
            let child_contribs: Vec<Option<ContributionBlock>> = children[s]
                .iter()
                .map(|&c| contributions[c].take())
                .collect();
            batch_inputs.push((s, child_contribs));
        }
        let batch_results: Vec<SupernodeResult> = if !is_parallel {
            batch_inputs
                .into_iter()
                .map(|(s, contribs)| {
                    factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut seq_workspace,
                        assembly_maps,
                    )
                })
                .collect::<Result<_, _>>()?
        } else if batch_inputs.len() == 1 {
            // Single-node level (typically near the root): run on this thread
            // with one pooled workspace.
            let mut pool = workspace_pool.lock().unwrap();
            let mut ws = pool.pop().unwrap_or_else(FactorizationWorkspace::empty);
            // NOTE(review): dropping the remaining pooled workspaces here looks
            // like deliberate memory reclamation once parallelism has collapsed
            // to a single chain — confirm this is intended.
            pool.clear();
            drop(pool);
            ws.ensure_g2l(n);
            let results = batch_inputs
                .into_iter()
                .map(|(s, contribs)| {
                    factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut ws,
                        assembly_maps,
                    )
                })
                .collect::<Result<_, _>>();
            // Return the workspace to the pool before propagating any error.
            workspace_pool.lock().unwrap().push(ws);
            results?
        } else {
            use rayon::iter::{IntoParallelIterator, ParallelIterator};
            batch_inputs
                .into_par_iter()
                .map(|(s, contribs)| {
                    // Each task borrows a workspace from the shared pool (or
                    // starts an empty one) and returns it when done.
                    let mut ws = workspace_pool
                        .lock()
                        .unwrap()
                        .pop()
                        .unwrap_or_else(FactorizationWorkspace::empty);
                    ws.ensure_g2l(n);
                    let result = factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut ws,
                        assembly_maps,
                    );
                    workspace_pool.lock().unwrap().push(ws);
                    result
                })
                .collect::<Result<_, _>>()?
        };
        // Record the batch and compute the next level's ready set.
        let mut next_ready = Vec::new();
        for (result, &s) in batch_results.into_iter().zip(ready.iter()) {
            contributions[s] = result.contribution;
            all_results.push((s, result.ff, result.stats));
            if let Some(parent) = supernodes[s].parent {
                remaining_children[parent] -= 1;
                if remaining_children[parent] == 0 {
                    next_ready.push(parent);
                }
            }
        }
        ready = next_ready;
    }
    Ok(all_results)
}
/// Builds per-supernode metadata from the symbolic factorization.
///
/// Supernodal symbolic data yields one entry per supernode; simplicial data
/// degenerates to one single-column "supernode" per matrix column.
#[allow(clippy::single_range_in_vec_init)]
fn build_supernode_info(symbolic: &AptpSymbolic) -> Vec<SupernodeInfo> {
    match symbolic.raw() {
        SymbolicCholeskyRaw::Supernodal(sn) => {
            let begin = sn.supernode_begin();
            let end = sn.supernode_end();
            let mut infos = Vec::with_capacity(sn.n_supernodes());
            for s in 0..sn.n_supernodes() {
                infos.push(SupernodeInfo {
                    col_begin: begin[s],
                    col_end: end[s],
                    pattern: sn.supernode(s).pattern().to_vec(),
                    parent: symbolic.supernode_parent(s),
                    owned_ranges: vec![begin[s]..end[s]],
                    in_small_leaf: false,
                });
            }
            infos
        }
        SymbolicCholeskyRaw::Simplicial(simp) => {
            let n = symbolic.nrows();
            let etree = symbolic.etree();
            let l_symbolic = simp.factor();
            let col_ptr = l_symbolic.col_ptr();
            let row_idx = l_symbolic.row_idx();
            let mut infos = Vec::with_capacity(n);
            for j in 0..n {
                // Strictly-below-diagonal structure of column j of L.
                let pattern: Vec<usize> = row_idx[col_ptr[j]..col_ptr[j + 1]]
                    .iter()
                    .copied()
                    .filter(|&r| r > j)
                    .collect();
                // The elimination tree stores a negative value for roots.
                let parent = (etree[j] >= 0).then(|| etree[j] as usize);
                infos.push(SupernodeInfo {
                    col_begin: j,
                    col_end: j + 1,
                    pattern,
                    parent,
                    owned_ranges: vec![j..j + 1],
                    in_small_leaf: false,
                });
            }
            infos
        }
    }
}
/// Inverts the parent pointers into a child-list adjacency: `children[p]`
/// lists every supernode whose parent is `p`, in ascending id order.
fn build_children_map(infos: &[SupernodeInfo]) -> Vec<Vec<usize>> {
    infos
        .iter()
        .enumerate()
        .filter_map(|(child, info)| info.parent.map(|p| (p, child)))
        .fold(vec![Vec::new(); infos.len()], |mut children, (p, child)| {
            children[p].push(child);
            children
        })
}
/// Marks small subtrees of the elimination tree and groups them into
/// `SmallLeafSubtree`s so they can be factored sequentially as a unit.
///
/// A supernode qualifies when its front is strictly smaller than `threshold`
/// and all of its children qualify; `threshold == 0` disables the
/// optimization entirely.
/// NOTE(review): the single forward marking pass assumes supernode ids are a
/// topological order (every child index < its parent's index) — confirm this
/// invariant holds for the symbolic phase's numbering.
fn classify_small_leaf_subtrees(
    supernodes: &mut [SupernodeInfo],
    children_map: &[Vec<usize>],
    threshold: usize,
) -> Vec<SmallLeafSubtree> {
    if threshold == 0 {
        return Vec::new();
    }
    let n_supernodes = supernodes.len();
    // Pass 1: bottom-up qualification (children visited before parents).
    for s in 0..n_supernodes {
        let owned_cols: usize = supernodes[s].owned_ranges.iter().map(|r| r.len()).sum();
        let front_size = owned_cols + supernodes[s].pattern.len();
        if front_size >= threshold {
            supernodes[s].in_small_leaf = false;
            continue;
        }
        if children_map[s].is_empty() {
            supernodes[s].in_small_leaf = true;
        } else if children_map[s].iter().all(|&c| supernodes[c].in_small_leaf) {
            supernodes[s].in_small_leaf = true;
        } else {
            supernodes[s].in_small_leaf = false;
        }
    }
    // Pass 2: collect each maximal marked subtree, rooted where the parent is
    // unmarked (or absent).
    let mut subtrees = Vec::new();
    for s in 0..n_supernodes {
        if !supernodes[s].in_small_leaf {
            continue;
        }
        let is_root = match supernodes[s].parent {
            None => true,
            Some(p) => !supernodes[p].in_small_leaf,
        };
        if !is_root {
            continue;
        }
        // DFS from the root; reversing the pre-order via the second stack
        // yields children-before-parents, the order factorization needs.
        let mut nodes = Vec::new();
        let mut stack = vec![s];
        let mut visit_stack = Vec::new();
        while let Some(node) = stack.pop() {
            visit_stack.push(node);
            for &c in &children_map[node] {
                if supernodes[c].in_small_leaf {
                    stack.push(c);
                }
            }
        }
        while let Some(node) = visit_stack.pop() {
            nodes.push(node);
        }
        // A subtree of one node gains nothing; unmark it so it goes through
        // the regular per-supernode path.
        if nodes.len() < 2 {
            supernodes[s].in_small_leaf = false;
            continue;
        }
        // Largest front in the subtree sizes the shared workspace.
        let max_front_size = nodes
            .iter()
            .map(|&node| {
                let owned: usize = supernodes[node].owned_ranges.iter().map(|r| r.len()).sum();
                owned + supernodes[node].pattern.len()
            })
            .max()
            .unwrap_or(0);
        subtrees.push(SmallLeafSubtree {
            root: s,
            nodes,
            max_front_size,
            parent_of_root: supernodes[s].parent,
        });
    }
    subtrees
}
/// Scatters the original matrix entries of the owned (pivotal) columns into
/// the lower triangle of the frontal matrix, applying optional symmetric
/// scaling.
///
/// Entries whose row falls outside the front, or belongs to a delayed
/// (fully-summed but not owned) column, are skipped; symmetric duplicates
/// stored in the upper triangle are deduplicated against their owned partner.
///
/// Fix vs. previous revision: `perm_inv[orig_row]` was recomputed up to three
/// times per entry and the column scale factor per entry; both are now
/// computed once (behavior unchanged).
fn scatter_original_entries_multi(
    frontal: &mut FrontalMatrix<'_>,
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    global_to_local: &[usize],
    owned_ranges: &[std::ops::Range<usize>],
    scaling: Option<&[f64]>,
) {
    let symbolic = matrix.symbolic();
    let col_ptrs = symbolic.col_ptr();
    let row_indices_csc = symbolic.row_idx();
    let values = matrix.val();
    let total_owned: usize = owned_ranges.iter().map(|r| r.len()).sum();
    let k = frontal.num_fully_summed;
    for range in owned_ranges {
        for pj in range.clone() {
            let orig_col = perm_fwd[pj];
            let local_col = global_to_local[pj];
            // Column scale factor is invariant over the column's entries;
            // hoist it out of the inner loop. 1.0 is a placeholder only used
            // when `scaling` is Some below.
            let col_scale = scaling.map_or(1.0, |s| s[perm_inv[orig_col]]);
            let start = col_ptrs[orig_col];
            let end = col_ptrs[orig_col + 1];
            for idx in start..end {
                let orig_row = row_indices_csc[idx];
                // Permuted (global) row index, computed once and reused.
                let global_row = perm_inv[orig_row];
                let local_row = global_to_local[global_row];
                if local_row == NOT_IN_FRONT {
                    // Row does not appear in this front at all.
                    continue;
                }
                if orig_row < orig_col && local_row < total_owned {
                    // Upper-triangle entry whose symmetric partner is an owned
                    // column of this front: skip so the pair is added once.
                    continue;
                }
                if local_row >= total_owned && local_row < k {
                    // Row belongs to a delayed (fully-summed but not owned)
                    // column; its entries are assembled elsewhere.
                    continue;
                }
                let mut val = values[idx];
                if let Some(s) = scaling {
                    val *= s[global_row] * col_scale;
                }
                // Store into the lower triangle of the frontal matrix.
                if local_row >= local_col {
                    frontal.data[(local_row, local_col)] += val;
                } else {
                    frontal.data[(local_col, local_row)] += val;
                }
            }
        }
    }
}
/// Adds a child's contribution block into the parent front, translating the
/// child's global row indices to parent-local positions via
/// `global_to_local`. Only nonzero lower-triangle entries are scattered.
/// Returns the child's buffer so the caller can recycle its allocation.
pub(crate) fn extend_add(
    parent: &mut FrontalMatrix<'_>,
    child: ContributionBlock,
    global_to_local: &[usize],
) -> Mat<f64> {
    let size = child.row_indices.len();
    for (j, &gj) in child.row_indices.iter().enumerate() {
        let lj = global_to_local[gj];
        debug_assert!(
            lj != NOT_IN_FRONT,
            "extend_add: child col {} not in parent",
            gj
        );
        // Walk the lower-triangular part of column j.
        for i in j..size {
            let val = child.data[(i, j)];
            if val == 0.0 {
                continue;
            }
            let gi = child.row_indices[i];
            let li = global_to_local[gi];
            debug_assert!(
                li != NOT_IN_FRONT,
                "extend_add: child row {} not in parent",
                gi
            );
            // Target the parent's lower triangle regardless of local order.
            let (r, c) = if li >= lj { (li, lj) } else { (lj, li) };
            parent.data[(r, c)] += val;
        }
    }
    child.data
}
/// Variant of [`extend_add`] that uses a precomputed child-to-parent row map
/// (`ea_row_map[i]` = parent-local row of child row `i`) instead of a
/// `global_to_local` lookup per entry. Returns the child buffer for reuse.
fn extend_add_mapped(
    parent: &mut FrontalMatrix<'_>,
    child: ContributionBlock,
    ea_row_map: &[u32],
) -> Mat<f64> {
    let cb_size = child.row_indices.len();
    debug_assert!(
        ea_row_map.len() >= cb_size,
        "extend_add_mapped: ea_row_map len {} < cb_size {}",
        ea_row_map.len(),
        cb_size
    );
    for j in 0..cb_size {
        let lj = ea_row_map[j] as usize;
        // Lower-triangular part of child column j: rows j..cb_size.
        let child_col = child.data.col_as_slice(j);
        let row_map = &ea_row_map[j..cb_size];
        let src = &child_col[j..cb_size];
        let n = cb_size - j;
        // Manual 4x unroll of the hot scatter loop; the tail is handled by
        // the trailing loop below.
        let n4 = n / 4 * 4;
        let mut k = 0;
        while k < n4 {
            ea_scatter_one(&mut parent.data, row_map[k] as usize, lj, src[k]);
            ea_scatter_one(&mut parent.data, row_map[k + 1] as usize, lj, src[k + 1]);
            ea_scatter_one(&mut parent.data, row_map[k + 2] as usize, lj, src[k + 2]);
            ea_scatter_one(&mut parent.data, row_map[k + 3] as usize, lj, src[k + 3]);
            k += 4;
        }
        while k < n {
            ea_scatter_one(&mut parent.data, row_map[k] as usize, lj, src[k]);
            k += 1;
        }
    }
    child.data
}
/// Accumulates `val` into the lower triangle of `parent`, flipping the
/// coordinates when the target would land above the diagonal.
#[inline(always)]
fn ea_scatter_one(parent: &mut MatMut<'_, f64>, li: usize, lj: usize, val: f64) {
    let (row, col) = if li >= lj { (li, lj) } else { (lj, li) };
    parent[(row, col)] += val;
}
/// Extracts the factored panels of a front into a `FrontFactors`:
/// `l11` (unit lower-triangular eliminated block), `d11` (mixed 1x1 / 2x2
/// diagonal), `l21` (all rows below the eliminated block, delayed rows
/// included), plus the global column/row index lists in pivot order.
pub(crate) fn extract_front_factors(
    frontal_data: &Mat<f64>,
    m: usize,
    k: usize,
    frontal_row_indices: &[usize],
    result: &AptpFactorResult,
) -> FrontFactors {
    let ne = result.num_eliminated;
    // L11: copy the strictly-lower entries column by column, putting 1.0 on
    // the diagonal; a 2x2 pivot fills both of its columns at once.
    let l11 = if ne > 0 {
        let mut l = Mat::zeros(ne, ne);
        let mut col = 0;
        while col < ne {
            l[(col, col)] = 1.0;
            match result.d.pivot_type(col) {
                PivotType::OneByOne => {
                    let n_entries = ne - (col + 1);
                    if n_entries > 0 {
                        let src = &frontal_data.col_as_slice(col)[col + 1..ne];
                        l.col_as_slice_mut(col)[col + 1..ne].copy_from_slice(src);
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    // Leading column of a 2x2 pivot: fill both columns below
                    // the 2x2 diagonal block.
                    l[(col + 1, col + 1)] = 1.0;
                    let n_entries = ne - (col + 2);
                    if n_entries > 0 {
                        let src0 = &frontal_data.col_as_slice(col)[col + 2..ne];
                        l.col_as_slice_mut(col)[col + 2..ne].copy_from_slice(src0);
                        let src1 = &frontal_data.col_as_slice(col + 1)[col + 2..ne];
                        l.col_as_slice_mut(col + 1)[col + 2..ne].copy_from_slice(src1);
                    }
                    col += 2;
                }
                PivotType::TwoByTwo { .. } => {
                    // Trailing column of a 2x2 pivot (defensive: normally
                    // skipped since the leading column advances by 2).
                    col += 1;
                }
                PivotType::Delayed => {
                    unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
                }
            }
        }
        l
    } else {
        Mat::zeros(0, 0)
    };
    // D11: rebuild the mixed diagonal from the factorization result.
    let mut d11 = MixedDiagonal::new(ne);
    let mut col = 0;
    while col < ne {
        match result.d.pivot_type(col) {
            PivotType::OneByOne => {
                d11.set_1x1(col, result.d.diagonal_1x1(col));
                col += 1;
            }
            PivotType::TwoByTwo { partner: _ } => {
                let block = result.d.diagonal_2x2(col);
                d11.set_2x2(Block2x2 {
                    first_col: col,
                    a: block.a,
                    b: block.b,
                    c: block.c,
                });
                col += 2;
            }
            PivotType::Delayed => {
                unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
            }
        }
    }
    // L21: every row below the eliminated block — delayed rows first, then
    // the non-fully-summed rows — hence r = m - ne rather than m - k.
    let r = m - ne;
    let l21 = if ne > 0 && r > 0 {
        let mut l = Mat::zeros(r, ne);
        for j in 0..ne {
            let src = &frontal_data.col_as_slice(j)[ne..m];
            l.col_as_slice_mut(j)[..r].copy_from_slice(src);
        }
        l
    } else {
        Mat::zeros(r, ne)
    };
    // Map local pivoted positions back to global indices.
    let local_perm = result.perm[..k].to_vec();
    let col_indices: Vec<usize> = local_perm[..ne]
        .iter()
        .map(|&lp| frontal_row_indices[lp])
        .collect();
    // Row indices: delayed rows (in pivot order), then the trailing rows.
    let mut row_indices = Vec::with_capacity(m - ne);
    for &lp in &result.perm[ne..k] {
        row_indices.push(frontal_row_indices[lp]);
    }
    row_indices.extend_from_slice(&frontal_row_indices[k..m]);
    FrontFactors {
        l11,
        d11,
        l21,
        local_perm,
        num_eliminated: ne,
        col_indices,
        row_indices,
    }
}
/// Builds the contribution block handed up to the parent front.
///
/// When pivots were delayed (`k - ne > 0`) the block is re-packed so the
/// delayed rows/columns come first, followed by the Schur complement of the
/// non-fully-summed part already accumulated in `contrib_buffer`; with no
/// delays, `contrib_buffer` is forwarded as-is.
pub(crate) fn extract_contribution(
    frontal_data: &Mat<f64>,
    m: usize,
    k: usize,
    frontal_row_indices: &[usize],
    result: &AptpFactorResult,
    mut contrib_buffer: Mat<f64>,
) -> ContributionBlock {
    let ne = result.num_eliminated;
    let size = m - ne;
    let num_delayed = k - ne;
    let nfs = m - k;
    if num_delayed > 0 {
        let mut data = Mat::zeros(size, size);
        // Delayed-by-delayed block (lower triangle).
        for j in 0..num_delayed {
            let col_len = num_delayed - j;
            let src = &frontal_data.col_as_slice(ne + j)[ne + j..ne + j + col_len];
            data.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
        }
        // Non-fully-summed rows of the delayed columns.
        for j in 0..num_delayed {
            let src = &frontal_data.col_as_slice(ne + j)[k..m];
            data.col_as_slice_mut(j)[num_delayed..size].copy_from_slice(src);
        }
        // Existing Schur-complement block, shifted past the delayed columns.
        for j in 0..nfs {
            let col_len = nfs - j;
            let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
            data.col_as_slice_mut(num_delayed + j)[num_delayed + j..size].copy_from_slice(src);
        }
        contrib_buffer = data;
    }
    // Global row indices: delayed rows in pivot order, then trailing rows.
    let mut row_indices = Vec::with_capacity(size);
    for &lp in &result.perm[ne..k] {
        row_indices.push(frontal_row_indices[lp]);
    }
    row_indices.extend_from_slice(&frontal_row_indices[k..m]);
    ContributionBlock {
        data: contrib_buffer,
        row_indices,
        num_delayed,
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::symmetric::factor::AptpOptions;
/// Expands a lower-triangular matrix into a full symmetric frontal buffer,
/// passing `row_indices` and `k` through unchanged for destructuring at the
/// call site.
fn make_frontal_data(
    lower: &Mat<f64>,
    k: usize,
    row_indices: Vec<usize>,
) -> (Mat<f64>, Vec<usize>, usize) {
    let m = lower.nrows();
    assert_eq!(lower.ncols(), m);
    assert_eq!(row_indices.len(), m);
    assert!(k <= m);
    // Mirror the lower triangle across the diagonal.
    let mut data = Mat::zeros(m, m);
    for j in 0..m {
        for i in j..m {
            let v = lower[(i, j)];
            data[(i, j)] = v;
            data[(j, i)] = v;
        }
    }
    (data, row_indices, k)
}
/// Builds an indefinite test matrix (lower triangle) whose rows listed in
/// `delay_rows` (within the fully-summed range `..k`) are amplified by 1000
/// so the pivoting kernel is forced to delay them.
fn make_delay_matrix(m: usize, k: usize, delay_rows: &[usize]) -> Mat<f64> {
    let mut a = Mat::zeros(m, m);
    for i in 0..m {
        // Alternating-sign diagonal keeps the matrix indefinite.
        a[(i, i)] = if i % 2 == 0 { 4.0 } else { -4.0 };
        for j in 0..i {
            a[(i, j)] = 0.5 / ((i - j) as f64 + 1.0);
        }
    }
    // Amplify the selected rows/columns to provoke pivot delays.
    for &r in delay_rows.iter().filter(|&&r| r < k) {
        for j in 0..r {
            a[(r, j)] *= 1000.0;
        }
        for i in (r + 1)..m {
            a[(i, r)] *= 1000.0;
        }
    }
    a
}
// L21 must cover delayed rows as well as the non-fully-summed rows, so its
// height is m - ne (not m - k); the index lists must match those shapes.
#[test]
fn test_extract_front_factors_l21_includes_delayed_rows() {
    let m = 12;
    let k = 8;
    let delay_rows = &[1, 5];
    let lower = make_delay_matrix(m, k, delay_rows);
    let row_indices: Vec<usize> = (100..100 + m).collect();
    let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
    let options = AptpOptions::default();
    let result =
        aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
    let ne = result.num_eliminated;
    let num_delayed = k - ne;
    if num_delayed > 0 {
        let ff = extract_front_factors(&data, m, k, &row_indices, &result);
        assert_eq!(
            ff.l21.nrows(),
            m - ne,
            "L21 should have m-ne={} rows (not m-k={}), ne={}, delays={}",
            m - ne,
            m - k,
            ne,
            num_delayed
        );
        assert_eq!(
            ff.row_indices.len(),
            ff.l21.nrows(),
            "row_indices length must match L21 rows"
        );
        assert_eq!(ff.col_indices.len(), ne);
    } else {
        // No delays occurred: the shape invariants still hold trivially.
        let ff = extract_front_factors(&data, m, k, &row_indices, &result);
        assert_eq!(ff.l21.nrows(), m - ne);
        assert_eq!(ff.row_indices.len(), ff.l21.nrows());
    }
}
// The delayed and trailing row index lists produced by extract_front_factors
// and extract_contribution must agree, since the parent assembles both
// against the same local positions.
#[test]
fn test_extract_front_factors_contribution_row_indices_consistent() {
    let m = 10;
    let k = 6;
    let delay_rows = &[2];
    let lower = make_delay_matrix(m, k, delay_rows);
    let row_indices: Vec<usize> = (200..200 + m).collect();
    let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
    let options = AptpOptions::default();
    let result =
        aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
    let ne = result.num_eliminated;
    let num_delayed = k - ne;
    if num_delayed > 0 {
        let ff = extract_front_factors(&data, m, k, &row_indices, &result);
        let nfs = m - k;
        let mut contrib_buf = Mat::zeros(m, m);
        if nfs > 0 {
            let mut ld_ws = Mat::new();
            compute_contribution_gemm(
                &data,
                k,
                ne,
                m,
                &result.d,
                &mut contrib_buf,
                &mut ld_ws,
                Par::Seq,
            );
        }
        let cb = extract_contribution(&data, m, k, &row_indices, &result, contrib_buf);
        assert_eq!(cb.num_delayed, num_delayed);
        // Delayed rows come first in both index lists.
        let ff_delayed_indices = &ff.row_indices[..num_delayed];
        let cb_delayed_indices = &cb.row_indices[..num_delayed];
        assert_eq!(
            ff_delayed_indices, cb_delayed_indices,
            "Delayed row indices must be identical between extract_front_factors \
             and extract_contribution.\n front_factors: {:?}\n contribution: {:?}",
            ff_delayed_indices, cb_delayed_indices
        );
        let ff_nfs_indices = &ff.row_indices[num_delayed..];
        let cb_nfs_indices = &cb.row_indices[num_delayed..];
        assert_eq!(
            ff_nfs_indices, cb_nfs_indices,
            "Non-fully-summed row indices must match"
        );
    }
}
// After the triangular solve the delayed rows of L21 should contain fill-in;
// an all-zero delayed block would mean those rows were skipped.
#[test]
fn test_extract_front_factors_delayed_l21_entries_populated() {
    let m = 12;
    let k = 8;
    let delay_rows = &[0, 3, 5];
    let lower = make_delay_matrix(m, k, delay_rows);
    let row_indices: Vec<usize> = (0..m).collect();
    let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
    let options = AptpOptions::default();
    let result =
        aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
    let ne = result.num_eliminated;
    let num_delayed = k - ne;
    if num_delayed > 0 && ne > 0 {
        let ff = extract_front_factors(&data, m, k, &row_indices, &result);
        // The first num_delayed rows of L21 are the delayed rows.
        let delayed_l21 = ff.l21.as_ref().subrows(0, num_delayed);
        let mut has_nonzero = false;
        for i in 0..delayed_l21.nrows() {
            for j in 0..delayed_l21.ncols() {
                if delayed_l21[(i, j)].abs() > 1e-16 {
                    has_nonzero = true;
                }
            }
        }
        assert!(
            has_nonzero,
            "L21 delayed rows should have non-zero entries from TRSM. \
             ne={}, delays={}, L21 shape=({},{})",
            ne,
            num_delayed,
            ff.l21.nrows(),
            ff.l21.ncols()
        );
    }
}
// Reconstructs L11*D11*L11^T and compares it against the pivoted original
// matrix restricted to the eliminated columns.
#[test]
fn test_extract_front_factors_reconstruction_with_delays() {
    let m = 16;
    let k = 10;
    let delay_rows = &[1, 4, 7];
    let lower = make_delay_matrix(m, k, delay_rows);
    let row_indices: Vec<usize> = (0..m).collect();
    // Keep a symmetric copy of the input for the comparison below.
    let mut original = Mat::zeros(m, m);
    for i in 0..m {
        for j in 0..=i {
            original[(i, j)] = lower[(i, j)];
            original[(j, i)] = lower[(i, j)];
        }
    }
    let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
    let options = AptpOptions::default();
    let result =
        aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
    let ne = result.num_eliminated;
    if ne == 0 {
        // Nothing eliminated; nothing to reconstruct.
        return;
    }
    let ff = extract_front_factors(&data, m, k, &row_indices, &result);
    // ld = L11 * D11, handling 1x1 and symmetric 2x2 diagonal blocks.
    let mut ld = Mat::zeros(ne, ne);
    let mut col = 0;
    while col < ne {
        match ff.d11.pivot_type(col) {
            PivotType::OneByOne => {
                let d = ff.d11.diagonal_1x1(col);
                for i in 0..ne {
                    ld[(i, col)] = ff.l11[(i, col)] * d;
                }
                col += 1;
            }
            PivotType::TwoByTwo { .. } => {
                let block = ff.d11.diagonal_2x2(col);
                for i in 0..ne {
                    let l0 = ff.l11[(i, col)];
                    let l1 = ff.l11[(i, col + 1)];
                    ld[(i, col)] = l0 * block.a + l1 * block.b;
                    ld[(i, col + 1)] = l0 * block.b + l1 * block.c;
                }
                col += 2;
            }
            PivotType::Delayed => {
                col += 1;
            }
        }
    }
    // reconstructed = (L11*D11) * L11^T, symmetrized.
    let mut reconstructed = Mat::zeros(ne, ne);
    for i in 0..ne {
        for j in 0..=i {
            let mut val = 0.0;
            for p in 0..ne {
                val += ld[(i, p)] * ff.l11[(j, p)];
            }
            reconstructed[(i, j)] = val;
            reconstructed[(j, i)] = val;
        }
    }
    // The original matrix permuted by the pivot order of the factorization.
    let mut perm_original = Mat::zeros(ne, ne);
    for i in 0..ne {
        for j in 0..=i {
            let oi = result.perm[i];
            let oj = result.perm[j];
            let val = if oi >= oj {
                original[(oi, oj)]
            } else {
                original[(oj, oi)]
            };
            perm_original[(i, j)] = val;
            perm_original[(j, i)] = val;
        }
    }
    // Relative Frobenius error over the lower triangle.
    let mut diff_norm = 0.0f64;
    let mut orig_norm = 0.0f64;
    for i in 0..ne {
        for j in 0..=i {
            let d = reconstructed[(i, j)] - perm_original[(i, j)];
            diff_norm += d * d;
            orig_norm += perm_original[(i, j)] * perm_original[(i, j)];
        }
    }
    let rel_err = diff_norm.sqrt() / orig_norm.sqrt().max(1e-15);
    assert!(
        rel_err < 1e-10,
        "L11*D11*L11^T reconstruction error: {:.2e} (ne={}, delays={})",
        rel_err,
        ne,
        k - ne
    );
}
// Full-front check: [L11; L21] * D11 * [L11; L21]^T must reproduce the
// original matrix on the eliminated columns, with rows addressed through the
// factor's global index lists.
#[test]
fn test_extract_front_factors_solve_roundtrip_with_delays() {
    let m = 12;
    let k = 8;
    let lower = make_delay_matrix(m, k, &[0, 3, 5]);
    let row_indices: Vec<usize> = (0..m).collect();
    // Symmetric copy of the input for the comparison below.
    let mut a_full = Mat::zeros(m, m);
    for i in 0..m {
        for j in 0..=i {
            a_full[(i, j)] = lower[(i, j)];
            a_full[(j, i)] = lower[(i, j)];
        }
    }
    let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
    let options = AptpOptions::default();
    let result =
        aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
    let ne = result.num_eliminated;
    let num_delayed = k - ne;
    if num_delayed == 0 || ne == 0 {
        // The interesting case needs both eliminated and delayed pivots.
        return;
    }
    let ff = extract_front_factors(&data, m, k, &row_indices, &result);
    let r = ff.l21.nrows();
    let full_l_rows = ne + r;
    assert_eq!(full_l_rows, m, "L should cover all rows");
    // Stack L11 on top of L21 into the full m x ne factor.
    let mut ld = Mat::zeros(full_l_rows, ne);
    let mut full_l = Mat::zeros(full_l_rows, ne);
    for i in 0..ne {
        for j in 0..ne {
            full_l[(i, j)] = ff.l11[(i, j)];
        }
    }
    for i in 0..r {
        for j in 0..ne {
            full_l[(ne + i, j)] = ff.l21[(i, j)];
        }
    }
    // ld = full_L * D11 (1x1 and symmetric 2x2 blocks).
    let mut col = 0;
    while col < ne {
        match ff.d11.pivot_type(col) {
            PivotType::OneByOne => {
                let d = ff.d11.diagonal_1x1(col);
                for i in 0..full_l_rows {
                    ld[(i, col)] = full_l[(i, col)] * d;
                }
                col += 1;
            }
            PivotType::TwoByTwo { .. } => {
                let block = ff.d11.diagonal_2x2(col);
                for i in 0..full_l_rows {
                    let l0 = full_l[(i, col)];
                    let l1 = full_l[(i, col + 1)];
                    ld[(i, col)] = l0 * block.a + l1 * block.b;
                    ld[(i, col + 1)] = l0 * block.b + l1 * block.c;
                }
                col += 2;
            }
            PivotType::Delayed => {
                col += 1;
            }
        }
    }
    let mut reconstructed = Mat::zeros(full_l_rows, full_l_rows);
    for i in 0..full_l_rows {
        for j in 0..=i {
            let mut val = 0.0;
            for p in 0..ne {
                val += ld[(i, p)] * full_l[(j, p)];
            }
            reconstructed[(i, j)] = val;
            reconstructed[(j, i)] = val;
        }
    }
    // Global indices: eliminated columns first, then the remaining rows.
    let mut perm_rows = Vec::with_capacity(full_l_rows);
    perm_rows.extend_from_slice(ff.col_indices());
    perm_rows.extend_from_slice(ff.row_indices());
    // Compare only the eliminated block; the trailing block still carries
    // the un-eliminated Schur complement.
    let mut diff_norm = 0.0f64;
    let mut orig_norm = 0.0f64;
    for i in 0..ne {
        for j in 0..=i {
            let gi = perm_rows[i];
            let gj = perm_rows[j];
            let orig_val = if gi >= gj {
                a_full[(gi, gj)]
            } else {
                a_full[(gj, gi)]
            };
            let d = reconstructed[(i, j)] - orig_val;
            diff_norm += d * d;
            orig_norm += orig_val * orig_val;
        }
    }
    let rel_err = diff_norm.sqrt() / orig_norm.sqrt().max(1e-15);
    assert!(
        rel_err < 1e-10,
        "Full L*D*L^T reconstruction error: {:.2e} (ne={}, delays={}, L21_rows={})",
        rel_err,
        ne,
        num_delayed,
        r
    );
}
// Scatter with non-contiguous owned column ranges (0..2 and 4..6) into a
// front whose local order interleaves them; checks placement and symmetric
// dedup of explicitly-stored upper-triangle entries.
#[test]
fn test_scatter_original_entries_multi_non_contiguous() {
    use faer::sparse::{SparseColMat, Triplet};
    // Symmetric 6x6 matrix stored with both triangles present.
    let triplets = vec![
        Triplet::new(0, 0, 10.0),
        Triplet::new(1, 1, 11.0),
        Triplet::new(2, 2, 12.0),
        Triplet::new(3, 3, 13.0),
        Triplet::new(4, 4, 14.0),
        Triplet::new(5, 5, 15.0),
        Triplet::new(1, 0, 1.0),
        Triplet::new(0, 1, 1.0),
        Triplet::new(2, 0, 2.0),
        Triplet::new(0, 2, 2.0),
        Triplet::new(3, 1, 3.0),
        Triplet::new(1, 3, 3.0),
        Triplet::new(4, 0, 4.0),
        Triplet::new(0, 4, 4.0),
        Triplet::new(5, 1, 5.0),
        Triplet::new(1, 5, 5.0),
        Triplet::new(5, 2, 6.0),
        Triplet::new(2, 5, 6.0),
        Triplet::new(5, 4, 7.0),
        Triplet::new(4, 5, 7.0),
    ];
    let matrix = SparseColMat::try_new_from_triplets(6, 6, &triplets).expect("valid CSC");
    // Identity permutation keeps global == permuted indices.
    let perm_fwd: Vec<usize> = (0..6).collect();
    let perm_inv: Vec<usize> = (0..6).collect();
    // Front rows: owned columns {0,1,4,5} first, then pattern rows {2,3}.
    let frontal_rows = vec![0, 1, 4, 5, 2, 3];
    let total_owned = 4;
    let m = frontal_rows.len();
    let k = total_owned;
    let mut global_to_local = vec![NOT_IN_FRONT; 6];
    for (local, &global) in frontal_rows.iter().enumerate() {
        global_to_local[global] = local;
    }
    let mut frontal_data = Mat::zeros(m, m);
    let owned_ranges = vec![0..2, 4..6];
    {
        let mut frontal = FrontalMatrix {
            data: frontal_data.as_mut(),
            row_indices: &frontal_rows,
            num_fully_summed: k,
        };
        scatter_original_entries_multi(
            &mut frontal,
            &matrix,
            &perm_fwd,
            &perm_inv,
            &global_to_local,
            &owned_ranges,
            None,
        );
    }
    let f = &frontal_data;
    assert_eq!(f[(0, 0)], 10.0, "diag global 0");
    assert_eq!(f[(1, 1)], 11.0, "diag global 1");
    assert_eq!(f[(2, 2)], 14.0, "diag global 4");
    assert_eq!(f[(3, 3)], 15.0, "diag global 5");
    assert_eq!(f[(1, 0)], 1.0, "global(1,0) → local(1,0)");
    assert_eq!(f[(2, 0)], 4.0, "global(4,0) → local(2,0)");
    assert_eq!(f[(3, 1)], 5.0, "global(5,1) → local(3,1)");
    assert_eq!(f[(3, 2)], 7.0, "global(5,4) → local(3,2)");
    assert_eq!(f[(4, 0)], 2.0, "global(2,0) → local(4,0)");
    assert_eq!(f[(4, 3)], 6.0, "global(2,5) → local(4,3)");
    assert_eq!(f[(5, 1)], 3.0, "global(3,1) → local(5,1)");
    // Each symmetric pair must be added exactly once, not twice.
    assert_eq!(
        f[(1, 0)],
        1.0,
        "upper-triangle dedup: (1,0) should be 1.0, not 2.0"
    );
    assert_eq!(
        f[(2, 0)],
        4.0,
        "cross-range upper-triangle dedup: (2,0) should be 4.0, not 8.0"
    );
    assert_eq!(f[(4, 1)], 0.0, "no global(2,1) entry");
    assert_eq!(f[(4, 2)], 0.0, "no global(2,4) entry");
    assert_eq!(f[(5, 0)], 0.0, "no global(3,0) entry");
    assert_eq!(f[(5, 2)], 0.0, "no global(3,4) entry");
    assert_eq!(f[(5, 3)], 0.0, "no global(3,5) entry");
}
// Hand-constructed 3-row child scattered through an explicit row map; checks
// every touched and untouched parent entry.
#[test]
fn test_extend_add_mapped_hand_constructed() {
    let mut parent_data = Mat::<f64>::zeros(5, 5);
    let parent_row_indices = vec![10, 20, 30, 40, 50];
    let mut parent = FrontalMatrix {
        data: parent_data.as_mut(),
        row_indices: &parent_row_indices,
        num_fully_summed: 3,
    };
    // Lower-triangular child contribution block.
    let mut child_data = Mat::<f64>::zeros(3, 3);
    child_data[(0, 0)] = 1.0;
    child_data[(1, 0)] = 2.0;
    child_data[(1, 1)] = 3.0;
    child_data[(2, 0)] = 4.0;
    child_data[(2, 1)] = 5.0;
    child_data[(2, 2)] = 6.0;
    let child = ContributionBlock {
        data: child_data,
        row_indices: vec![30, 10, 40],
        num_delayed: 0,
    };
    // Child rows map to parent locals 2, 0, 3 (non-monotonic on purpose).
    let ea_row_map: Vec<u32> = vec![2, 0, 3];
    let recycled = extend_add_mapped(&mut parent, child, &ea_row_map);
    // The child buffer is handed back for reuse.
    assert_eq!(recycled.nrows(), 3);
    assert_eq!(recycled.ncols(), 3);
    let p = parent.data.as_ref();
    assert_eq!(p[(0, 0)], 3.0, "child(1,1)=3.0 → parent(0,0)");
    assert_eq!(p[(2, 0)], 2.0, "child(1,0)=2.0 swapped → parent(2,0)");
    assert_eq!(p[(2, 2)], 1.0, "child(0,0)=1.0 → parent(2,2)");
    assert_eq!(p[(3, 0)], 5.0, "child(2,1)=5.0 → parent(3,0)");
    assert_eq!(p[(3, 2)], 4.0, "child(2,0)=4.0 → parent(3,2)");
    assert_eq!(p[(3, 3)], 6.0, "child(2,2)=6.0 → parent(3,3)");
    assert_eq!(p[(1, 0)], 0.0, "row 1 untouched");
    assert_eq!(p[(1, 1)], 0.0, "row 1 untouched");
    assert_eq!(p[(4, 0)], 0.0, "row 4 untouched");
    assert_eq!(p[(4, 4)], 0.0, "row 4 untouched");
}
// 10x10 child through a strided row map, compared entry-for-entry against a
// naive reference scatter (covers the unrolled fast path plus remainder).
#[test]
fn test_extend_add_mapped_10x10_vs_reference() {
    let cb_size = 10;
    let parent_size = 15;
    let ea_row_map: Vec<u32> = vec![1, 3, 4, 5, 7, 8, 10, 11, 12, 14];
    // Distinct values encode (row, col) so misplacement is detectable.
    let mut child_data = Mat::<f64>::zeros(cb_size, cb_size);
    for j in 0..cb_size {
        for i in j..cb_size {
            child_data[(i, j)] = (i as f64 + 1.0) * 100.0 + (j as f64 + 1.0);
        }
    }
    // Reference result computed with the straightforward double loop.
    let mut ref_parent = Mat::<f64>::zeros(parent_size, parent_size);
    for j in 0..cb_size {
        let lj = ea_row_map[j] as usize;
        for i in j..cb_size {
            let li = ea_row_map[i] as usize;
            if li >= lj {
                ref_parent[(li, lj)] += child_data[(i, j)];
            } else {
                ref_parent[(lj, li)] += child_data[(i, j)];
            }
        }
    }
    let mut parent_data = Mat::<f64>::zeros(parent_size, parent_size);
    let parent_row_indices: Vec<usize> = (0..parent_size).collect();
    let mut parent = FrontalMatrix {
        data: parent_data.as_mut(),
        row_indices: &parent_row_indices,
        num_fully_summed: 5,
    };
    let child = ContributionBlock {
        data: child_data,
        row_indices: (0..cb_size).collect(),
        num_delayed: 0,
    };
    let _ = extend_add_mapped(&mut parent, child, &ea_row_map);
    for j in 0..parent_size {
        for i in 0..parent_size {
            assert_eq!(
                parent.data[(i, j)],
                ref_parent[(i, j)],
                "mismatch at ({}, {})",
                i,
                j
            );
        }
    }
}
// Row map that is not sorted (5, 2, 7, 1): mapped rows fall on both sides of
// the mapped column, exercising the li/lj swap in ea_scatter_one.
#[test]
fn test_extend_add_mapped_non_monotonic() {
    let cb_size = 4;
    let parent_size = 8;
    let ea_row_map: Vec<u32> = vec![5, 2, 7, 1];
    let mut child_data = Mat::<f64>::zeros(cb_size, cb_size);
    for j in 0..cb_size {
        for i in j..cb_size {
            child_data[(i, j)] = (i * 10 + j + 1) as f64;
        }
    }
    // Reference scatter with the naive loop.
    let mut ref_parent = Mat::<f64>::zeros(parent_size, parent_size);
    for j in 0..cb_size {
        let lj = ea_row_map[j] as usize;
        for i in j..cb_size {
            let li = ea_row_map[i] as usize;
            if li >= lj {
                ref_parent[(li, lj)] += child_data[(i, j)];
            } else {
                ref_parent[(lj, li)] += child_data[(i, j)];
            }
        }
    }
    let mut parent_data = Mat::<f64>::zeros(parent_size, parent_size);
    let parent_row_indices: Vec<usize> = (0..parent_size).collect();
    let mut parent = FrontalMatrix {
        data: parent_data.as_mut(),
        row_indices: &parent_row_indices,
        num_fully_summed: 4,
    };
    let child = ContributionBlock {
        data: child_data,
        row_indices: (0..cb_size).collect(),
        num_delayed: 0,
    };
    let _ = extend_add_mapped(&mut parent, child, &ea_row_map);
    for j in 0..parent_size {
        for i in 0..parent_size {
            assert_eq!(
                parent.data[(i, j)],
                ref_parent[(i, j)],
                "mismatch at ({}, {})",
                i,
                j
            );
        }
    }
}
// Two successive children hitting the same parent entries must sum their
// values rather than overwrite.
#[test]
fn test_extend_add_mapped_accumulates() {
    let mut parent_data = Mat::<f64>::zeros(4, 4);
    let parent_row_indices = vec![0, 1, 2, 3];
    let mut parent = FrontalMatrix {
        data: parent_data.as_mut(),
        row_indices: &parent_row_indices,
        num_fully_summed: 2,
    };
    let mut c1_data = Mat::<f64>::zeros(2, 2);
    c1_data[(0, 0)] = 10.0;
    c1_data[(1, 0)] = 20.0;
    c1_data[(1, 1)] = 30.0;
    let child1 = ContributionBlock {
        data: c1_data,
        row_indices: vec![1, 3],
        num_delayed: 0,
    };
    let map1: Vec<u32> = vec![1, 3];
    let _ = extend_add_mapped(&mut parent, child1, &map1);
    // Second child targets the same rows as the first.
    let mut c2_data = Mat::<f64>::zeros(2, 2);
    c2_data[(0, 0)] = 5.0;
    c2_data[(1, 0)] = 7.0;
    c2_data[(1, 1)] = 9.0;
    let child2 = ContributionBlock {
        data: c2_data,
        row_indices: vec![1, 3],
        num_delayed: 0,
    };
    let map2: Vec<u32> = vec![1, 3];
    let _ = extend_add_mapped(&mut parent, child2, &map2);
    let p = parent.data.as_ref();
    assert_eq!(p[(1, 1)], 15.0, "10 + 5 accumulated");
    assert_eq!(p[(3, 1)], 27.0, "20 + 7 accumulated");
    assert_eq!(p[(3, 3)], 39.0, "30 + 9 accumulated");
}
/// Builds a synthetic `SupernodeInfo` with `ncols` owned columns starting at
/// `col_begin` and a pattern of `pattern_len` rows offset by 1000 so pattern
/// indices never collide with column indices.
#[allow(clippy::single_range_in_vec_init)]
fn make_snode(
    col_begin: usize,
    ncols: usize,
    pattern_len: usize,
    parent: Option<usize>,
) -> SupernodeInfo {
    let col_end = col_begin + ncols;
    let pattern_start = 1000 + col_begin;
    SupernodeInfo {
        col_begin,
        col_end,
        pattern: (pattern_start..pattern_start + pattern_len).collect(),
        parent,
        owned_ranges: vec![col_begin..col_end],
        in_small_leaf: false,
    }
}
// A pure chain of small supernodes collapses into a single subtree rooted at
// the top of the chain.
#[test]
fn test_classify_all_small() {
    let mut supernodes = vec![
        make_snode(0, 2, 3, Some(1)),
        make_snode(2, 2, 3, Some(2)),
        make_snode(4, 2, 3, Some(3)),
        make_snode(6, 2, 3, Some(4)),
        make_snode(8, 2, 3, None),
    ];
    let children = build_children_map(&supernodes);
    let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
    for sn in &supernodes {
        assert!(
            sn.in_small_leaf,
            "snode {} should be in_small_leaf",
            sn.col_begin
        );
    }
    assert_eq!(subtrees.len(), 1);
    assert_eq!(subtrees[0].nodes.len(), 5);
    // Children-before-parent order within the subtree.
    assert_eq!(subtrees[0].nodes, vec![0, 1, 2, 3, 4]);
    assert_eq!(subtrees[0].root, 4);
    assert_eq!(subtrees[0].parent_of_root, None);
}
// Small leaves under a big root: each leaf alone is a 1-node subtree, which
// is excluded, so nothing gets grouped.
#[test]
fn test_classify_mixed_tree() {
    let mut supernodes = vec![
        make_snode(0, 2, 3, Some(2)),
        make_snode(2, 2, 3, Some(2)),
        make_snode(4, 100, 200, None),
    ];
    let children = build_children_map(&supernodes);
    let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
    assert!(!supernodes[2].in_small_leaf);
    assert!(!supernodes[0].in_small_leaf, "single node excluded");
    assert!(!supernodes[1].in_small_leaf, "single node excluded");
    assert_eq!(subtrees.len(), 0);
}
// Isolated roots never form subtrees (minimum subtree size is 2 nodes).
#[test]
fn test_classify_single_node_excluded() {
    let mut supernodes = vec![
        make_snode(0, 2, 3, None),
        make_snode(2, 2, 3, None),
        make_snode(4, 2, 3, None),
    ];
    let children = build_children_map(&supernodes);
    let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
    assert_eq!(subtrees.len(), 0);
    for sn in &supernodes {
        assert!(!sn.in_small_leaf);
    }
}
#[test]
fn test_classify_threshold_boundary() {
    // Boundary check on the front-size threshold (256): the comparison is
    // evidently strict, so a front of exactly 256 is rejected while 255
    // qualifies — TODO(review): confirm against classify_small_leaf_subtrees.

    // Case 1: root front hits exactly 256 -> whole tree rejected.
    let mut supernodes = vec![
        make_snode(0, 100, 155, Some(1)),
        make_snode(100, 128, 128, None),
    ];
    let children = build_children_map(&supernodes);
    let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
    assert!(
        !supernodes[1].in_small_leaf,
        "front_size=256 must NOT qualify"
    );
    assert!(!supernodes[0].in_small_leaf, "single node excluded");
    assert_eq!(subtrees.len(), 0);

    // Case 2: shrink the root by one column -> both nodes qualify and form
    // one subtree.
    let mut supernodes2 = vec![
        make_snode(0, 100, 155, Some(1)),
        make_snode(100, 127, 128, None),
    ];
    let children2 = build_children_map(&supernodes2);
    let subtrees2 = classify_small_leaf_subtrees(&mut supernodes2, &children2, 256);
    assert!(supernodes2[0].in_small_leaf);
    assert!(supernodes2[1].in_small_leaf);
    assert_eq!(subtrees2.len(), 1);
    assert_eq!(subtrees2[0].nodes, vec![0, 1]);
}
#[test]
fn test_classify_disabled() {
    // A threshold of 0 disables small-leaf classification entirely:
    // nothing is marked even though the tree would otherwise qualify.
    let mut supernodes = vec![make_snode(0, 2, 3, Some(1)), make_snode(2, 2, 3, None)];
    let children = build_children_map(&supernodes);
    let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 0);
    assert_eq!(subtrees.len(), 0);
    assert!(!supernodes[0].in_small_leaf);
    assert!(!supernodes[1].in_small_leaf);
}
#[test]
fn test_classify_multiple_subtrees() {
    // Two small subtrees ({0,1}->2 and {3}->4) hanging off one large root
    // (node 5). The root itself is too big to qualify; each small subtree
    // is reported separately with node 5 as parent_of_root.
    let mut supernodes = vec![
        make_snode(0, 2, 3, Some(2)),
        make_snode(2, 2, 3, Some(2)),
        make_snode(4, 2, 3, Some(5)),
        make_snode(6, 2, 3, Some(4)),
        make_snode(8, 2, 3, Some(5)),
        make_snode(10, 100, 200, None),
    ];
    let children = build_children_map(&supernodes);
    let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);

    assert_eq!(subtrees.len(), 2);

    let first = &subtrees[0];
    assert_eq!(first.root, 2);
    assert_eq!(first.nodes, vec![0, 1, 2]);
    assert_eq!(first.parent_of_root, Some(5));

    let second = &subtrees[1];
    assert_eq!(second.root, 4);
    assert_eq!(second.nodes, vec![3, 4]);
    assert_eq!(second.parent_of_root, Some(5));

    // The large root stays unmarked; all five small nodes are marked.
    assert!(!supernodes[5].in_small_leaf);
    for (s, sn) in supernodes.iter().enumerate().take(5) {
        assert!(sn.in_small_leaf, "snode {} should be in_small_leaf", s);
    }
}
}