// rivrs_sparse/symmetric/numeric.rs
1//! Multifrontal numeric factorization for sparse symmetric indefinite matrices.
2//!
3//! Implements the multifrontal method with A Posteriori Threshold Pivoting (APTP)
4//! for computing P^T A P = L D L^T factorizations of sparse symmetric indefinite
5//! matrices. The factorization traverses the assembly tree in postorder, assembling
6//! dense frontal matrices and factoring them using the [`aptp_factor_in_place`]
7//! kernel.
8//!
9//! # Algorithm Overview
10//!
11//! For each supernode in assembly-tree postorder:
12//! 1. **Assemble** a dense frontal matrix by scattering original sparse entries
13//! and extend-adding child contribution blocks
14//! 2. **Factor** the fully-summed portion using APTP (1x1/2x2 pivoting with delays)
15//! 3. **Extract** per-supernode factors (L11, D11, L21) and contribution block
16//! 4. **Propagate** contribution block (including delayed columns) to parent
17//!
18//! # Key Types
19//!
20//! - [`AptpNumeric`] — complete factorization result (public API)
21//! - [`FrontFactors`] — per-supernode factors (public, used by triangular solve)
22//! - [`FactorizationStats`] — aggregate statistics (public)
23//! - `SupernodeInfo` — unified supernode descriptor (internal)
24//! - `FrontalMatrix` — temporary dense assembly matrix (internal)
25//! - `ContributionBlock` — Schur complement passed to parent (internal)
26//!
27//! # References
28//!
29//! - Duff & Reid (1983), "The multifrontal solution of indefinite sparse symmetric
30//! linear equations" — foundational multifrontal method
31//! - Liu (1992), "The Multifrontal Method for Sparse Matrix Solution: Theory and
32//! Practice" — frontal matrices, elimination trees, extend-add
33//! - Hogg, Duff & Lopez (2020), "A New Sparse LDL^T Solver Using A Posteriori
34//! Threshold Pivoting" — APTP algorithm integration with multifrontal framework
35
36use faer::sparse::SparseColMat;
37use faer::sparse::linalg::cholesky::SymbolicCholeskyRaw;
38use faer::{Mat, MatMut, Par};
39
40use super::diagonal::{MixedDiagonal, PivotEntry};
41use super::factor::{
42 AptpFactorResult, AptpOptions, aptp_factor_in_place, compute_contribution_gemm,
43 compute_contribution_gemm_rect, extract_contribution_rect, extract_front_factors_rect,
44 tpp_factor_rect,
45};
46use super::pivot::{Block2x2, PivotType};
47use super::symbolic::AptpSymbolic;
48use crate::error::SparseError;
49
50#[cfg(feature = "diagnostic")]
51use crate::profiling::{FinishedSession, ProfileSession};
52
/// Sentinel value indicating a global column is not part of the current front.
///
/// Stored in `global_to_local` mappings; `usize::MAX` cannot collide with a
/// valid local row index, so no separate "present" flag is needed.
const NOT_IN_FRONT: usize = usize::MAX;
55
56// ---------------------------------------------------------------------------
57// Workspace types
58// ---------------------------------------------------------------------------
59
/// Pre-allocated reusable buffers for the per-supernode factorization loop.
///
/// Eliminates per-supernode heap allocations for the frontal matrix, row indices,
/// delayed columns buffer, and global-to-local mapping. Sized to the maximum
/// front dimension from symbolic analysis and reused across supernodes.
///
/// # Invariant
///
/// Never shared between concurrent supernodes. In sequential mode, passed as
/// `&mut`. In parallel mode, each rayon worker has its own instance via
/// `Cell`-based move semantics.
///
/// # References
///
/// - Duff, Hogg & Lopez (2020), Section 5: workspace reuse strategy
/// - Liu (1992), Section 4: multifrontal workspace management
pub(crate) struct FactorizationWorkspace {
    /// Dense buffer, max_front × max_front. Reused as each supernode's frontal matrix.
    frontal_data: Mat<f64>,
    /// Buffer for frontal matrix row indices, capacity max_front.
    frontal_row_indices: Vec<usize>,
    /// Buffer for collecting delayed columns from children.
    delayed_cols_buf: Vec<usize>,
    /// Global-to-local row index mapping, length n (matrix dimension).
    /// Entries are `NOT_IN_FRONT` for columns outside the current front.
    global_to_local: Vec<usize>,
    /// Pre-allocated contribution buffer, max_front × max_front. Receives the
    /// deferred contribution GEMM output. Reused across supernodes via swap protocol:
    /// final GEMM writes here → buffer moved into ContributionBlock → parent's
    /// extend-add recycles buffer back.
    contrib_buffer: Mat<f64>,
    /// Pre-allocated L·D product workspace for `compute_contribution_gemm`.
    /// Sized nfs × ne (grows on demand). Eliminates per-supernode allocation
    /// in the hot loop.
    ld_workspace: Mat<f64>,
}
95
96impl Default for FactorizationWorkspace {
97 fn default() -> Self {
98 Self::empty()
99 }
100}
101
102impl FactorizationWorkspace {
103 /// Create a new workspace sized for the given maximum front dimension.
104 ///
105 /// # Arguments
106 ///
107 /// - `max_front`: Maximum frontal matrix dimension across all supernodes
108 /// - `n`: Matrix dimension (for global-to-local mapping)
109 pub(crate) fn new(max_front: usize, n: usize) -> Self {
110 if max_front == 0 {
111 return Self {
112 frontal_data: Mat::zeros(0, 0),
113 frontal_row_indices: Vec::new(),
114 delayed_cols_buf: Vec::new(),
115 global_to_local: vec![NOT_IN_FRONT; n],
116 contrib_buffer: Mat::zeros(0, 0),
117 ld_workspace: Mat::new(),
118 };
119 }
120 Self {
121 frontal_data: Mat::zeros(max_front, max_front),
122 frontal_row_indices: Vec::with_capacity(max_front),
123 delayed_cols_buf: Vec::with_capacity(max_front),
124 global_to_local: vec![NOT_IN_FRONT; n],
125 // Lazily allocated on first use in factor_single_supernode.
126 // Recycled via extend_add buffer return, so typically allocates once.
127 contrib_buffer: Mat::new(),
128 ld_workspace: Mat::new(),
129 }
130 }
131
132 /// Empty workspace for `Cell` default in thread-local storage.
133 const fn empty() -> Self {
134 Self {
135 frontal_data: Mat::new(),
136 frontal_row_indices: Vec::new(),
137 delayed_cols_buf: Vec::new(),
138 global_to_local: Vec::new(),
139 contrib_buffer: Mat::new(),
140 ld_workspace: Mat::new(),
141 }
142 }
143
144 /// Zero the lower triangle of the m × m subregion of `frontal_data`.
145 ///
146 /// Called after the front size `m` is known and row indices are populated.
147 /// Does NOT clear `frontal_row_indices` or `delayed_cols_buf` — those
148 /// are managed by the caller before determining `m`.
149 ///
150 /// # Panics
151 ///
152 /// Panics if `m > self.frontal_data.nrows()`.
153 fn zero_frontal(&mut self, m: usize) {
154 assert!(
155 m <= self.frontal_data.nrows(),
156 "front size {} exceeds workspace capacity {}",
157 m,
158 self.frontal_data.nrows()
159 );
160 // Zero the lower triangle of the m × m subregion via per-column fill.
161 // col_as_slice_mut gives a contiguous &mut [f64] for each column of the
162 // owned Mat, so fill(0.0) compiles to memset — much faster than indexed writes.
163 for j in 0..m {
164 self.frontal_data.col_as_slice_mut(j)[j..m].fill(0.0);
165 }
166 }
167
168 /// Ensure `global_to_local` mapping is at least `n` entries.
169 ///
170 /// Used by the parallel path where frontal_data grows on demand in
171 /// `factor_single_supernode` (avoiding eager max_front² allocation
172 /// per thread that causes OOM on large-front matrices like H2O).
173 fn ensure_g2l(&mut self, n: usize) {
174 if self.global_to_local.len() < n {
175 self.global_to_local.resize(n, NOT_IN_FRONT);
176 }
177 }
178}
179
/// Front dimension below which intra-node BLAS uses `Par::Seq` regardless of
/// the user-supplied parallelism setting. Fronts smaller than this threshold
/// do not benefit from parallel BLAS (thread-dispatch overhead exceeds the
/// useful computation on such small dense blocks).
pub(crate) const INTRA_NODE_THRESHOLD: usize = 256;
184
185/// Compute an upper bound on the maximum front size across all supernodes.
186///
187/// The front size for a supernode is `owned_cols + delayed_cols + pattern.len()`.
188/// Since delayed columns depend on the actual factorization, we estimate using
189/// `owned_cols + pattern.len()` (front size without delays) and add a safety
190/// factor. If delayed columns cause a front to exceed this estimate, the
191/// workspace will be resized at that point.
192fn estimate_max_front_size(supernodes: &[SupernodeInfo]) -> usize {
193 supernodes
194 .iter()
195 .map(|sn| {
196 let owned: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
197 owned + sn.pattern.len()
198 })
199 .max()
200 .unwrap_or(0)
201}
202
203// ---------------------------------------------------------------------------
204// Assembly maps (precomputed scatter indices)
205// ---------------------------------------------------------------------------
206
/// Precomputed index mappings for assembly operations.
///
/// Eliminates per-entry index arithmetic in `scatter_original_entries_multi`
/// and `extend_add` by storing source→destination index pairs computed from
/// the symbolic structure. Computed once after amalgamation at the start of
/// factorization and read-only during the assembly loop.
///
/// Uses `u32` indices to halve memory (frontal matrices are bounded by
/// max_front_size, well within u32 range).
///
/// # Layout
///
/// Both `amap_entries` and `ea_map` use CSC-style compressed storage with
/// per-supernode/per-child offset arrays for O(1) slice access.
///
/// # References
///
/// - SPRAL `assemble.hxx:51-80` (`add_a_block`): amap-based assembly pattern
pub(crate) struct AssemblyMaps {
    /// Flattened tuples for original matrix scatter. Each entry is 4 u32 values:
    /// `[src_csc_index, dest_frontal_linear_index, scale_row, scale_col]` where
    /// `src_csc_index` is the position in the CSC values array, `dest_frontal_linear_index`
    /// is `col * m + row` in column-major layout, and `scale_row`/`scale_col` are
    /// the permuted indices for MC64 scaling lookup (`scaling[scale_row] * scaling[scale_col]`).
    /// Length: 4 * total_entries across all supernodes.
    pub amap_entries: Vec<u32>,
    /// Per-supernode start offset into `amap_entries` (in entry units; each entry
    /// is 4 u32 values). `amap_entries[amap_offsets[s]*4 .. amap_offsets[s+1]*4]`
    /// gives the flattened entries for supernode s.
    /// Length: num_supernodes + 1.
    pub amap_offsets: Vec<usize>,

    /// Flattened child→parent row index mappings for extend-add.
    /// For each child's contribution row, stores the local row index in the
    /// parent's frontal matrix (assuming zero delays). Delayed rows are not
    /// covered and fall back to the runtime g2l mapping.
    pub ea_map: Vec<u32>,
    /// Per-child start offset into `ea_map`. Indexed by the flattened child
    /// enumeration (all children of all supernodes in tree order), so child c's
    /// slice is `ea_map[ea_offsets[c]..ea_offsets[c + 1]]`.
    /// Length: total_children + 1.
    pub ea_offsets: Vec<usize>,
    /// Per-supernode start offset into the children enumeration.
    /// `ea_snode_child_begin[s]..ea_snode_child_begin[s+1]` gives the range
    /// in `ea_offsets` for supernode s's children.
    /// Length: num_supernodes + 1.
    pub ea_snode_child_begin: Vec<usize>,
}
253
/// Build precomputed assembly maps from post-amalgamation supernode structure.
///
/// Walks each supernode's delay-free front structure once, recording (a) where
/// each original CSC entry lands in the frontal matrix (`amap`), and (b) where
/// each child contribution row lands in its parent's front (`ea_map`). Both
/// maps assume zero delayed columns; delayed rows fall back to the runtime
/// global-to-local mapping during factorization.
///
/// # Arguments
///
/// - `supernodes`: Post-amalgamation supernode descriptors.
/// - `children`: Children map (children[s] = list of child supernode indices).
/// - `matrix`: Original sparse matrix (for CSC structure).
/// - `perm_fwd`: Forward permutation (perm_fwd[new] = old).
/// - `perm_inv`: Inverse permutation (perm_inv[old] = new).
/// - `n`: Matrix dimension.
fn build_assembly_maps(
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    n: usize,
) -> AssemblyMaps {
    let n_supernodes = supernodes.len();
    let symbolic = matrix.symbolic();
    let col_ptrs = symbolic.col_ptr();
    let row_indices_csc = symbolic.row_idx();

    // ---- Build amap (original entry scatter maps) ----
    // Each entry is 4 u32: [src_csc_index, dest_frontal_linear, scale_row, scale_col]
    // where scale_row = perm_inv[orig_row], scale_col = perm_inv[orig_col]
    // (precomputed for efficient MC64 scaling at assembly time).

    let mut amap_entries = Vec::new();
    let mut amap_offsets = vec![0usize; n_supernodes + 1];

    // Temporary global-to-local mapping, reused (and reset) per supernode so
    // we only pay the O(n) initialization once.
    let mut g2l = vec![NOT_IN_FRONT; n];

    for (s, sn) in supernodes.iter().enumerate() {
        // Compute front structure for this supernode (without delays):
        // owned (fully-summed) columns first, then the off-diagonal pattern.
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let mut frontal_rows: Vec<usize> = Vec::with_capacity(sn_ncols + sn.pattern.len());
        for range in &sn.owned_ranges {
            frontal_rows.extend(range.clone());
        }
        frontal_rows.extend_from_slice(&sn.pattern);
        let m = frontal_rows.len();

        // Build global-to-local
        for (i, &global) in frontal_rows.iter().enumerate() {
            g2l[global] = i;
        }

        let total_owned = sn_ncols;

        // Iterate owned columns and build amap entries
        for range in &sn.owned_ranges {
            for pj in range.clone() {
                let orig_col = perm_fwd[pj];
                let local_col = g2l[pj];

                let start = col_ptrs[orig_col];
                let end = col_ptrs[orig_col + 1];
                // take(end).skip(start) restricts the iteration to CSC
                // positions start..end while keeping `idx` as the global
                // index into the CSC values array.
                for (idx, &orig_row) in row_indices_csc.iter().enumerate().take(end).skip(start) {
                    // Upper-triangle dedup: skip if both are owned supernode cols
                    // (the symmetric partner entry scatters the same value).
                    if orig_row < orig_col {
                        let perm_row = perm_inv[orig_row];
                        let local_peer = g2l[perm_row];
                        if local_peer != NOT_IN_FRONT && local_peer < total_owned {
                            continue;
                        }
                    }
                    let global_row = perm_inv[orig_row];
                    let local_row = g2l[global_row];
                    if local_row == NOT_IN_FRONT {
                        continue;
                    }

                    // Determine destination in lower triangle (mirror entries
                    // landing above the diagonal into their transposed slot).
                    let (dest_row, dest_col) = if local_row >= local_col {
                        (local_row, local_col)
                    } else {
                        (local_col, local_row)
                    };
                    let dest_linear = dest_col * m + dest_row;

                    amap_entries.push(idx as u32);
                    amap_entries.push(dest_linear as u32);
                    amap_entries.push(global_row as u32); // perm_inv[orig_row]
                    amap_entries.push(pj as u32); // perm_inv[orig_col] = pj
                }
            }
        }

        // Reset g2l (only the touched entries — cheaper than refilling all n)
        for &global in &frontal_rows {
            g2l[global] = NOT_IN_FRONT;
        }

        // Offsets are stored in entry units; each entry is 4 u32 values.
        let entry_end = amap_entries.len() / 4;
        amap_offsets[s + 1] = entry_end;
    }

    // ---- Build extend-add maps ----

    let mut ea_map = Vec::new();
    let mut ea_offsets = vec![0usize];
    let mut ea_snode_child_begin = vec![0usize; n_supernodes + 1];

    for (s, sn) in supernodes.iter().enumerate() {
        // Build parent's frontal row structure (without delays)
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let mut parent_rows: Vec<usize> = Vec::with_capacity(sn_ncols + sn.pattern.len());
        for range in &sn.owned_ranges {
            parent_rows.extend(range.clone());
        }
        parent_rows.extend_from_slice(&sn.pattern);

        // Build g2l for parent
        for (i, &global) in parent_rows.iter().enumerate() {
            g2l[global] = i;
        }

        for &c in &children[s] {
            let child_sn = &supernodes[c];

            // Child's contribution rows = child's pattern (off-diagonal rows).
            // Without delays, the contribution block rows are exactly the pattern.
            // With delays, additional delayed rows appear — the precomputed map
            // won't cover those (fallback to g2l at factorization time).
            for &child_row in &child_sn.pattern {
                let parent_local = g2l[child_row];
                debug_assert!(
                    parent_local != NOT_IN_FRONT,
                    "child pattern row {} not in parent supernode {}",
                    child_row,
                    s
                );
                ea_map.push(parent_local as u32);
            }
            ea_offsets.push(ea_map.len());
        }

        // Reset g2l
        for &global in &parent_rows {
            g2l[global] = NOT_IN_FRONT;
        }

        // ea_offsets holds one leading 0 plus one entry per child seen so far,
        // so len() - 1 is the running child count = end of supernode s's range.
        ea_snode_child_begin[s + 1] = ea_offsets.len() - 1;
    }

    AssemblyMaps {
        amap_entries,
        amap_offsets,
        ea_map,
        ea_offsets,
        ea_snode_child_begin,
    }
}
409
410// ---------------------------------------------------------------------------
411// Internal types
412// ---------------------------------------------------------------------------
413
/// Precomputed supernode descriptor unifying faer's supernodal and simplicial
/// decompositions.
///
/// Built once from [`AptpSymbolic`] before the factorization loop via
/// [`build_supernode_info`]. For supernodal decompositions, maps directly to
/// faer's supernode structure. For simplicial decompositions, each column
/// becomes a trivial 1-column supernode.
///
/// # Invariants
///
/// - `col_end > col_begin`
/// - `parent.map_or(true, |p| p > s)` for supernode index `s` (postorder:
///   parents always come after their children)
///
/// # References
///
/// - Liu (1992), Section 3: supernode definitions and assembly trees
#[derive(Clone)]
pub(crate) struct SupernodeInfo {
    /// First column index of this supernode (inclusive).
    pub col_begin: usize,
    /// Past-the-end column index (exclusive).
    pub col_end: usize,
    /// Off-diagonal row indices (global permuted column indices of non-fully-summed rows).
    pub pattern: Vec<usize>,
    /// Parent supernode index in assembly tree, or `None` for root.
    pub parent: Option<usize>,
    /// Column ranges actually owned by this supernode. After amalgamation of
    /// non-adjacent supernodes, `col_begin..col_end` may span columns belonging
    /// to other supernodes. `owned_ranges` tracks the true owned columns for
    /// scatter. Default: `[col_begin..col_end]`.
    pub owned_ranges: Vec<std::ops::Range<usize>>,
    /// True if this supernode belongs to a classified small-leaf subtree.
    /// Set by `classify_small_leaf_subtrees()` after amalgamation.
    pub in_small_leaf: bool,
}
449
/// A classified small-leaf subtree eligible for the fast-path factorization.
///
/// Produced by [`classify_small_leaf_subtrees`] after amalgamation and consumed
/// by `factor_small_leaf_subtree` during the pre-pass before the main level-set
/// loop. All supernodes in the subtree have front_size < `small_leaf_threshold`,
/// enabling a streamlined code path with a single small reusable workspace.
///
/// # Invariants
///
/// - `nodes.len() >= 2` (single-node "subtrees" are excluded)
/// - `*nodes.last().unwrap() == root`
/// - All nodes have `in_small_leaf = true`
/// - `max_front_size < small_leaf_threshold`
/// - Nodes are in postorder: children appear before parents
///
/// # References
///
/// - SPRAL `SymbolicSubtree.hxx:57-84` (BSD-3): subtree classification
/// - SPRAL `SmallLeafNumericSubtree.hxx:187-446` (BSD-3): fast-path factorization
pub(crate) struct SmallLeafSubtree {
    /// Supernode ID of the subtree root (topmost node in the subtree).
    /// Used by tests for invariant verification; always equals `*nodes.last()`.
    #[allow(dead_code)]
    pub root: usize,
    /// Supernode IDs in postorder (leaves first, root last).
    pub nodes: Vec<usize>,
    /// Maximum front size across all nodes in the subtree (sizes the
    /// subtree-local reusable workspace).
    pub max_front_size: usize,
    /// Parent supernode outside the subtree that receives the root's contribution,
    /// or `None` if the subtree root is also a tree root.
    pub parent_of_root: Option<usize>,
}
482
483// No methods needed — fields accessed directly within this module.
484
/// Temporary dense matrix for assembling and factoring one supernode.
///
/// References workspace buffers during the factorization loop, or owns its
/// data in diagnostic contexts (`export_assembled_frontal`).
///
/// The matrix is partitioned into:
/// - `F11 = data[0..k, 0..k]` — fully-summed block (factored by APTP)
/// - `F21 = data[k..m, 0..k]` — subdiagonal block
/// - `F22 = data[k..m, k..m]` — contribution block (Schur complement)
///
/// where `k = num_fully_summed` and `m = data.nrows()`.
pub(crate) struct FrontalMatrix<'a> {
    /// Dense m × m storage (lower triangle used). Borrows from workspace;
    /// the `'a` lifetime ties this view to the `FactorizationWorkspace` buffers.
    pub data: MatMut<'a, f64>,
    /// Global permuted column indices for each local row (length m).
    /// Used by callers that need index mapping (e.g., scatter maps).
    #[allow(dead_code)]
    pub row_indices: &'a [usize],
    /// Number of fully-summed rows/columns (supernode cols + delayed from children).
    pub num_fully_summed: usize,
}
506
/// Schur complement and delayed columns from a factored supernode.
///
/// Created by [`extract_contribution`] and consumed by [`extend_add`] into
/// the parent supernode's frontal matrix.
///
/// # Structure
///
/// - Positions `0..num_delayed`: delayed columns (become fully-summed at parent)
/// - Positions `num_delayed..size`: non-fully-summed rows with Schur complement
pub(crate) struct ContributionBlock {
    /// Dense (m - ne) × (m - ne) trailing submatrix from factored frontal matrix.
    pub data: Mat<f64>,
    /// Global permuted column indices for rows/columns of the contribution
    /// (one per row/column of `data`).
    pub row_indices: Vec<usize>,
    /// Number of delayed columns at the start of this block.
    pub num_delayed: usize,
}
524
525// ---------------------------------------------------------------------------
526// Public types
527// ---------------------------------------------------------------------------
528
/// Per-supernode factorization result.
///
/// Stores the L11, D11, and L21 blocks extracted from the factored frontal
/// matrix, along with permutation and index information needed by the
/// triangular solve.
///
/// # Invariants
///
/// - `l11.nrows() == l11.ncols() == num_eliminated`
/// - `d11.dimension() == num_eliminated`
/// - `l21.ncols() == num_eliminated`
/// - `l21.nrows() == row_indices.len()`
/// - `col_indices.len() == num_eliminated`
#[derive(Debug)]
pub struct FrontFactors {
    /// Unit lower triangular factor (ne × ne).
    pub(crate) l11: Mat<f64>,
    /// Block diagonal with 1x1 and 2x2 pivots (ne entries).
    pub(crate) d11: MixedDiagonal,
    /// Subdiagonal factor block (r × ne).
    pub(crate) l21: Mat<f64>,
    /// APTP local pivot permutation (length k). Maps factored position to
    /// original front-local column.
    pub(crate) local_perm: Vec<usize>,
    /// Number of columns successfully eliminated (ne ≤ k); the remainder
    /// were delayed to the parent supernode.
    pub(crate) num_eliminated: usize,
    /// Global permuted column indices for the eliminated columns (length ne).
    pub(crate) col_indices: Vec<usize>,
    /// Global permuted row indices for L21 rows (length r).
    pub(crate) row_indices: Vec<usize>,
}
560
561impl FrontFactors {
562 /// Unit lower triangular factor L11 (ne × ne).
563 pub fn l11(&self) -> &Mat<f64> {
564 &self.l11
565 }
566
567 /// Block diagonal D11 with mixed 1x1/2x2 pivots.
568 pub fn d11(&self) -> &MixedDiagonal {
569 &self.d11
570 }
571
572 /// Subdiagonal factor block L21 (r × ne).
573 pub fn l21(&self) -> &Mat<f64> {
574 &self.l21
575 }
576
577 /// APTP local pivot permutation within this front.
578 pub fn local_perm(&self) -> &[usize] {
579 &self.local_perm
580 }
581
582 /// Number of successfully eliminated columns.
583 pub fn num_eliminated(&self) -> usize {
584 self.num_eliminated
585 }
586
587 /// Global permuted column indices for eliminated columns.
588 pub fn col_indices(&self) -> &[usize] {
589 &self.col_indices
590 }
591
592 /// Global permuted row indices for L21 rows.
593 pub fn row_indices(&self) -> &[usize] {
594 &self.row_indices
595 }
596}
597
/// Per-supernode diagnostic statistics.
///
/// Collected during [`AptpNumeric::factor`] for each supernode processed.
/// Provides visibility into per-front behavior for analysis and comparison
/// with reference solvers (e.g., SPRAL). Timing fields exist only when the
/// `diagnostic` feature is enabled.
#[derive(Debug, Clone)]
pub struct PerSupernodeStats {
    /// Supernode index in postorder.
    pub snode_id: usize,
    /// Frontal matrix dimension (m).
    pub front_size: usize,
    /// Number of fully-summed rows/columns (k = supernode cols + delayed from children).
    pub num_fully_summed: usize,
    /// Number of columns successfully eliminated (ne <= k).
    pub num_eliminated: usize,
    /// Number of delayed columns (k - ne).
    pub num_delayed: usize,
    /// Number of 1x1 pivots accepted at this supernode.
    pub num_1x1: usize,
    /// Number of 2x2 pivot pairs accepted at this supernode.
    pub num_2x2: usize,
    /// Maximum absolute L entry at this supernode (stability metric).
    pub max_l_entry: f64,
    /// Wall-clock time for scatter + extend-add assembly.
    #[cfg(feature = "diagnostic")]
    pub assembly_time: std::time::Duration,
    /// Wall-clock time for dense APTP kernel.
    #[cfg(feature = "diagnostic")]
    pub kernel_time: std::time::Duration,
    /// Wall-clock time for front factor extraction.
    #[cfg(feature = "diagnostic")]
    pub extraction_time: std::time::Duration,
    // -- Sub-phase timing (finer-grained diagnostic breakdown) --
    /// Wall-clock time for zeroing the frontal matrix (memset-equivalent).
    #[cfg(feature = "diagnostic")]
    pub zero_time: std::time::Duration,
    /// Wall-clock time for building the global-to-local mapping.
    #[cfg(feature = "diagnostic")]
    pub g2l_time: std::time::Duration,
    /// Wall-clock time for scattering original CSC entries into the frontal matrix.
    #[cfg(feature = "diagnostic")]
    pub scatter_time: std::time::Duration,
    /// Wall-clock time for extend-add (merging child contributions).
    #[cfg(feature = "diagnostic")]
    pub extend_add_time: std::time::Duration,
    /// Wall-clock time for extracting L11, D11, L21 from the factored frontal matrix.
    #[cfg(feature = "diagnostic")]
    pub extract_factors_time: std::time::Duration,
    /// Wall-clock time for extracting the contribution block.
    #[cfg(feature = "diagnostic")]
    pub extract_contrib_time: std::time::Duration,
    /// Wall-clock time for the deferred contribution GEMM.
    #[cfg(feature = "diagnostic")]
    pub contrib_gemm_time: std::time::Duration,
    /// Wall-clock time for resetting the global-to-local mapping after factoring.
    #[cfg(feature = "diagnostic")]
    pub g2l_reset_time: std::time::Duration,
}
656
/// Aggregate statistics from multifrontal factorization.
///
/// Reports pivot counts, delay events, and front sizes accumulated across all
/// supernodes. Timing totals exist only when the `diagnostic` feature is enabled.
#[derive(Debug, Clone)]
pub struct FactorizationStats {
    /// Total 1x1 pivots across all supernodes.
    pub total_1x1_pivots: usize,
    /// Total 2x2 pivot pairs across all supernodes.
    pub total_2x2_pivots: usize,
    /// Total delay events across all supernodes.
    pub total_delayed: usize,
    /// Columns that could not be eliminated at root supernodes (zero pivots).
    /// These represent rank deficiency in the matrix. A nonzero value means
    /// the matrix is numerically singular (or nearly so) — the solve phase
    /// must handle this appropriately.
    pub zero_pivots: usize,
    /// Largest frontal matrix dimension encountered.
    pub max_front_size: usize,
    /// Number of supernodes before amalgamation.
    pub supernodes_before_amalgamation: usize,
    /// Number of supernodes after amalgamation.
    pub supernodes_after_amalgamation: usize,
    /// Number of merge operations performed during amalgamation.
    pub merges_performed: usize,
    /// Sum of assembly times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_assembly_time: std::time::Duration,
    /// Sum of kernel times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_kernel_time: std::time::Duration,
    /// Sum of extraction times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extraction_time: std::time::Duration,
    // -- Sub-phase totals --
    /// Sum of zeroing times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_zero_time: std::time::Duration,
    /// Sum of g2l build times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_g2l_time: std::time::Duration,
    /// Sum of scatter times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_scatter_time: std::time::Duration,
    /// Sum of extend-add times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extend_add_time: std::time::Duration,
    /// Sum of factor extraction times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extract_factors_time: std::time::Duration,
    /// Sum of contribution extraction times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extract_contrib_time: std::time::Duration,
    /// Sum of deferred contribution GEMM times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_contrib_gemm_time: std::time::Duration,
    /// Sum of g2l reset times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_g2l_reset_time: std::time::Duration,
    /// Number of small-leaf subtrees identified by classification.
    pub small_leaf_subtrees: usize,
    /// Number of supernodes processed via the small-leaf fast path.
    pub small_leaf_nodes: usize,
}
720
/// Assembled frontal matrix exported for diagnostic comparison
/// (e.g., against a reference solver's fronts).
#[cfg(feature = "diagnostic")]
pub struct ExportedFrontal {
    /// Dense m x m frontal matrix (lower triangle populated).
    pub data: Mat<f64>,
    /// Total front size (m).
    pub front_size: usize,
    /// Number of fully-summed columns.
    pub num_fully_summed: usize,
    /// Global permuted column indices for each local row.
    pub row_indices: Vec<usize>,
}
733
/// Complete multifrontal numeric factorization result.
///
/// Contains per-supernode factors and aggregate statistics. Created by
/// [`AptpNumeric::factor`] and used by the triangular solve to
/// perform forward/backward substitution. Fields are private; use the
/// accessor methods.
///
/// # Usage
///
/// ```
/// use faer::sparse::{SparseColMat, Triplet};
/// use faer::sparse::linalg::cholesky::SymmetricOrdering;
/// use rivrs_sparse::symmetric::{AptpSymbolic, AptpOptions, AptpNumeric};
///
/// // Analyze
/// # let triplets = vec![Triplet::new(0, 0, 1.0)];
/// # let matrix = SparseColMat::try_new_from_triplets(1, 1, &triplets).unwrap();
/// let symbolic = AptpSymbolic::analyze(matrix.symbolic(), SymmetricOrdering::Amd).unwrap();
///
/// // Factor
/// let numeric = AptpNumeric::factor(&symbolic, &matrix, &AptpOptions::default(), None).unwrap();
/// println!("Stats: {:?}", numeric.stats());
/// ```
pub struct AptpNumeric {
    /// Per-supernode factors, indexed by supernode ID.
    front_factors: Vec<FrontFactors>,
    /// Aggregate factorization statistics.
    stats: FactorizationStats,
    /// Per-supernode diagnostic statistics (same order as `front_factors`).
    per_supernode_stats: Vec<PerSupernodeStats>,
    /// Matrix dimension.
    n: usize,
    /// Profiling session from factorization (Chrome Trace export).
    #[cfg(feature = "diagnostic")]
    profile_session: Option<FinishedSession>,
}
769
770impl std::fmt::Debug for AptpNumeric {
771 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
772 f.debug_struct("AptpNumeric")
773 .field("n", &self.n)
774 .field("stats", &self.stats)
775 .field("front_factors_count", &self.front_factors.len())
776 .finish()
777 }
778}
779
780impl AptpNumeric {
    /// Per-supernode factors, indexed by supernode ID (assembly-tree postorder).
    pub fn front_factors(&self) -> &[FrontFactors] {
        &self.front_factors
    }

    /// Aggregate factorization statistics.
    pub fn stats(&self) -> &FactorizationStats {
        &self.stats
    }

    /// Per-supernode diagnostic statistics.
    ///
    /// One entry per supernode in postorder, matching [`front_factors`](Self::front_factors).
    pub fn per_supernode_stats(&self) -> &[PerSupernodeStats] {
        &self.per_supernode_stats
    }

    /// Matrix dimension (n rows == n cols).
    pub fn n(&self) -> usize {
        self.n
    }

    /// Profiling session from factorization, for Chrome Trace export.
    ///
    /// Returns `None` if factorization was not profiled.
    /// Only available with `diagnostic` feature.
    #[cfg(feature = "diagnostic")]
    pub fn profile_session(&self) -> Option<&FinishedSession> {
        self.profile_session.as_ref()
    }
811
812 /// Factor a sparse symmetric matrix using the multifrontal method with APTP.
813 ///
814 /// Traverses the assembly tree in postorder, assembling and factoring dense
815 /// frontal matrices at each supernode using the dense APTP kernel.
816 ///
817 /// # Arguments
818 ///
819 /// - `symbolic`: Symbolic analysis result from [`AptpSymbolic::analyze`]
820 /// - `matrix`: Sparse symmetric matrix (lower triangle stored). Dimensions
821 /// must match `symbolic.nrows()`
822 /// - `options`: APTP configuration (threshold, fallback strategy)
823 ///
824 /// # Errors
825 ///
826 /// - [`SparseError::DimensionMismatch`] if matrix dimensions don't match symbolic
827 /// - [`SparseError::AnalysisFailure`] if symbolic analysis is inconsistent
828 ///
829 /// # Zero Pivots
830 ///
831 /// If a root supernode has columns that cannot be eliminated (all pivots
832 /// delayed to root but still fail the threshold), these are recorded as
833 /// zero pivots in [`FactorizationStats::zero_pivots`] rather than returning
834 /// an error. The factorization succeeds;
835 /// the solve phase must handle the rank deficiency.
836 ///
837 /// # References
838 ///
839 /// - Duff & Reid (1983), "The multifrontal solution of indefinite sparse
840 /// symmetric linear equations"
841 /// - Liu (1992), "The Multifrontal Method for Sparse Matrix Solution"
842 /// - Hogg, Duff & Lopez (2020), "A New Sparse LDL^T Solver Using A Posteriori
843 /// Threshold Pivoting"
844 pub fn factor(
845 symbolic: &AptpSymbolic,
846 matrix: &SparseColMat<usize, f64>,
847 options: &AptpOptions,
848 scaling: Option<&[f64]>,
849 ) -> Result<Self, SparseError> {
850 let nemin = options.nemin;
851 let n = symbolic.nrows();
852
853 // Validate dimensions
854 if matrix.nrows() != n || matrix.ncols() != n {
855 return Err(SparseError::DimensionMismatch {
856 expected: (n, n),
857 got: (matrix.nrows(), matrix.ncols()),
858 context: "Matrix dimensions must match symbolic analysis".to_string(),
859 });
860 }
861
862 // Build supernode info (unified abstraction over supernodal/simplicial)
863 let supernodes = build_supernode_info(symbolic);
864 let n_supernodes_before = supernodes.len();
865 let mut supernodes = super::amalgamation::amalgamate(supernodes, nemin);
866 let n_supernodes = supernodes.len();
867 let merges_performed = n_supernodes_before - n_supernodes;
868 let children = build_children_map(&supernodes);
869
870 // Classify small-leaf subtrees for the fast-path pre-pass
871 let small_leaf_subtrees =
872 classify_small_leaf_subtrees(&mut supernodes, &children, options.small_leaf_threshold);
873
874 // Get fill-reducing permutation
875 let (perm_fwd, perm_inv) = symbolic.perm_vecs();
876
877 // Precompute assembly index maps (scatter + extend-add) for the
878 // zero-delay case. Supernodes with delayed children fall back to
879 // per-entry index arithmetic.
880 let assembly_maps =
881 build_assembly_maps(&supernodes, &children, matrix, &perm_fwd, &perm_inv, n);
882
883 // Profiling session (diagnostic only)
884 #[cfg(feature = "diagnostic")]
885 let session = ProfileSession::new();
886 #[cfg(feature = "diagnostic")]
887 let _factor_loop_guard = session.enter_section("factor_loop");
888
889 // Iterative level-set factorization: process all ready (leaf) nodes first,
890 // then propagate upward. No recursion — avoids stack overflow on deep
891 // elimination trees (e.g., c-71 with 76K supernodes on rayon's 2MB stacks).
892 let all_node_results = factor_tree_levelset(
893 &supernodes,
894 &children,
895 matrix,
896 &perm_fwd,
897 &perm_inv,
898 options,
899 scaling,
900 n,
901 &assembly_maps,
902 &small_leaf_subtrees,
903 )?;
904
905 #[cfg(feature = "diagnostic")]
906 drop(_factor_loop_guard);
907
908 // Scatter returned results into supernode-indexed vectors
909 let mut front_factors_vec: Vec<Option<FrontFactors>> =
910 (0..n_supernodes).map(|_| None).collect();
911 let mut per_sn_stats_vec: Vec<Option<PerSupernodeStats>> =
912 (0..n_supernodes).map(|_| None).collect();
913 for (idx, ff, stats) in all_node_results {
914 front_factors_vec[idx] = Some(ff);
915 per_sn_stats_vec[idx] = Some(stats);
916 }
917
918 let front_factors: Vec<FrontFactors> = front_factors_vec
919 .into_iter()
920 .enumerate()
921 .map(|(i, opt)| opt.unwrap_or_else(|| panic!("supernode {} not factored", i)))
922 .collect();
923 let per_sn_stats: Vec<PerSupernodeStats> = per_sn_stats_vec
924 .into_iter()
925 .enumerate()
926 .map(|(i, opt)| opt.unwrap_or_else(|| panic!("supernode {} missing stats", i)))
927 .collect();
928
929 // Compute aggregate stats from per-supernode stats
930 let mut stats = FactorizationStats {
931 total_1x1_pivots: 0,
932 total_2x2_pivots: 0,
933 total_delayed: 0,
934 zero_pivots: 0,
935 max_front_size: 0,
936 supernodes_before_amalgamation: n_supernodes_before,
937 supernodes_after_amalgamation: n_supernodes,
938 merges_performed,
939 #[cfg(feature = "diagnostic")]
940 total_assembly_time: std::time::Duration::ZERO,
941 #[cfg(feature = "diagnostic")]
942 total_kernel_time: std::time::Duration::ZERO,
943 #[cfg(feature = "diagnostic")]
944 total_extraction_time: std::time::Duration::ZERO,
945 #[cfg(feature = "diagnostic")]
946 total_zero_time: std::time::Duration::ZERO,
947 #[cfg(feature = "diagnostic")]
948 total_g2l_time: std::time::Duration::ZERO,
949 #[cfg(feature = "diagnostic")]
950 total_scatter_time: std::time::Duration::ZERO,
951 #[cfg(feature = "diagnostic")]
952 total_extend_add_time: std::time::Duration::ZERO,
953 #[cfg(feature = "diagnostic")]
954 total_extract_factors_time: std::time::Duration::ZERO,
955 #[cfg(feature = "diagnostic")]
956 total_extract_contrib_time: std::time::Duration::ZERO,
957 #[cfg(feature = "diagnostic")]
958 total_contrib_gemm_time: std::time::Duration::ZERO,
959 #[cfg(feature = "diagnostic")]
960 total_g2l_reset_time: std::time::Duration::ZERO,
961 small_leaf_subtrees: small_leaf_subtrees.len(),
962 small_leaf_nodes: small_leaf_subtrees.iter().map(|s| s.nodes.len()).sum(),
963 };
964 for sn_stat in &per_sn_stats {
965 stats.total_1x1_pivots += sn_stat.num_1x1;
966 stats.total_2x2_pivots += sn_stat.num_2x2;
967 stats.total_delayed += sn_stat.num_delayed;
968 if sn_stat.front_size > stats.max_front_size {
969 stats.max_front_size = sn_stat.front_size;
970 }
971 #[cfg(feature = "diagnostic")]
972 {
973 stats.total_assembly_time += sn_stat.assembly_time;
974 stats.total_kernel_time += sn_stat.kernel_time;
975 stats.total_extraction_time += sn_stat.extraction_time;
976 stats.total_zero_time += sn_stat.zero_time;
977 stats.total_g2l_time += sn_stat.g2l_time;
978 stats.total_scatter_time += sn_stat.scatter_time;
979 stats.total_extend_add_time += sn_stat.extend_add_time;
980 stats.total_extract_factors_time += sn_stat.extract_factors_time;
981 stats.total_extract_contrib_time += sn_stat.extract_contrib_time;
982 stats.total_contrib_gemm_time += sn_stat.contrib_gemm_time;
983 stats.total_g2l_reset_time += sn_stat.g2l_reset_time;
984 }
985 }
986
987 // Count zero pivots from front factors
988 for ff in &front_factors {
989 for (_, entry) in ff.d11().iter_pivots() {
990 if let PivotEntry::OneByOne(val) = entry {
991 if val == 0.0 {
992 stats.zero_pivots += 1;
993 }
994 }
995 }
996 }
997 // Also count unresolved delays at root supernodes
998 for (s, sn) in supernodes.iter().enumerate() {
999 if sn.parent.is_none() {
1000 let sn_stat = &per_sn_stats[s];
1001 let m = sn_stat.front_size;
1002 let ne = sn_stat.num_eliminated;
1003 if ne < m {
1004 let k = sn_stat.num_fully_summed;
1005 let n_unresolved = k - ne;
1006 stats.zero_pivots += n_unresolved;
1007 }
1008 }
1009 }
1010
1011 // Finish profiling session
1012 #[cfg(feature = "diagnostic")]
1013 let finished_session = session.finish();
1014
1015 Ok(AptpNumeric {
1016 front_factors,
1017 stats,
1018 per_supernode_stats: per_sn_stats,
1019 n,
1020 #[cfg(feature = "diagnostic")]
1021 profile_session: Some(finished_session),
1022 })
1023 }
1024
1025 /// Export the assembled (pre-factorization) frontal matrix for a specific supernode.
1026 ///
1027 /// Runs the multifrontal assembly up to the target supernode, factoring all
1028 /// preceding supernodes to generate their contributions, then returns the
1029 /// assembled frontal matrix for the target **before** APTP factoring.
1030 ///
1031 /// # Selecting the Target Supernode
1032 ///
1033 /// If `target_snode` is `None`, the supernode with the largest front size is used.
1034 /// Otherwise, the specified supernode index is used.
1035 ///
1036 /// # Use Case
1037 ///
1038 /// This is a diagnostic tool for comparing our APTP dense kernel with SPRAL's
1039 /// `ldlt_app_factor` on the same assembled input. The exported matrix can be
1040 /// written to a file and fed to a SPRAL driver.
1041 #[cfg(feature = "diagnostic")]
1042 pub fn export_assembled_frontal(
1043 symbolic: &AptpSymbolic,
1044 matrix: &SparseColMat<usize, f64>,
1045 options: &AptpOptions,
1046 scaling: Option<&[f64]>,
1047 target_snode: Option<usize>,
1048 ) -> Result<ExportedFrontal, SparseError> {
1049 let n = symbolic.nrows();
1050
1051 if matrix.nrows() != n || matrix.ncols() != n {
1052 return Err(SparseError::DimensionMismatch {
1053 expected: (n, n),
1054 got: (matrix.nrows(), matrix.ncols()),
1055 context: "Matrix dimensions must match symbolic analysis".to_string(),
1056 });
1057 }
1058
1059 let supernodes = build_supernode_info(symbolic);
1060 let n_supernodes = supernodes.len();
1061 let children = build_children_map(&supernodes);
1062 let (perm_fwd, perm_inv) = symbolic.perm_vecs();
1063
1064 // Determine target supernode
1065 let target = match target_snode {
1066 Some(t) => {
1067 if t >= n_supernodes {
1068 return Err(SparseError::AnalysisFailure {
1069 reason: format!(
1070 "Target supernode {} out of range (n_supernodes={})",
1071 t, n_supernodes
1072 ),
1073 });
1074 }
1075 t
1076 }
1077 None => {
1078 // Find supernode with largest front
1079 let mut best = 0;
1080 let mut best_m = 0;
1081 for (s, sn) in supernodes.iter().enumerate() {
1082 let sn_ncols = sn.col_end - sn.col_begin;
1083 // Approximate: delayed cols not counted here, just pattern + cols
1084 let m_approx = sn_ncols + sn.pattern.len();
1085 if m_approx > best_m {
1086 best_m = m_approx;
1087 best = s;
1088 }
1089 }
1090 best
1091 }
1092 };
1093
1094 let mut global_to_local = vec![NOT_IN_FRONT; n];
1095 let mut contributions: Vec<Option<ContributionBlock>> =
1096 (0..n_supernodes).map(|_| None).collect();
1097
1098 // Process supernodes in postorder up to and including target
1099 for s in 0..=target {
1100 let sn = &supernodes[s];
1101
1102 // 1. Collect delayed columns from children
1103 let mut delayed_cols: Vec<usize> = Vec::new();
1104 for &c in &children[s] {
1105 if let Some(ref cb) = contributions[c] {
1106 delayed_cols.extend_from_slice(&cb.row_indices[..cb.num_delayed]);
1107 }
1108 }
1109
1110 // 2. Compute frontal matrix structure
1111 let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
1112 let k = sn_ncols + delayed_cols.len();
1113 let mut frontal_rows: Vec<usize> = Vec::with_capacity(k + sn.pattern.len());
1114 for range in &sn.owned_ranges {
1115 frontal_rows.extend(range.clone());
1116 }
1117 frontal_rows.extend_from_slice(&delayed_cols);
1118 frontal_rows.extend_from_slice(&sn.pattern);
1119 let m = frontal_rows.len();
1120
1121 // 3. Build global-to-local mapping
1122 for (i, &global) in frontal_rows.iter().enumerate() {
1123 global_to_local[global] = i;
1124 }
1125
1126 // 4. Assemble frontal matrix (owned data — diagnostic path)
1127 let mut frontal_data = Mat::zeros(m, m);
1128 {
1129 let mut frontal = FrontalMatrix {
1130 data: frontal_data.as_mut(),
1131 row_indices: &frontal_rows,
1132 num_fully_summed: k,
1133 };
1134
1135 scatter_original_entries_multi(
1136 &mut frontal,
1137 matrix,
1138 &perm_fwd,
1139 &perm_inv,
1140 &global_to_local,
1141 &sn.owned_ranges,
1142 scaling,
1143 );
1144
1145 for &c in &children[s] {
1146 if let Some(cb) = contributions[c].take() {
1147 let _ = extend_add(&mut frontal, cb, &global_to_local);
1148 }
1149 }
1150 }
1151
1152 if s == target {
1153 // Return the assembled frontal matrix BEFORE factoring
1154 // Clean up global_to_local
1155 for &global in &frontal_rows {
1156 global_to_local[global] = NOT_IN_FRONT;
1157 }
1158 return Ok(ExportedFrontal {
1159 data: frontal_data,
1160 front_size: m,
1161 num_fully_summed: k,
1162 row_indices: frontal_rows,
1163 });
1164 }
1165
1166 // Factor this supernode (needed for contribution propagation)
1167 let result = aptp_factor_in_place(frontal_data.as_mut(), k, options)?;
1168 let ne = result.num_eliminated;
1169
1170 if sn.parent.is_some() && ne < m {
1171 let nfs = m - k;
1172 let mut cb = Mat::zeros(m, m);
1173 if nfs > 0 {
1174 let mut ld_ws = Mat::zeros(m, ne.max(1));
1175 compute_contribution_gemm(
1176 &frontal_data,
1177 k,
1178 ne,
1179 m,
1180 &result.d,
1181 &mut cb,
1182 &mut ld_ws,
1183 Par::Seq,
1184 );
1185 }
1186 contributions[s] = Some(extract_contribution(
1187 &frontal_data,
1188 m,
1189 k,
1190 &frontal_rows,
1191 &result,
1192 cb,
1193 ));
1194 }
1195
1196 // Cleanup
1197 for &global in &frontal_rows {
1198 global_to_local[global] = NOT_IN_FRONT;
1199 }
1200 }
1201
1202 // Should not reach here — we return inside the loop at s == target
1203 unreachable!("Target supernode {} not reached in postorder loop", target)
1204 }
1205}
1206
1207// ---------------------------------------------------------------------------
1208// Tree-level factorization functions
1209// ---------------------------------------------------------------------------
1210
1211/// Result of factoring a single supernode.
1212struct SupernodeResult {
1213 ff: FrontFactors,
1214 contribution: Option<ContributionBlock>,
1215 stats: PerSupernodeStats,
1216}
1217
/// Factor a single supernode given its children's contribution blocks.
///
/// Assembles the frontal matrix from sparse entries and child contributions,
/// factors it with APTP, and extracts the front factors and contribution block.
///
/// Uses the pre-allocated `workspace` for the frontal matrix buffer, row indices,
/// delayed columns, and global-to-local mapping to avoid per-supernode allocations.
///
/// Returns the extracted [`FrontFactors`], the contribution block for the parent
/// (if any), and per-supernode statistics, bundled in a `SupernodeResult`.
#[allow(clippy::too_many_arguments)]
fn factor_single_supernode(
    s: usize,
    sn: &SupernodeInfo,
    child_contributions: Vec<Option<ContributionBlock>>,
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    workspace: &mut FactorizationWorkspace,
    assembly_maps: &AssemblyMaps,
) -> Result<SupernodeResult, SparseError> {
    // 1. Collect delayed columns from children into workspace buffer
    workspace.delayed_cols_buf.clear();
    for cb in child_contributions.iter().flatten() {
        workspace
            .delayed_cols_buf
            .extend_from_slice(&cb.row_indices[..cb.num_delayed]);
    }

    // 2. Compute frontal matrix structure
    // Use owned_ranges (not col_begin..col_end) to enumerate fully-summed columns.
    // After amalgamation of non-adjacent supernodes, col_begin..col_end may span
    // columns belonging to other supernodes. Only owned columns should be in the front.
    let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
    let k = sn_ncols + workspace.delayed_cols_buf.len(); // num_fully_summed

    // Front row order: owned columns, then delayed columns, then the pattern
    // (non-fully-summed rows). `k` marks the fully-summed / NFS boundary.
    workspace.frontal_row_indices.clear();
    workspace.frontal_row_indices.reserve(k + sn.pattern.len());
    for range in &sn.owned_ranges {
        workspace.frontal_row_indices.extend(range.clone());
    }
    workspace
        .frontal_row_indices
        .extend_from_slice(&workspace.delayed_cols_buf);
    workspace.frontal_row_indices.extend_from_slice(&sn.pattern);
    let m = workspace.frontal_row_indices.len();

    // 3. Ensure workspace capacity and zero frontal data for this supernode
    // If delayed columns make this front larger than estimated, grow the workspace
    if m > workspace.frontal_data.nrows() {
        workspace.frontal_data = Mat::zeros(m, m);
    }

    #[cfg(feature = "diagnostic")]
    let zero_start = std::time::Instant::now();

    workspace.zero_frontal(m);

    #[cfg(feature = "diagnostic")]
    let zero_time = zero_start.elapsed();

    // 4. Build global-to-local mapping
    #[cfg(feature = "diagnostic")]
    let g2l_start = std::time::Instant::now();

    for (i, &global) in workspace.frontal_row_indices.iter().enumerate() {
        workspace.global_to_local[global] = i;
    }

    #[cfg(feature = "diagnostic")]
    let g2l_time = g2l_start.elapsed();

    // 5. Assemble frontal matrix using workspace buffer
    #[cfg(feature = "diagnostic")]
    let assembly_start = std::time::Instant::now();

    let ndelay_in = workspace.delayed_cols_buf.len();

    let mut frontal = FrontalMatrix {
        data: workspace.frontal_data.as_mut().submatrix_mut(0, 0, m, m),
        row_indices: &workspace.frontal_row_indices,
        num_fully_summed: k,
    };

    // 5a. Scatter original CSC entries
    #[cfg(feature = "diagnostic")]
    let scatter_start = std::time::Instant::now();

    // Use precomputed amap when no delays shift row positions
    if ndelay_in == 0 {
        // Fast path: scatter using precomputed index tuples (4 u32 per entry)
        // Each logical entry is 4 packed u32s:
        //   [0] source index into matrix.val()
        //   [1] column-major linear destination (col * m + row)
        //   [2]/[3] row/col indices into the scaling vector
        let amap_start = assembly_maps.amap_offsets[s];
        let amap_end = assembly_maps.amap_offsets[s + 1];
        let values = matrix.val();
        let amap = &assembly_maps.amap_entries[amap_start * 4..amap_end * 4];

        if let Some(sc) = scaling {
            for entry in amap.chunks_exact(4) {
                let src_idx = entry[0] as usize;
                let dest_linear = entry[1] as usize;
                let scale_row = entry[2] as usize;
                let scale_col = entry[3] as usize;
                let val = values[src_idx] * sc[scale_row] * sc[scale_col];
                let dest_col = dest_linear / m;
                let dest_row = dest_linear % m;
                frontal.data[(dest_row, dest_col)] += val;
            }
        } else {
            for entry in amap.chunks_exact(4) {
                let src_idx = entry[0] as usize;
                let dest_linear = entry[1] as usize;
                let dest_col = dest_linear / m;
                let dest_row = dest_linear % m;
                frontal.data[(dest_row, dest_col)] += values[src_idx];
            }
        }
    } else {
        // Fallback: delayed columns shift positions, use per-entry assembly
        scatter_original_entries_multi(
            &mut frontal,
            matrix,
            perm_fwd,
            perm_inv,
            &workspace.global_to_local,
            &sn.owned_ranges,
            scaling,
        );
    }

    #[cfg(feature = "diagnostic")]
    let scatter_time = scatter_start.elapsed();

    // 5b. Extend-add: merge child contributions into parent frontal matrix
    #[cfg(feature = "diagnostic")]
    let ea_start_time = std::time::Instant::now();

    let ea_children_begin = assembly_maps.ea_snode_child_begin[s];
    let mut ea_child_idx = ea_children_begin;

    for opt_cb in child_contributions {
        if let Some(cb) = opt_cb {
            // Check if this child had delays — if so, fall back to g2l
            let recycled = if cb.num_delayed > 0 || ndelay_in > 0 {
                extend_add(&mut frontal, cb, &workspace.global_to_local)
            } else {
                // Fast path: use precomputed row mapping
                let ea_start = assembly_maps.ea_offsets[ea_child_idx];
                let ea_end = assembly_maps.ea_offsets[ea_child_idx + 1];
                let ea_row_map = &assembly_maps.ea_map[ea_start..ea_end];
                extend_add_mapped(&mut frontal, cb, ea_row_map)
            };
            // Recycle the returned buffer into workspace if it's larger than
            // what we currently have (or if ours was moved out).
            if recycled.nrows() >= workspace.contrib_buffer.nrows() {
                workspace.contrib_buffer = recycled;
            }
        }
        // Always increment: ea_offsets indexes all children, including those
        // with no contribution (fully eliminated, ne == m).
        ea_child_idx += 1;
    }

    #[cfg(feature = "diagnostic")]
    let extend_add_time = ea_start_time.elapsed();

    #[cfg(feature = "diagnostic")]
    let assembly_time = assembly_start.elapsed();

    // 6. Factor the frontal matrix
    #[cfg(feature = "diagnostic")]
    let kernel_start = std::time::Instant::now();

    // Small fronts are factored sequentially regardless of the requested
    // parallelism — below INTRA_NODE_THRESHOLD the scheduling overhead dominates.
    let effective_par = if m < INTRA_NODE_THRESHOLD {
        Par::Seq
    } else {
        options.par
    };
    let per_sn_options = AptpOptions {
        par: effective_par,
        ..options.clone()
    };
    let result = aptp_factor_in_place(
        workspace.frontal_data.as_mut().submatrix_mut(0, 0, m, m),
        k,
        &per_sn_options,
    )?;
    let ne = result.num_eliminated;

    #[cfg(feature = "diagnostic")]
    let kernel_time = kernel_start.elapsed();

    // 6b. Deferred contribution GEMM: compute NFS×NFS Schur complement directly
    // into workspace.contrib_buffer. This replaces the per-block NFS×NFS
    // updates that were skipped during the blocking loop.
    #[cfg(feature = "diagnostic")]
    let contrib_gemm_start = std::time::Instant::now();

    let nfs = m - k;
    if sn.parent.is_some() && ne < m && nfs > 0 {
        // Ensure contrib_buffer is large enough (rare: delayed cascades may exceed estimate)
        if workspace.contrib_buffer.nrows() < m || workspace.contrib_buffer.ncols() < m {
            workspace.contrib_buffer = Mat::zeros(m, m);
        }
        compute_contribution_gemm(
            &workspace.frontal_data,
            k,
            ne,
            m,
            &result.d,
            &mut workspace.contrib_buffer,
            &mut workspace.ld_workspace,
            effective_par,
        );
    }

    #[cfg(feature = "diagnostic")]
    let contrib_gemm_time = contrib_gemm_start.elapsed();

    // 7. Extract front factors (reads from workspace frontal data)
    #[cfg(feature = "diagnostic")]
    let extraction_start = std::time::Instant::now();
    #[cfg(feature = "diagnostic")]
    let extract_factors_start = std::time::Instant::now();

    // Use owned Mat<f64> directly for col_as_slice-based bulk extraction
    let ff = extract_front_factors(
        &workspace.frontal_data,
        m,
        k,
        &workspace.frontal_row_indices,
        &result,
    );

    #[cfg(feature = "diagnostic")]
    let extract_factors_time = extract_factors_start.elapsed();

    // 8. Prepare contribution for parent (if not root and not fully eliminated)
    // NFS×NFS data is in workspace.contrib_buffer (from deferred GEMM).
    // Delayed-column data (if any) is still in workspace.frontal_data.
    #[cfg(feature = "diagnostic")]
    let extract_contrib_start = std::time::Instant::now();

    let contribution = if sn.parent.is_some() && ne < m {
        // contrib_buffer is moved out here (replaced by an empty Mat); the parent's
        // extend-add recycling above restores a full-size buffer on the next node.
        Some(extract_contribution(
            &workspace.frontal_data,
            m,
            k,
            &workspace.frontal_row_indices,
            &result,
            std::mem::replace(&mut workspace.contrib_buffer, Mat::new()),
        ))
    } else {
        None
    };

    #[cfg(feature = "diagnostic")]
    let extract_contrib_time = extract_contrib_start.elapsed();

    #[cfg(feature = "diagnostic")]
    let extraction_time = extraction_start.elapsed();

    let stats = PerSupernodeStats {
        snode_id: s,
        front_size: m,
        num_fully_summed: k,
        num_eliminated: ne,
        num_delayed: k - ne,
        num_1x1: result.stats.num_1x1,
        num_2x2: result.stats.num_2x2,
        max_l_entry: result.stats.max_l_entry,
        #[cfg(feature = "diagnostic")]
        assembly_time,
        #[cfg(feature = "diagnostic")]
        kernel_time,
        #[cfg(feature = "diagnostic")]
        extraction_time,
        #[cfg(feature = "diagnostic")]
        zero_time,
        #[cfg(feature = "diagnostic")]
        g2l_time,
        #[cfg(feature = "diagnostic")]
        scatter_time,
        #[cfg(feature = "diagnostic")]
        extend_add_time,
        #[cfg(feature = "diagnostic")]
        extract_factors_time,
        #[cfg(feature = "diagnostic")]
        extract_contrib_time,
        #[cfg(feature = "diagnostic")]
        contrib_gemm_time,
        #[cfg(feature = "diagnostic")]
        g2l_reset_time: std::time::Duration::ZERO, // filled below after reset
    };

    // Reset global_to_local entries for reuse (O(m), not O(n))
    #[cfg(feature = "diagnostic")]
    let g2l_reset_start = std::time::Instant::now();

    for &global in &workspace.frontal_row_indices[..m] {
        workspace.global_to_local[global] = NOT_IN_FRONT;
    }

    #[cfg(feature = "diagnostic")]
    let g2l_reset_time = g2l_reset_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let stats = {
        let mut s = stats;
        s.g2l_reset_time = g2l_reset_time;
        s
    };

    Ok(SupernodeResult {
        ff,
        contribution,
        stats,
    })
}
1534
1535/// Factor a small-leaf subtree using rectangular L storage.
1536///
1537/// Replaces the general `factor_single_supernode` path for classified small-leaf
1538/// subtrees with a streamlined per-node loop that:
1539/// 1. Uses rectangular m×k L storage instead of square m×m frontal matrix
1540/// 2. Factors via `tpp_factor_rect` (rectangular TPP kernel)
1541/// 3. Computes contribution GEMM from rectangular storage
1542/// 4. Reuses a single small workspace across all nodes in the subtree
1543///
1544/// # Arguments
1545///
1546/// - `subtree`: Small-leaf subtree descriptor (nodes in postorder).
1547/// - `supernodes`: All supernode descriptors.
1548/// - `children`: Children map.
1549/// - `matrix`: Original sparse matrix.
1550/// - `perm_fwd`/`perm_inv`: Forward/inverse permutations.
1551/// - `options`: APTP options.
1552/// - `scaling`: Optional MC64 scaling factors.
1553/// - `n`: Matrix dimension.
1554/// - `contributions`: Global contribution storage (read children, write results).
1555/// - `assembly_maps`: Precomputed scatter maps.
1556///
1557/// # Returns
1558///
1559/// Vec of (node_id, FrontFactors, PerSupernodeStats) for each node in the subtree.
1560// SPRAL Equivalent: `SmallLeafNumericSubtree::factor()` in `SmallLeafNumericSubtree.hxx:187-446` (BSD-3).
1561#[allow(clippy::too_many_arguments)]
1562fn factor_small_leaf_subtree(
1563 subtree: &SmallLeafSubtree,
1564 supernodes: &[SupernodeInfo],
1565 children: &[Vec<usize>],
1566 matrix: &SparseColMat<usize, f64>,
1567 perm_fwd: &[usize],
1568 perm_inv: &[usize],
1569 options: &AptpOptions,
1570 scaling: Option<&[f64]>,
1571 n: usize,
1572 contributions: &mut [Option<ContributionBlock>],
1573 assembly_maps: &AssemblyMaps,
1574) -> Result<Vec<(usize, FrontFactors, PerSupernodeStats)>, SparseError> {
1575 let max_front = subtree.max_front_size;
1576 let mut results = Vec::with_capacity(subtree.nodes.len());
1577
1578 // Shared workspace for the subtree:
1579 // - l_storage: rectangular m×k buffer, reused per node (max_front × max_front covers all)
1580 // - global_to_local: length n, reused per node
1581 // - contrib_buffer: for contribution GEMM output, recycled
1582 // - ld_workspace: for L·D product in contribution GEMM
1583 // - frontal_row_indices: reusable buffer for building front structure
1584 // - delayed_cols_buf: reusable buffer for collecting child delays
1585 let mut l_storage = Mat::<f64>::zeros(max_front, max_front);
1586 let mut global_to_local = vec![NOT_IN_FRONT; n];
1587 let mut contrib_buffer = Mat::<f64>::new();
1588 let mut ld_workspace = Mat::<f64>::new();
1589 let mut frontal_row_indices = Vec::<usize>::with_capacity(max_front);
1590 let mut delayed_cols_buf = Vec::<usize>::new();
1591
1592 let symbolic_matrix = matrix.symbolic();
1593 let col_ptrs = symbolic_matrix.col_ptr();
1594 let row_indices_csc = symbolic_matrix.row_idx();
1595 let values = matrix.val();
1596
1597 for &node_id in &subtree.nodes {
1598 let sn = &supernodes[node_id];
1599
1600 // 1. Collect delayed columns from children
1601 delayed_cols_buf.clear();
1602 for &c in &children[node_id] {
1603 if let Some(ref cb) = contributions[c] {
1604 delayed_cols_buf.extend_from_slice(&cb.row_indices[..cb.num_delayed]);
1605 }
1606 }
1607
1608 // 2. Compute frontal matrix structure
1609 let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
1610 let k = sn_ncols + delayed_cols_buf.len(); // num_fully_summed
1611
1612 frontal_row_indices.clear();
1613 frontal_row_indices.reserve(k + sn.pattern.len());
1614 for range in &sn.owned_ranges {
1615 frontal_row_indices.extend(range.clone());
1616 }
1617 frontal_row_indices.extend_from_slice(&delayed_cols_buf);
1618 frontal_row_indices.extend_from_slice(&sn.pattern);
1619 let m = frontal_row_indices.len();
1620
1621 // 3. Ensure l_storage capacity and zero rectangular m×k region
1622 #[cfg(feature = "diagnostic")]
1623 let zero_start = std::time::Instant::now();
1624
1625 if m > l_storage.nrows() || k > l_storage.ncols() {
1626 l_storage = Mat::zeros(m, k.max(l_storage.ncols()));
1627 }
1628 // Zero only the m×k rectangle (smaller than m×m for NFS nodes)
1629 for j in 0..k {
1630 l_storage.col_as_slice_mut(j)[0..m].fill(0.0);
1631 }
1632
1633 #[cfg(feature = "diagnostic")]
1634 let zero_time = zero_start.elapsed();
1635
1636 // 4. Build global-to-local mapping
1637 #[cfg(feature = "diagnostic")]
1638 let g2l_start = std::time::Instant::now();
1639
1640 for (i, &global) in frontal_row_indices.iter().enumerate() {
1641 global_to_local[global] = i;
1642 }
1643
1644 #[cfg(feature = "diagnostic")]
1645 let g2l_time = g2l_start.elapsed();
1646
1647 // 5. Split assembly: scatter into l_storage (FS columns) and contrib_buffer (NFS×NFS)
1648 #[cfg(feature = "diagnostic")]
1649 let assembly_start = std::time::Instant::now();
1650 #[cfg(feature = "diagnostic")]
1651 let scatter_start = std::time::Instant::now();
1652
1653 let ndelay_in = delayed_cols_buf.len();
1654 let total_owned = sn_ncols;
1655 let nfs = m - k;
1656
1657 // Pre-allocate and zero contrib_buffer if there will be NFS rows.
1658 // This allows both original matrix scatter and child extend-add to
1659 // deposit NFS×NFS entries directly, avoiding a re-scatter pass.
1660 let has_nfs = nfs > 0 && sn.parent.is_some();
1661 if has_nfs {
1662 if contrib_buffer.nrows() < nfs || contrib_buffer.ncols() < nfs {
1663 contrib_buffer = Mat::zeros(
1664 nfs.max(contrib_buffer.nrows()),
1665 nfs.max(contrib_buffer.ncols()),
1666 );
1667 }
1668 for j in 0..nfs {
1669 contrib_buffer.col_as_slice_mut(j)[0..nfs].fill(0.0);
1670 }
1671 }
1672
1673 // 5a. Scatter original CSC entries — FS columns → l_storage, NFS×NFS → contrib_buffer
1674 if ndelay_in == 0 {
1675 let amap_start = assembly_maps.amap_offsets[node_id];
1676 let amap_end = assembly_maps.amap_offsets[node_id + 1];
1677 let amap = &assembly_maps.amap_entries[amap_start * 4..amap_end * 4];
1678
1679 if let Some(sc) = scaling {
1680 for entry in amap.chunks_exact(4) {
1681 let src_idx = entry[0] as usize;
1682 let dest_linear = entry[1] as usize;
1683 let scale_row = entry[2] as usize;
1684 let scale_col = entry[3] as usize;
1685 let val = values[src_idx] * sc[scale_row] * sc[scale_col];
1686 let amap_col = dest_linear / m;
1687 let amap_row = dest_linear % m;
1688 if amap_col < k {
1689 l_storage[(amap_row, amap_col)] += val;
1690 } else if has_nfs && amap_row >= k {
1691 // NFS×NFS entry → contrib_buffer
1692 contrib_buffer[(amap_row - k, amap_col - k)] += val;
1693 }
1694 // FS×NFS cross-terms (amap_col >= k, amap_row < k) are handled
1695 // by the symmetric counterpart where amap_col < k.
1696 }
1697 } else {
1698 for entry in amap.chunks_exact(4) {
1699 let src_idx = entry[0] as usize;
1700 let dest_linear = entry[1] as usize;
1701 let amap_col = dest_linear / m;
1702 let amap_row = dest_linear % m;
1703 if amap_col < k {
1704 l_storage[(amap_row, amap_col)] += values[src_idx];
1705 } else if has_nfs && amap_row >= k {
1706 contrib_buffer[(amap_row - k, amap_col - k)] += values[src_idx];
1707 }
1708 }
1709 }
1710 } else {
1711 // Fallback: delayed columns shift positions, use per-entry g2l scatter
1712 for range in &sn.owned_ranges {
1713 for pj in range.clone() {
1714 let orig_col = perm_fwd[pj];
1715 let local_col = global_to_local[pj];
1716
1717 let start = col_ptrs[orig_col];
1718 let end = col_ptrs[orig_col + 1];
1719 for idx in start..end {
1720 let orig_row = row_indices_csc[idx];
1721 if orig_row < orig_col {
1722 let perm_row = perm_inv[orig_row];
1723 let local_peer = global_to_local[perm_row];
1724 if local_peer != NOT_IN_FRONT && local_peer < total_owned {
1725 continue;
1726 }
1727 }
1728 let global_row = perm_inv[orig_row];
1729 let local_row = global_to_local[global_row];
1730 if local_row == NOT_IN_FRONT {
1731 continue;
1732 }
1733 if local_row >= total_owned && local_row < k {
1734 continue;
1735 }
1736 let mut val = values[idx];
1737 if let Some(s) = scaling {
1738 val *= s[perm_inv[orig_row]] * s[perm_inv[orig_col]];
1739 }
1740 let (r, c) = if local_row >= local_col {
1741 (local_row, local_col)
1742 } else {
1743 (local_col, local_row)
1744 };
1745 if c < k {
1746 l_storage[(r, c)] += val;
1747 } else if has_nfs && r >= k {
1748 contrib_buffer[(r - k, c - k)] += val;
1749 }
1750 }
1751 }
1752 }
1753 }
1754
1755 #[cfg(feature = "diagnostic")]
1756 let scatter_time = scatter_start.elapsed();
1757
1758 // 5b. Extend-add child contributions
1759 // FS columns → l_storage, NFS×NFS → contrib_buffer
1760 #[cfg(feature = "diagnostic")]
1761 let extend_add_start = std::time::Instant::now();
1762
1763 for &c in &children[node_id] {
1764 if let Some(cb) = contributions[c].take() {
1765 let cb_size = cb.row_indices.len();
1766 for i in 0..cb_size {
1767 let gi = cb.row_indices[i];
1768 let li = global_to_local[gi];
1769 debug_assert!(li != NOT_IN_FRONT, "child row {} not in parent", gi);
1770 for j in 0..=i {
1771 let gj = cb.row_indices[j];
1772 let lj = global_to_local[gj];
1773 debug_assert!(lj != NOT_IN_FRONT, "child col {} not in parent", gj);
1774 let val = cb.data[(i, j)];
1775 if val != 0.0 {
1776 let (li, lj) = if li >= lj { (li, lj) } else { (lj, li) };
1777 if lj < k {
1778 // FS column → l_storage
1779 l_storage[(li, lj)] += val;
1780 } else if has_nfs {
1781 // NFS×NFS → contrib_buffer
1782 contrib_buffer[(li - k, lj - k)] += val;
1783 }
1784 }
1785 }
1786 }
1787 // Only recycle buffer if we're NOT using contrib_buffer for NFS assembly
1788 if !has_nfs && cb.data.nrows() >= contrib_buffer.nrows() {
1789 contrib_buffer = cb.data;
1790 }
1791 }
1792 }
1793
1794 #[cfg(feature = "diagnostic")]
1795 let extend_add_time = extend_add_start.elapsed();
1796 #[cfg(feature = "diagnostic")]
1797 let assembly_time = assembly_start.elapsed();
1798
1799 // 6. Factor using rectangular TPP kernel
1800 #[cfg(feature = "diagnostic")]
1801 let kernel_start = std::time::Instant::now();
1802
1803 let result = tpp_factor_rect(l_storage.as_mut().submatrix_mut(0, 0, m, k), k, options)?;
1804 let ne = result.num_eliminated;
1805
1806 #[cfg(feature = "diagnostic")]
1807 let kernel_time = kernel_start.elapsed();
1808
1809 // 7. Build contribution block
1810 #[cfg(feature = "diagnostic")]
1811 let mut contrib_gemm_time = std::time::Duration::ZERO;
1812 #[cfg(feature = "diagnostic")]
1813 let mut extract_contrib_time = std::time::Duration::ZERO;
1814 let contribution = if sn.parent.is_some() && ne < m {
1815 // Ensure contrib_buffer is large enough for extract_contribution_rect
1816 // which may need to assemble delayed + NFS into a single buffer
1817 if contrib_buffer.nrows() < m || contrib_buffer.ncols() < m {
1818 // Grow if delayed columns expanded the need
1819 let new_size = m.max(contrib_buffer.nrows());
1820 let mut new_buf = Mat::zeros(new_size, new_size);
1821 // Preserve NFS×NFS data already assembled
1822 if nfs > 0 {
1823 for j in 0..nfs {
1824 let col_len = nfs - j;
1825 let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
1826 new_buf.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
1827 }
1828 }
1829 contrib_buffer = new_buf;
1830 }
1831
1832 // Apply contribution GEMM: updates NFS×NFS with -L21_NFS * D * L21_NFS^T
1833 #[cfg(feature = "diagnostic")]
1834 let gemm_start = std::time::Instant::now();
1835
1836 if nfs > 0 && ne > 0 {
1837 compute_contribution_gemm_rect(
1838 &l_storage,
1839 k,
1840 ne,
1841 m,
1842 &result.d,
1843 &mut contrib_buffer,
1844 &mut ld_workspace,
1845 Par::Seq,
1846 );
1847 }
1848
1849 #[cfg(feature = "diagnostic")]
1850 {
1851 contrib_gemm_time = gemm_start.elapsed();
1852 }
1853
1854 #[cfg(feature = "diagnostic")]
1855 let extract_contrib_start = std::time::Instant::now();
1856
1857 let cb = extract_contribution_rect(
1858 &l_storage,
1859 m,
1860 k,
1861 &frontal_row_indices,
1862 &result,
1863 std::mem::replace(&mut contrib_buffer, Mat::new()),
1864 );
1865
1866 #[cfg(feature = "diagnostic")]
1867 {
1868 extract_contrib_time = extract_contrib_start.elapsed();
1869 }
1870
1871 Some(cb)
1872 } else {
1873 None
1874 };
1875
1876 // 8. Extract FrontFactors from rectangular L storage
1877 #[cfg(feature = "diagnostic")]
1878 let extract_factors_start = std::time::Instant::now();
1879
1880 let ff = extract_front_factors_rect(&l_storage, m, k, &frontal_row_indices, &result);
1881
1882 #[cfg(feature = "diagnostic")]
1883 let extract_factors_time = extract_factors_start.elapsed();
1884
1885 let stats = PerSupernodeStats {
1886 snode_id: node_id,
1887 front_size: m,
1888 num_fully_summed: k,
1889 num_eliminated: ne,
1890 num_delayed: k - ne,
1891 num_1x1: result.stats.num_1x1,
1892 num_2x2: result.stats.num_2x2,
1893 max_l_entry: result.stats.max_l_entry,
1894 #[cfg(feature = "diagnostic")]
1895 assembly_time,
1896 #[cfg(feature = "diagnostic")]
1897 kernel_time,
1898 #[cfg(feature = "diagnostic")]
1899 extraction_time: extract_factors_time + extract_contrib_time,
1900 #[cfg(feature = "diagnostic")]
1901 zero_time,
1902 #[cfg(feature = "diagnostic")]
1903 g2l_time,
1904 #[cfg(feature = "diagnostic")]
1905 scatter_time,
1906 #[cfg(feature = "diagnostic")]
1907 extend_add_time,
1908 #[cfg(feature = "diagnostic")]
1909 extract_factors_time,
1910 #[cfg(feature = "diagnostic")]
1911 extract_contrib_time,
1912 #[cfg(feature = "diagnostic")]
1913 contrib_gemm_time,
1914 #[cfg(feature = "diagnostic")]
1915 g2l_reset_time: std::time::Duration::ZERO, // filled below
1916 };
1917
1918 // 9. Reset g2l
1919 #[cfg(feature = "diagnostic")]
1920 let g2l_reset_start = std::time::Instant::now();
1921
1922 for &global in &frontal_row_indices[..m] {
1923 global_to_local[global] = NOT_IN_FRONT;
1924 }
1925
1926 #[cfg(feature = "diagnostic")]
1927 let stats = {
1928 let mut s = stats;
1929 s.g2l_reset_time = g2l_reset_start.elapsed();
1930 s
1931 };
1932
1933 contributions[node_id] = contribution;
1934 results.push((node_id, ff, stats));
1935 }
1936
1937 Ok(results)
1938}
1939
/// Iterative level-set factorization of the assembly tree.
///
/// Processes supernodes bottom-up: first all leaves (in parallel if enabled),
/// then all nodes whose children are complete, and so on up to the roots.
/// This avoids deep recursion that overflows rayon's 2MB worker stacks on
/// matrices with deep elimination trees (e.g., c-71 with 76K supernodes).
///
/// The parallelism is "level-set" style: at each wave, all ready nodes are
/// independent (their children are already factored) and can be processed
/// concurrently. This provides the same tree-level parallelism as the
/// recursive approach but without stack depth concerns.
///
/// Before the level-set loop, small-leaf subtrees classified by
/// `classify_small_leaf_subtrees` are factored via a dedicated fast path
/// (`factor_small_leaf_subtree`); their nodes are excluded from the waves.
///
/// # Errors
///
/// Propagates any `SparseError` returned by `factor_single_supernode` or
/// `factor_small_leaf_subtree`.
///
/// # References
///
/// - Liu (1992), Section 5: level-set scheduling for multifrontal methods
/// - Duff & Reid (1983): assembly tree parallelism
#[allow(clippy::too_many_arguments)]
fn factor_tree_levelset(
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    n: usize,
    assembly_maps: &AssemblyMaps,
    small_leaf_subtrees: &[SmallLeafSubtree],
) -> Result<Vec<(usize, FrontFactors, PerSupernodeStats)>, SparseError> {
    let n_supernodes = supernodes.len();

    // Track contributions from completed children (slot c holds the Schur
    // complement produced by supernode c until its parent consumes it).
    let mut contributions: Vec<Option<ContributionBlock>> =
        (0..n_supernodes).map(|_| None).collect();

    // Track how many children remain unprocessed for each supernode; a node
    // becomes "ready" when its counter reaches zero.
    let mut remaining_children: Vec<usize> = children.iter().map(|c| c.len()).collect();

    // Collect all results
    let mut all_results: Vec<(usize, FrontFactors, PerSupernodeStats)> =
        Vec::with_capacity(n_supernodes);

    // --- Small-leaf subtree pre-pass (rectangular workspace fast path) ---
    // Factor all classified small-leaf subtrees before the main level-set loop.
    // Each subtree uses a rectangular m×k L storage kernel instead of the general
    // m×m frontal matrix, eliminating the large square allocation, frontal zeroing,
    // and extract_front_factors copy for every node.
    for subtree in small_leaf_subtrees {
        let subtree_results = factor_small_leaf_subtree(
            subtree,
            supernodes,
            children,
            matrix,
            perm_fwd,
            perm_inv,
            options,
            scaling,
            n,
            &mut contributions,
            assembly_maps,
        )?;
        all_results.extend(subtree_results);

        // Decrement remaining_children for the subtree root's parent so the
        // parent can become ready once its non-subtree children complete too.
        if let Some(parent) = subtree.parent_of_root {
            remaining_children[parent] -= 1;
        }
    }

    let is_parallel = !matches!(options.par, Par::Seq);
    let max_front = estimate_max_front_size(supernodes);

    // Sequential mode: pre-allocate a single workspace at max_front to avoid
    // repeated reallocations as the postorder traversal encounters progressively
    // larger fronts. No pool needed — only one workspace exists.
    //
    // Parallel mode: use a pool of grow-on-demand workspaces. Pre-allocating
    // a separate shared_workspace alongside the pool would waste max_front²×8
    // bytes (685 MB for H2O) during parallel waves when only the pool is used.
    // Single-node batches (sequential fallback near the tree root) pop from
    // the pool instead.
    //
    // The pool is dropped when factor_tree_levelset returns, freeing all
    // workspace memory (unlike thread_local! which leaks for the lifetime
    // of the rayon thread pool).
    let mut seq_workspace = if !is_parallel {
        FactorizationWorkspace::new(max_front, n)
    } else {
        FactorizationWorkspace::empty()
    };
    let workspace_pool: std::sync::Mutex<Vec<FactorizationWorkspace>> =
        std::sync::Mutex::new(Vec::new());

    // Initial ready set: all leaf supernodes (no children) that are NOT in small-leaf subtrees
    let mut ready: Vec<usize> = (0..n_supernodes)
        .filter(|&s| remaining_children[s] == 0 && !supernodes[s].in_small_leaf)
        .collect();

    while !ready.is_empty() {
        // Collect child contributions for each ready node. Children of different
        // ready nodes are disjoint (tree structure), so each contribution is
        // taken exactly once.
        let mut batch_inputs: Vec<(usize, Vec<Option<ContributionBlock>>)> =
            Vec::with_capacity(ready.len());
        for &s in &ready {
            let child_contribs: Vec<Option<ContributionBlock>> = children[s]
                .iter()
                .map(|&c| contributions[c].take())
                .collect();
            batch_inputs.push((s, child_contribs));
        }

        // Factor all ready nodes (parallel if enabled and batch > 1)
        let batch_results: Vec<SupernodeResult> = if !is_parallel {
            // Pure sequential: use the pre-allocated workspace (no pool
            // overhead, no reallocations during postorder traversal).
            batch_inputs
                .into_iter()
                .map(|(s, contribs)| {
                    factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut seq_workspace,
                        assembly_maps,
                    )
                })
                .collect::<Result<_, _>>()?
        } else if batch_inputs.len() == 1 {
            // Parallel mode, single-node batch (near tree root): pop a
            // workspace from the pool. Drop remaining pool workspaces to
            // free memory before processing large supernodes.
            let mut pool = workspace_pool.lock().unwrap();
            let mut ws = pool.pop().unwrap_or_else(FactorizationWorkspace::empty);
            pool.clear();
            drop(pool);
            ws.ensure_g2l(n);
            let results = batch_inputs
                .into_iter()
                .map(|(s, contribs)| {
                    factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut ws,
                        assembly_maps,
                    )
                })
                .collect::<Result<_, _>>();
            // Return the workspace before propagating any error so it can
            // be reused (or dropped with the pool) either way.
            workspace_pool.lock().unwrap().push(ws);
            results?
        } else {
            // Parallel: take workspaces from the caller-owned pool so each
            // rayon worker gets exclusive ownership during factorization.
            // The pool is reused across level-set waves and dropped when
            // factor_tree_levelset returns (unlike thread_local! which leaks
            // for the lifetime of the rayon thread pool).
            //
            // If rayon work-steals another par_iter task onto the same thread
            // (due to nested Par::rayon BLAS), that task takes a separate
            // workspace from the pool — no aliasing.
            use rayon::iter::{IntoParallelIterator, ParallelIterator};
            batch_inputs
                .into_par_iter()
                .map(|(s, contribs)| {
                    // Poisoned mutex means a worker panicked — already fatal.
                    let mut ws = workspace_pool
                        .lock()
                        .unwrap()
                        .pop()
                        .unwrap_or_else(FactorizationWorkspace::empty);
                    // Only ensure global_to_local is sized; frontal_data
                    // grows on demand in factor_single_supernode. Avoids
                    // eagerly allocating max_front² per thread (OOM on
                    // large-front matrices like H2O: 9258² × 8 = 685 MB).
                    ws.ensure_g2l(n);
                    let result = factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut ws,
                        assembly_maps,
                    );
                    // Poisoned mutex means a worker panicked — already fatal.
                    workspace_pool.lock().unwrap().push(ws);
                    result
                })
                .collect::<Result<_, _>>()?
        };

        // Store results and determine next ready set. rayon's indexed
        // collect preserves input order, so batch_results aligns 1:1 with
        // `ready` and the zip pairs each result with its supernode.
        let mut next_ready = Vec::new();
        for (result, &s) in batch_results.into_iter().zip(ready.iter()) {
            contributions[s] = result.contribution;
            all_results.push((s, result.ff, result.stats));

            if let Some(parent) = supernodes[s].parent {
                remaining_children[parent] -= 1;
                if remaining_children[parent] == 0 {
                    next_ready.push(parent);
                }
            }
        }

        ready = next_ready;
    }

    Ok(all_results)
}
2164
2165// ---------------------------------------------------------------------------
2166// Internal functions
2167// ---------------------------------------------------------------------------
2168
2169/// Build unified supernode descriptors from symbolic analysis.
2170///
2171/// For supernodal decompositions, maps directly from faer's supernode structure.
2172/// For simplicial decompositions, each column becomes a trivial 1-column supernode
2173/// with pattern derived from the elimination tree and L structure.
2174///
2175/// # Postcondition
2176///
2177/// `info[s].parent.map_or(true, |p| p > s)` for all `s` (postorder).
2178#[allow(clippy::single_range_in_vec_init)]
2179fn build_supernode_info(symbolic: &AptpSymbolic) -> Vec<SupernodeInfo> {
2180 match symbolic.raw() {
2181 SymbolicCholeskyRaw::Supernodal(sn) => {
2182 let ns = sn.n_supernodes();
2183 let begin = sn.supernode_begin();
2184 let end = sn.supernode_end();
2185 (0..ns)
2186 .map(|s| {
2187 let pattern = sn.supernode(s).pattern().to_vec();
2188 let parent = symbolic.supernode_parent(s);
2189 SupernodeInfo {
2190 col_begin: begin[s],
2191 col_end: end[s],
2192 pattern,
2193 parent,
2194 owned_ranges: vec![begin[s]..end[s]],
2195 in_small_leaf: false,
2196 }
2197 })
2198 .collect()
2199 }
2200 SymbolicCholeskyRaw::Simplicial(simp) => {
2201 let n = symbolic.nrows();
2202 let etree = symbolic.etree();
2203 // Get L structure from simplicial symbolic factorization
2204 let l_symbolic = simp.factor();
2205 let col_ptr = l_symbolic.col_ptr();
2206 let row_idx = l_symbolic.row_idx();
2207 (0..n)
2208 .map(|j| {
2209 // Pattern = row indices in column j of L that are > j
2210 // (off-diagonal entries below the diagonal)
2211 let start = col_ptr[j];
2212 let end = col_ptr[j + 1];
2213 let pattern: Vec<usize> = row_idx[start..end]
2214 .iter()
2215 .copied()
2216 .filter(|&r| r > j)
2217 .collect();
2218 // Parent from etree
2219 let parent = if etree[j] < 0 {
2220 None
2221 } else {
2222 Some(etree[j] as usize)
2223 };
2224 SupernodeInfo {
2225 col_begin: j,
2226 col_end: j + 1,
2227 pattern,
2228 parent,
2229 owned_ranges: vec![j..j + 1],
2230 in_small_leaf: false,
2231 }
2232 })
2233 .collect()
2234 }
2235 }
2236}
2237
2238/// Build a map from each supernode to its children in the assembly tree.
2239fn build_children_map(infos: &[SupernodeInfo]) -> Vec<Vec<usize>> {
2240 let n = infos.len();
2241 let mut children = vec![Vec::new(); n];
2242 for (s, info) in infos.iter().enumerate() {
2243 if let Some(p) = info.parent {
2244 children[p].push(s);
2245 }
2246 }
2247 children
2248}
2249
2250/// Classify post-amalgamation supernodes into small-leaf subtrees for the fast path.
2251///
2252/// Performs a single O(n_supernodes) bottom-up pass to identify contiguous leaf
2253/// subtrees where every supernode has `front_size < threshold`. Sets `in_small_leaf`
2254/// on qualifying supernodes and returns a `Vec<SmallLeafSubtree>` describing each
2255/// identified subtree (with at least 2 nodes).
2256///
2257/// # Arguments
2258///
2259/// - `supernodes`: Post-amalgamation supernode descriptors (mutated: `in_small_leaf` set).
2260/// - `children_map`: Children map (`children[s]` = list of child supernode indices).
2261/// - `threshold`: Front-size threshold. Supernodes with `front_size >= threshold` are
2262/// excluded. If `threshold == 0`, returns empty (fast path disabled).
2263///
2264/// # Algorithm
2265///
2266/// 1. Compute `front_size[s] = sum(owned_ranges[s].len()) + pattern[s].len()`.
2267/// 2. Bottom-up (postorder): mark `in_small_leaf[s] = true` iff `front_size[s] < threshold`
2268/// AND (s is a leaf OR all children have `in_small_leaf = true`).
2269/// 3. Identify subtree roots: `in_small_leaf[s] = true` and parent has `in_small_leaf = false`
2270/// (or no parent).
2271/// 4. Collect descendant nodes in postorder via iterative stack-based traversal.
2272/// 5. Filter subtrees with < 2 nodes.
2273///
2274/// # References
2275///
2276/// - SPRAL `SymbolicSubtree.hxx:57-84` (BSD-3): bottom-up classification
2277/// - Duff, Hogg & Lopez (2020), Algorithm 6.1: find_subtree_partition
2278fn classify_small_leaf_subtrees(
2279 supernodes: &mut [SupernodeInfo],
2280 children_map: &[Vec<usize>],
2281 threshold: usize,
2282) -> Vec<SmallLeafSubtree> {
2283 if threshold == 0 {
2284 return Vec::new();
2285 }
2286
2287 let n_supernodes = supernodes.len();
2288
2289 // Step 1-2: Bottom-up classification (supernodes are in postorder: s=0 is leftmost leaf)
2290 for s in 0..n_supernodes {
2291 let owned_cols: usize = supernodes[s].owned_ranges.iter().map(|r| r.len()).sum();
2292 let front_size = owned_cols + supernodes[s].pattern.len();
2293
2294 if front_size >= threshold {
2295 supernodes[s].in_small_leaf = false;
2296 continue;
2297 }
2298
2299 if children_map[s].is_empty() {
2300 // Leaf node with small front
2301 supernodes[s].in_small_leaf = true;
2302 } else if children_map[s].iter().all(|&c| supernodes[c].in_small_leaf) {
2303 // Interior node: all children are in_small_leaf and this node is small
2304 supernodes[s].in_small_leaf = true;
2305 } else {
2306 supernodes[s].in_small_leaf = false;
2307 }
2308 }
2309
2310 // Step 3: Identify subtree roots
2311 let mut subtrees = Vec::new();
2312 for s in 0..n_supernodes {
2313 if !supernodes[s].in_small_leaf {
2314 continue;
2315 }
2316 let is_root = match supernodes[s].parent {
2317 None => true,
2318 Some(p) => !supernodes[p].in_small_leaf,
2319 };
2320 if !is_root {
2321 continue;
2322 }
2323
2324 // Step 4: Collect descendant nodes in postorder via iterative DFS.
2325 // Two-stack approach: first stack drives DFS, second collects in reverse postorder.
2326 let mut nodes = Vec::new();
2327 let mut stack = vec![s];
2328 let mut visit_stack = Vec::new();
2329 while let Some(node) = stack.pop() {
2330 visit_stack.push(node);
2331 // Push children (they'll be processed before parent due to stack LIFO)
2332 for &c in &children_map[node] {
2333 if supernodes[c].in_small_leaf {
2334 stack.push(c);
2335 }
2336 }
2337 }
2338 // visit_stack is in reverse postorder — reverse to get postorder
2339 while let Some(node) = visit_stack.pop() {
2340 nodes.push(node);
2341 }
2342
2343 // Step 5: Filter subtrees with < 2 nodes
2344 if nodes.len() < 2 {
2345 // Single-node "subtree" — not worth the fast path overhead
2346 supernodes[s].in_small_leaf = false;
2347 continue;
2348 }
2349
2350 // Compute max_front_size across all nodes in the subtree
2351 let max_front_size = nodes
2352 .iter()
2353 .map(|&node| {
2354 let owned: usize = supernodes[node].owned_ranges.iter().map(|r| r.len()).sum();
2355 owned + supernodes[node].pattern.len()
2356 })
2357 .max()
2358 .unwrap_or(0);
2359
2360 subtrees.push(SmallLeafSubtree {
2361 root: s,
2362 nodes,
2363 max_front_size,
2364 parent_of_root: supernodes[s].parent,
2365 });
2366 }
2367
2368 subtrees
2369}
2370
2371/// Scatter original sparse matrix entries into a frontal matrix from multiple owned ranges.
2372///
2373/// After amalgamation, a supernode may own multiple non-contiguous column ranges.
2374/// This function iterates all owned columns and correctly handles upper-triangle
2375/// deduplication across ranges.
2376fn scatter_original_entries_multi(
2377 frontal: &mut FrontalMatrix<'_>,
2378 matrix: &SparseColMat<usize, f64>,
2379 perm_fwd: &[usize],
2380 perm_inv: &[usize],
2381 global_to_local: &[usize],
2382 owned_ranges: &[std::ops::Range<usize>],
2383 scaling: Option<&[f64]>,
2384) {
2385 let symbolic = matrix.symbolic();
2386 let col_ptrs = symbolic.col_ptr();
2387 let row_indices_csc = symbolic.row_idx();
2388 let values = matrix.val();
2389 let total_owned: usize = owned_ranges.iter().map(|r| r.len()).sum();
2390 let k = frontal.num_fully_summed;
2391
2392 // Build a fast check for "is this permuted column an owned supernode column?"
2393 // We reuse global_to_local: if g2l[pj] < total_owned, it's an owned column
2394 // (since frontal_rows puts owned columns first). This works because owned
2395 // columns always occupy the first `total_owned` positions in frontal_rows.
2396
2397 for range in owned_ranges {
2398 for pj in range.clone() {
2399 let orig_col = perm_fwd[pj];
2400 let local_col = global_to_local[pj];
2401
2402 let start = col_ptrs[orig_col];
2403 let end = col_ptrs[orig_col + 1];
2404 for idx in start..end {
2405 let orig_row = row_indices_csc[idx];
2406 // Skip upper-triangle if both are owned supernode cols
2407 if orig_row < orig_col {
2408 let perm_row = perm_inv[orig_row];
2409 let local_peer = global_to_local[perm_row];
2410 if local_peer != NOT_IN_FRONT && local_peer < total_owned {
2411 continue; // both owned cols — avoid double-count
2412 }
2413 }
2414 let global_row = perm_inv[orig_row];
2415 let local_row = global_to_local[global_row];
2416 if local_row == NOT_IN_FRONT {
2417 continue;
2418 }
2419 // Skip delayed columns from children (indices total_owned..k)
2420 if local_row >= total_owned && local_row < k {
2421 continue;
2422 }
2423 let mut val = values[idx];
2424 if let Some(s) = scaling {
2425 val *= s[perm_inv[orig_row]] * s[perm_inv[orig_col]];
2426 }
2427 if local_row >= local_col {
2428 frontal.data[(local_row, local_col)] += val;
2429 } else {
2430 frontal.data[(local_col, local_row)] += val;
2431 }
2432 }
2433 }
2434 }
2435}
2436
2437/// Perform extend-add: merge a child's contribution block into the parent frontal matrix.
2438///
2439/// For each lower-triangle entry in the child contribution, maps global indices
2440/// to parent local positions and adds the value.
2441///
2442/// # References
2443///
2444/// - Liu (1992), Section 4.2: extend-add operator
2445pub(crate) fn extend_add(
2446 parent: &mut FrontalMatrix<'_>,
2447 child: ContributionBlock,
2448 global_to_local: &[usize],
2449) -> Mat<f64> {
2450 let cb_size = child.row_indices.len();
2451 // Column-oriented iteration: for each child column j, scatter the
2452 // lower-triangle entries (rows j..cb_size) into the parent. Source
2453 // reads are contiguous in column-major layout. We keep the zero-check
2454 // and symmetry swap since delayed columns can break monotonicity.
2455 for j in 0..cb_size {
2456 let gj = child.row_indices[j];
2457 let lj = global_to_local[gj];
2458 debug_assert!(
2459 lj != NOT_IN_FRONT,
2460 "extend_add: child col {} not in parent",
2461 gj
2462 );
2463 let child_col = &child.data.col_as_slice(j)[j..cb_size];
2464 let row_indices = &child.row_indices[j..cb_size];
2465
2466 for (&val, &gi) in child_col.iter().zip(row_indices) {
2467 if val != 0.0 {
2468 let li = global_to_local[gi];
2469 debug_assert!(
2470 li != NOT_IN_FRONT,
2471 "extend_add: child row {} not in parent",
2472 gi
2473 );
2474 // Map to parent: ensure lower triangle (li >= lj)
2475 if li >= lj {
2476 parent.data[(li, lj)] += val;
2477 } else {
2478 parent.data[(lj, li)] += val;
2479 }
2480 }
2481 }
2482 }
2483 // Return the consumed data buffer for recycling into workspace.contrib_buffer
2484 child.data
2485}
2486
2487/// Extend-add using a precomputed child→parent row mapping.
2488///
2489/// Used when the child had zero delayed columns, so the contribution block
2490/// rows correspond exactly to the child's symbolic pattern and the precomputed
2491/// `ea_row_map` provides direct local index lookup without `global_to_local`.
2492///
2493/// Uses column-oriented processing: for each child
2494/// column `j`, scatters the lower-triangle entries `(j..cb_size, j)` into
2495/// the parent. Source reads are contiguous in column-major layout.
2496///
2497/// # References
2498///
2499/// - SPRAL `assemble.hxx:27-38` — `asm_col` column-oriented scatter (BSD-3)
2500///
2501/// # Arguments
2502///
2503/// - `parent`: Parent frontal matrix to accumulate into (lower triangle).
2504/// - `child`: Child contribution block.
2505/// - `ea_row_map`: Precomputed mapping from child contribution row index to
2506/// parent frontal local row index. Length equals the child's pattern size.
2507fn extend_add_mapped(
2508 parent: &mut FrontalMatrix<'_>,
2509 child: ContributionBlock,
2510 ea_row_map: &[u32],
2511) -> Mat<f64> {
2512 let cb_size = child.row_indices.len();
2513 debug_assert!(
2514 ea_row_map.len() >= cb_size,
2515 "extend_add_mapped: ea_row_map len {} < cb_size {}",
2516 ea_row_map.len(),
2517 cb_size
2518 );
2519
2520 // Column-oriented scatter.
2521 // For each child column j, scatter lower-triangle entries (rows j..cb_size)
2522 // into the parent. Source reads via col_as_slice are contiguous in column-
2523 // major layout; writes target a single parent column when li >= lj (common
2524 // case), with a symmetry swap for the rare li < lj case.
2525 for j in 0..cb_size {
2526 let lj = ea_row_map[j] as usize;
2527 let child_col = child.data.col_as_slice(j);
2528 let row_map = &ea_row_map[j..cb_size];
2529 let src = &child_col[j..cb_size];
2530
2531 // 4× unrolled inner loop for ILP
2532 let n = cb_size - j;
2533 let n4 = n / 4 * 4;
2534 let mut k = 0;
2535 while k < n4 {
2536 ea_scatter_one(&mut parent.data, row_map[k] as usize, lj, src[k]);
2537 ea_scatter_one(&mut parent.data, row_map[k + 1] as usize, lj, src[k + 1]);
2538 ea_scatter_one(&mut parent.data, row_map[k + 2] as usize, lj, src[k + 2]);
2539 ea_scatter_one(&mut parent.data, row_map[k + 3] as usize, lj, src[k + 3]);
2540 k += 4;
2541 }
2542 while k < n {
2543 ea_scatter_one(&mut parent.data, row_map[k] as usize, lj, src[k]);
2544 k += 1;
2545 }
2546 }
2547 // Return the consumed data buffer for recycling into workspace.contrib_buffer
2548 child.data
2549}
2550
2551/// Scatter a single extend-add value into the parent's lower triangle.
2552#[inline(always)]
2553fn ea_scatter_one(parent: &mut MatMut<'_, f64>, li: usize, lj: usize, val: f64) {
2554 if li >= lj {
2555 parent[(li, lj)] += val;
2556 } else {
2557 parent[(lj, li)] += val;
2558 }
2559}
2560
2561/// Extract per-supernode factors from a factored frontal matrix.
2562///
2563/// Copies L11, D11, L21, and permutation information from the in-place
2564/// factored frontal matrix into a persistent [`FrontFactors`] struct.
2565///
2566/// Uses per-column `col_as_slice` + `copy_from_slice` for L21 extraction and
2567/// per-column slice copies for L11 1x1-pivot columns, replacing element-by-element
2568/// indexing with bulk memory operations.
2569///
2570/// # Arguments
2571///
2572/// - `frontal_data`: The factored frontal matrix data (owned `Mat<f64>`).
2573/// Columns must be contiguous (guaranteed by `Mat` layout). Only the first
2574/// `m` rows and columns are used.
2575/// - `m`: Front size (number of rows/columns used in `frontal_data`).
2576/// - `k`: Number of fully-summed rows/columns.
2577/// - `frontal_row_indices`: Global permuted column indices for each local row (length >= m).
2578/// - `result`: APTP factorization result (pivot types, permutation, etc.).
2579pub(crate) fn extract_front_factors(
2580 frontal_data: &Mat<f64>,
2581 m: usize,
2582 k: usize,
2583 frontal_row_indices: &[usize],
2584 result: &AptpFactorResult,
2585) -> FrontFactors {
2586 let ne = result.num_eliminated;
2587
2588 // Extract L11 (ne × ne) — unit lower triangular part
2589 // For 2x2 pivots, a[(col+1, col)] is the D off-diagonal, NOT an L entry.
2590 // Uses per-column slice copies for 1x1 pivot columns where the L entries
2591 // form a contiguous segment.
2592 let l11 = if ne > 0 {
2593 let mut l = Mat::zeros(ne, ne);
2594 let mut col = 0;
2595 while col < ne {
2596 l[(col, col)] = 1.0; // unit diagonal
2597 match result.d.pivot_type(col) {
2598 PivotType::OneByOne => {
2599 // L entries: rows (col+1)..ne — contiguous in both source and dest
2600 let n_entries = ne - (col + 1);
2601 if n_entries > 0 {
2602 let src = &frontal_data.col_as_slice(col)[col + 1..ne];
2603 l.col_as_slice_mut(col)[col + 1..ne].copy_from_slice(src);
2604 }
2605 col += 1;
2606 }
2607 PivotType::TwoByTwo { partner } if partner > col => {
2608 l[(col + 1, col + 1)] = 1.0; // unit diagonal for partner
2609 // a[(col+1, col)] is D off-diagonal, skip it
2610 // L entries for both columns start at row col+2
2611 let n_entries = ne - (col + 2);
2612 if n_entries > 0 {
2613 let src0 = &frontal_data.col_as_slice(col)[col + 2..ne];
2614 l.col_as_slice_mut(col)[col + 2..ne].copy_from_slice(src0);
2615 let src1 = &frontal_data.col_as_slice(col + 1)[col + 2..ne];
2616 l.col_as_slice_mut(col + 1)[col + 2..ne].copy_from_slice(src1);
2617 }
2618 col += 2;
2619 }
2620 PivotType::TwoByTwo { .. } => {
2621 // Second column of a 2x2 pair — already handled above
2622 col += 1;
2623 }
2624 PivotType::Delayed => {
2625 // Should not appear for columns 0..ne (they were eliminated)
2626 unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
2627 }
2628 }
2629 }
2630 l
2631 } else {
2632 Mat::zeros(0, 0)
2633 };
2634
2635 // Build truncated D11 (first ne pivots from result.d)
2636 let mut d11 = MixedDiagonal::new(ne);
2637 let mut col = 0;
2638 while col < ne {
2639 match result.d.pivot_type(col) {
2640 PivotType::OneByOne => {
2641 d11.set_1x1(col, result.d.diagonal_1x1(col));
2642 col += 1;
2643 }
2644 PivotType::TwoByTwo { partner: _ } => {
2645 let block = result.d.diagonal_2x2(col);
2646 d11.set_2x2(Block2x2 {
2647 first_col: col,
2648 a: block.a,
2649 b: block.b,
2650 c: block.c,
2651 });
2652 col += 2;
2653 }
2654 PivotType::Delayed => {
2655 // Should not happen for columns 0..ne (they were eliminated)
2656 unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
2657 }
2658 }
2659 }
2660
2661 // Extract L21 (r × ne) where r = m - ne (includes delayed rows ne..k AND
2662 // non-fully-summed rows k..m). The factored frontal has valid L entries at rows
2663 // ne..m for columns 0..ne: rows ne..k are delayed fully-summed rows that received
2664 // TRSM updates before being delayed; rows k..m are non-fully-summed rows.
2665 //
2666 // Uses per-column copy_from_slice for bulk extraction.
2667 //
2668 // NOTE: extract_contribution also includes delayed rows (see lines below).
2669 // These two functions MUST be consistent in their row treatment.
2670 let r = m - ne;
2671 let l21 = if ne > 0 && r > 0 {
2672 let mut l = Mat::zeros(r, ne);
2673 for j in 0..ne {
2674 let src = &frontal_data.col_as_slice(j)[ne..m];
2675 l.col_as_slice_mut(j)[..r].copy_from_slice(src);
2676 }
2677 l
2678 } else {
2679 Mat::zeros(r, ne)
2680 };
2681
2682 // Local permutation (maps factored position to original front-local column)
2683 let local_perm = result.perm[..k].to_vec();
2684
2685 // Column indices: the global permuted indices of the eliminated columns
2686 // local_perm[0..ne] gives the front-local columns that were eliminated,
2687 // mapped through frontal_row_indices to get global permuted indices
2688 let col_indices: Vec<usize> = local_perm[..ne]
2689 .iter()
2690 .map(|&lp| frontal_row_indices[lp])
2691 .collect();
2692
2693 // Row indices: global permuted indices for L21 rows.
2694 // First num_delayed entries: delayed fully-summed columns (ne..k), mapped through
2695 // result.perm and frontal_row_indices to get global permuted indices.
2696 // Remaining entries: non-fully-summed rows (k..m).
2697 // This matches extract_contribution's row_indices construction.
2698 let mut row_indices = Vec::with_capacity(m - ne);
2699 for &lp in &result.perm[ne..k] {
2700 row_indices.push(frontal_row_indices[lp]);
2701 }
2702 row_indices.extend_from_slice(&frontal_row_indices[k..m]);
2703
2704 FrontFactors {
2705 l11,
2706 d11,
2707 l21,
2708 local_perm,
2709 num_eliminated: ne,
2710 col_indices,
2711 row_indices,
2712 }
2713}
2714
2715/// Extract the contribution block from a factored frontal matrix.
2716///
2717/// The NFS×NFS Schur complement data is already in `contrib_buffer` (written
2718/// by [`compute_contribution_gemm`]). This function adds any delayed-column
2719/// data from the workspace and builds the index metadata.
2720///
2721/// # Layout within returned `ContributionBlock.data`
2722///
2723/// ```text
2724/// [0..num_delayed, 0..num_delayed] → delayed × delayed (from workspace)
2725/// [num_delayed..size, 0..num_delayed] → NFS × delayed cross-terms (from workspace)
2726/// [num_delayed..size, num_delayed..size] → NFS × NFS Schur complement (from contrib_buffer)
2727/// ```
2728///
2729/// # Arguments
2730///
2731/// - `frontal_data`: The factored frontal matrix data (owned `Mat<f64>`).
2732/// - `m`: Front size (number of rows/columns used).
2733/// - `k`: Number of fully-summed rows/columns.
2734/// - `frontal_row_indices`: Global permuted column indices for each local row.
2735/// - `result`: APTP factorization result.
2736/// - `contrib_buffer`: Pre-allocated buffer already containing NFS×NFS data
2737/// in `[0..nfs, 0..nfs]` (from deferred GEMM). Moved in; becomes the
2738/// `ContributionBlock.data`.
2739pub(crate) fn extract_contribution(
2740 frontal_data: &Mat<f64>,
2741 m: usize,
2742 k: usize,
2743 frontal_row_indices: &[usize],
2744 result: &AptpFactorResult,
2745 mut contrib_buffer: Mat<f64>,
2746) -> ContributionBlock {
2747 let ne = result.num_eliminated;
2748 let size = m - ne;
2749 let num_delayed = k - ne;
2750 let nfs = m - k; // = size - num_delayed
2751
2752 // The NFS×NFS region (contrib_buffer[0..nfs, 0..nfs]) is already filled by
2753 // compute_contribution_gemm. We need to handle delayed columns (if any) by
2754 // shifting the NFS×NFS data to make room for the delayed portion.
2755 if num_delayed > 0 {
2756 // Delayed columns exist: we need to assemble the full (size × size) contribution.
2757 // Layout: [delayed × delayed | NFS × delayed | NFS × NFS]
2758 //
2759 // Strategy: allocate a new buffer of the correct size, copy delayed regions
2760 // from workspace, and NFS×NFS from contrib_buffer.
2761 let mut data = Mat::zeros(size, size);
2762
2763 // Copy delayed × delayed from workspace: frontal_data[ne..k, ne..k]
2764 // These positions are in the APTP-permuted order. The delayed columns
2765 // are at positions ne..k in the factored frontal matrix.
2766 for j in 0..num_delayed {
2767 let col_len = num_delayed - j;
2768 let src = &frontal_data.col_as_slice(ne + j)[ne + j..ne + j + col_len];
2769 data.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
2770 }
2771
2772 // Copy NFS × delayed cross-terms from workspace: frontal_data[k..m, ne..k]
2773 for j in 0..num_delayed {
2774 let src = &frontal_data.col_as_slice(ne + j)[k..m];
2775 data.col_as_slice_mut(j)[num_delayed..size].copy_from_slice(src);
2776 }
2777
2778 // Copy NFS × NFS from contrib_buffer[0..nfs, 0..nfs] into data[num_delayed..size, num_delayed..size]
2779 for j in 0..nfs {
2780 let col_len = nfs - j;
2781 let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
2782 data.col_as_slice_mut(num_delayed + j)[num_delayed + j..size].copy_from_slice(src);
2783 }
2784
2785 contrib_buffer = data;
2786 }
2787 // else: num_delayed == 0 → contrib_buffer already has the full contribution
2788 // (NFS×NFS = entire contribution), no copy needed. True zero-copy path.
2789
2790 // Build row indices:
2791 // - First num_delayed entries: delayed columns mapped through result.perm and frontal_row_indices
2792 // - Remaining entries: non-fully-summed rows from frontal_row_indices[k..m]
2793 let mut row_indices = Vec::with_capacity(size);
2794
2795 // Delayed columns: these are at positions ne..k in the APTP-permuted frontal matrix
2796 // result.perm[ne..k] gives the original front-local indices of delayed columns
2797 for &lp in &result.perm[ne..k] {
2798 row_indices.push(frontal_row_indices[lp]);
2799 }
2800
2801 // Non-fully-summed rows
2802 row_indices.extend_from_slice(&frontal_row_indices[k..m]);
2803
2804 ContributionBlock {
2805 data: contrib_buffer,
2806 row_indices,
2807 num_delayed,
2808 }
2809}
2810
2811// ---------------------------------------------------------------------------
2812// Tests
2813// ---------------------------------------------------------------------------
2814
2815#[cfg(test)]
2816mod tests {
2817 use super::*;
2818 use crate::symmetric::factor::AptpOptions;
2819
2820 /// Helper: build frontal matrix data from a dense lower-triangle matrix.
2821 ///
2822 /// Returns owned `(data, row_indices)` — construct a `FrontalMatrix` view from these.
2823 /// `k` = number of fully-summed columns.
2824 /// `row_indices` = global permuted indices for each local row (length m).
2825 fn make_frontal_data(
2826 lower: &Mat<f64>,
2827 k: usize,
2828 row_indices: Vec<usize>,
2829 ) -> (Mat<f64>, Vec<usize>, usize) {
2830 let m = lower.nrows();
2831 assert_eq!(lower.ncols(), m);
2832 assert_eq!(row_indices.len(), m);
2833 assert!(k <= m);
2834
2835 // Build full symmetric from lower triangle
2836 let mut data = Mat::zeros(m, m);
2837 for i in 0..m {
2838 for j in 0..=i {
2839 data[(i, j)] = lower[(i, j)];
2840 data[(j, i)] = lower[(i, j)];
2841 }
2842 }
2843
2844 (data, row_indices, k)
2845 }
2846
2847 /// Helper: build a symmetric indefinite matrix (lower triangle) that forces
2848 /// delays. Strategy from SPRAL's `cause_delays`: scale selected rows/cols
2849 /// by a large factor, making their off-diagonal entries dominate the diagonal
2850 /// so the threshold test fails.
2851 ///
2852 /// Returns (matrix, expected_to_have_delays).
2853 fn make_delay_matrix(m: usize, k: usize, delay_rows: &[usize]) -> Mat<f64> {
2854 let mut a = Mat::zeros(m, m);
2855
2856 // Start with a diagonally dominant indefinite matrix
2857 for i in 0..m {
2858 // Alternate positive/negative diagonal for indefiniteness
2859 a[(i, i)] = if i % 2 == 0 { 4.0 } else { -4.0 };
2860 // Add some off-diagonal coupling
2861 for j in 0..i {
2862 let val = 0.5 / ((i - j) as f64 + 1.0);
2863 a[(i, j)] = val;
2864 }
2865 }
2866
2867 // Scale delay_rows to create threshold failures:
2868 // Multiply row/col by 1000 but NOT the diagonal, so off-diag >> diag
2869 for &r in delay_rows {
2870 if r < k {
2871 // Scale off-diagonal entries in row r
2872 for j in 0..r {
2873 a[(r, j)] *= 1000.0;
2874 }
2875 for i in (r + 1)..m {
2876 a[(i, r)] *= 1000.0;
2877 }
2878 // DON'T scale diagonal — this makes |a_rr| << |a_ir| => threshold failure
2879 }
2880 }
2881
2882 a
2883 }
2884
2885 /// Test that extract_front_factors produces correct L21 dimensions when
2886 /// delays are present: L21 should have (m - ne) rows, NOT (m - k) rows.
2887 /// This is the regression test for the bug fixed in this commit.
2888 #[test]
2889 fn test_extract_front_factors_l21_includes_delayed_rows() {
2890 let m = 12;
2891 let k = 8;
2892 let delay_rows = &[1, 5]; // Columns 1 and 5 should be delayed
2893
2894 let lower = make_delay_matrix(m, k, delay_rows);
2895 let row_indices: Vec<usize> = (100..100 + m).collect(); // arbitrary global indices
2896
2897 let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
2898
2899 let options = AptpOptions::default();
2900 let result =
2901 aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
2902
2903 let ne = result.num_eliminated;
2904 let num_delayed = k - ne;
2905
2906 // The bug: if delays occurred, the old code would make L21 have (m - k) rows
2907 // instead of (m - ne) rows, missing the delayed-row entries.
2908 if num_delayed > 0 {
2909 let ff = extract_front_factors(&data, m, k, &row_indices, &result);
2910
2911 // L21 must have (m - ne) rows = delayed_rows + non-fully-summed rows
2912 assert_eq!(
2913 ff.l21.nrows(),
2914 m - ne,
2915 "L21 should have m-ne={} rows (not m-k={}), ne={}, delays={}",
2916 m - ne,
2917 m - k,
2918 ne,
2919 num_delayed
2920 );
2921
2922 // row_indices must match L21 rows
2923 assert_eq!(
2924 ff.row_indices.len(),
2925 ff.l21.nrows(),
2926 "row_indices length must match L21 rows"
2927 );
2928
2929 // col_indices must have ne entries
2930 assert_eq!(ff.col_indices.len(), ne);
2931 } else {
2932 // No delays — still verify basic shapes
2933 let ff = extract_front_factors(&data, m, k, &row_indices, &result);
2934 assert_eq!(ff.l21.nrows(), m - ne);
2935 assert_eq!(ff.row_indices.len(), ff.l21.nrows());
2936 }
2937 }
2938
    /// Test that delayed rows in extract_front_factors and extract_contribution
    /// have consistent row_indices. Both functions MUST agree on which global
    /// indices correspond to delayed rows, because the parent's extend-add
    /// matches contribution rows to factor rows purely by these indices.
    #[test]
    fn test_extract_front_factors_contribution_row_indices_consistent() {
        let m = 10;
        let k = 6;
        let delay_rows = &[2]; // Force column 2 to be delayed

        let lower = make_delay_matrix(m, k, delay_rows);
        // Offset global indices (200..) so index mix-ups can't pass by accident.
        let row_indices: Vec<usize> = (200..200 + m).collect();

        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);

        let options = AptpOptions::default();
        let result =
            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");

        let ne = result.num_eliminated;
        let num_delayed = k - ne;

        if num_delayed > 0 {
            let ff = extract_front_factors(&data, m, k, &row_indices, &result);
            let nfs = m - k;
            // Oversized (m×m) scratch; extract_contribution only reads the
            // leading nfs×nfs block written by compute_contribution_gemm.
            let mut contrib_buf = Mat::zeros(m, m);
            if nfs > 0 {
                // Workspace for the L·D intermediate inside the deferred GEMM.
                let mut ld_ws = Mat::new();
                compute_contribution_gemm(
                    &data,
                    k,
                    ne,
                    m,
                    &result.d,
                    &mut contrib_buf,
                    &mut ld_ws,
                    Par::Seq,
                );
            }
            let cb = extract_contribution(&data, m, k, &row_indices, &result, contrib_buf);

            // Both must have the same number of delayed rows
            assert_eq!(cb.num_delayed, num_delayed);

            // The first num_delayed entries of row_indices must match
            let ff_delayed_indices = &ff.row_indices[..num_delayed];
            let cb_delayed_indices = &cb.row_indices[..num_delayed];
            assert_eq!(
                ff_delayed_indices, cb_delayed_indices,
                "Delayed row indices must be identical between extract_front_factors \
                 and extract_contribution.\n front_factors: {:?}\n contribution: {:?}",
                ff_delayed_indices, cb_delayed_indices
            );

            // The non-fully-summed row indices must also match
            let ff_nfs_indices = &ff.row_indices[num_delayed..];
            let cb_nfs_indices = &cb.row_indices[num_delayed..];
            assert_eq!(
                ff_nfs_indices, cb_nfs_indices,
                "Non-fully-summed row indices must match"
            );
        }
    }
3001
3002 /// Test that L21 entries at delayed row positions are non-zero.
3003 /// When APTP delays a column, the TRSM (apply_and_check) has already
3004 /// computed valid L entries at those row positions. These must be preserved
3005 /// in the extracted L21.
3006 #[test]
3007 fn test_extract_front_factors_delayed_l21_entries_populated() {
3008 let m = 12;
3009 let k = 8;
3010 // Use multiple delay candidates to increase chance of delays
3011 let delay_rows = &[0, 3, 5];
3012
3013 let lower = make_delay_matrix(m, k, delay_rows);
3014 let row_indices: Vec<usize> = (0..m).collect();
3015
3016 let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
3017
3018 let options = AptpOptions::default();
3019 let result =
3020 aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
3021
3022 let ne = result.num_eliminated;
3023 let num_delayed = k - ne;
3024
3025 if num_delayed > 0 && ne > 0 {
3026 let ff = extract_front_factors(&data, m, k, &row_indices, &result);
3027
3028 // Check that L21 entries in the delayed-row portion (rows 0..num_delayed)
3029 // are not all zero. These rows received TRSM updates before being delayed.
3030 let delayed_l21 = ff.l21.as_ref().subrows(0, num_delayed);
3031 let mut has_nonzero = false;
3032 for i in 0..delayed_l21.nrows() {
3033 for j in 0..delayed_l21.ncols() {
3034 if delayed_l21[(i, j)].abs() > 1e-16 {
3035 has_nonzero = true;
3036 }
3037 }
3038 }
3039 assert!(
3040 has_nonzero,
3041 "L21 delayed rows should have non-zero entries from TRSM. \
3042 ne={}, delays={}, L21 shape=({},{})",
3043 ne,
3044 num_delayed,
3045 ff.l21.nrows(),
3046 ff.l21.ncols()
3047 );
3048 }
3049 }
3050
    /// Reconstruction test for extract_front_factors with delays.
    /// Verify that the extracted L11, D11, L21 correctly reconstruct the
    /// factored portion of the frontal matrix: PAP^T[0..ne, 0..ne] = L11 * D11 * L11^T.
    #[test]
    fn test_extract_front_factors_reconstruction_with_delays() {
        let m = 16;
        let k = 10;
        let delay_rows = &[1, 4, 7];

        let lower = make_delay_matrix(m, k, delay_rows);
        let row_indices: Vec<usize> = (0..m).collect();

        // Save a copy of the original symmetric matrix before factoring
        // (aptp_factor_in_place overwrites the frontal data).
        let mut original = Mat::zeros(m, m);
        for i in 0..m {
            for j in 0..=i {
                original[(i, j)] = lower[(i, j)];
                original[(j, i)] = lower[(i, j)];
            }
        }

        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);

        let options = AptpOptions::default();
        let result =
            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");

        let ne = result.num_eliminated;
        if ne == 0 {
            return; // Nothing to reconstruct
        }

        let ff = extract_front_factors(&data, m, k, &row_indices, &result);

        // Compute L11 * D * L11^T manually
        // Step 1: LD = L11 * D (column-by-column, handling 1x1 and 2x2 pivots)
        let mut ld = Mat::zeros(ne, ne);
        let mut col = 0;
        while col < ne {
            match ff.d11.pivot_type(col) {
                PivotType::OneByOne => {
                    // Scalar pivot: column scales by d.
                    let d = ff.d11.diagonal_1x1(col);
                    for i in 0..ne {
                        ld[(i, col)] = ff.l11[(i, col)] * d;
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { .. } => {
                    // 2x2 pivot [a b; b c]: the column pair multiplies jointly.
                    let block = ff.d11.diagonal_2x2(col);
                    for i in 0..ne {
                        let l0 = ff.l11[(i, col)];
                        let l1 = ff.l11[(i, col + 1)];
                        ld[(i, col)] = l0 * block.a + l1 * block.b;
                        ld[(i, col + 1)] = l0 * block.b + l1 * block.c;
                    }
                    col += 2;
                }
                PivotType::Delayed => {
                    // Should not occur among eliminated columns; skip leniently.
                    col += 1;
                }
            }
        }

        // Step 2: reconstructed = LD * L11^T
        let mut reconstructed = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..=i {
                let mut val = 0.0;
                for p in 0..ne {
                    val += ld[(i, p)] * ff.l11[(j, p)];
                }
                reconstructed[(i, j)] = val;
                reconstructed[(j, i)] = val;
            }
        }

        // Build the permuted original submatrix for comparison
        // (result.perm maps factored position -> original front-local column;
        // read from the lower triangle of the saved symmetric copy).
        let mut perm_original = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..=i {
                let oi = result.perm[i];
                let oj = result.perm[j];
                let val = if oi >= oj {
                    original[(oi, oj)]
                } else {
                    original[(oj, oi)]
                };
                perm_original[(i, j)] = val;
                perm_original[(j, i)] = val;
            }
        }

        // Check reconstruction: ||PAP^T[0..ne,0..ne] - L11*D11*L11^T|| / ||PAP^T||
        // (Frobenius norms over the lower triangle only.)
        let mut diff_norm = 0.0f64;
        let mut orig_norm = 0.0f64;
        for i in 0..ne {
            for j in 0..=i {
                let d = reconstructed[(i, j)] - perm_original[(i, j)];
                diff_norm += d * d;
                orig_norm += perm_original[(i, j)] * perm_original[(i, j)];
            }
        }
        // max(1e-15) guards against division by zero for a zero submatrix.
        let rel_err = diff_norm.sqrt() / orig_norm.sqrt().max(1e-15);

        assert!(
            rel_err < 1e-10,
            "L11*D11*L11^T reconstruction error: {:.2e} (ne={}, delays={})",
            rel_err,
            ne,
            k - ne
        );
    }
3163
3164 /// Verify that the solve path produces correct results when delays are
3165 /// present. Uses a single dense frontal matrix (simulating one supernode),
3166 /// factors with delays, extracts front_factors, and runs forward/diagonal/
3167 /// backward solve steps to verify Ax = b.
3168 ///
3169 /// This directly tests the code path that the extract_front_factors bug
3170 /// would break: delayed-row L21 entries participate in forward and backward
3171 /// solve via gather/scatter operations.
3172 #[test]
3173 fn test_extract_front_factors_solve_roundtrip_with_delays() {
3174 let m = 12;
3175 let k = 8; // fully-summed columns
3176
3177 let lower = make_delay_matrix(m, k, &[0, 3, 5]);
3178 let row_indices: Vec<usize> = (0..m).collect();
3179
3180 // Build full symmetric matrix for reference
3181 let mut a_full = Mat::zeros(m, m);
3182 for i in 0..m {
3183 for j in 0..=i {
3184 a_full[(i, j)] = lower[(i, j)];
3185 a_full[(j, i)] = lower[(i, j)];
3186 }
3187 }
3188
3189 let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
3190
3191 let options = AptpOptions::default();
3192 let result =
3193 aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
3194
3195 let ne = result.num_eliminated;
3196 let num_delayed = k - ne;
3197
3198 // Only meaningful if we actually got some delays and some eliminations
3199 if num_delayed == 0 || ne == 0 {
3200 return;
3201 }
3202
3203 let ff = extract_front_factors(&data, m, k, &row_indices, &result);
3204
3205 // Check that L11, D11, L21 are internally consistent:
3206 // L * D * L^T (where L = [L11; L21]) should reconstruct the factored
3207 // portion of A when permuted.
3208
3209 // Build full L = [L11; L21] and check L * D * L^T = P * A[0..m, 0..ne] * P^T
3210 let r = ff.l21.nrows();
3211 let full_l_rows = ne + r;
3212 assert_eq!(full_l_rows, m, "L should cover all rows");
3213
3214 // Compute L*D*L^T for the eliminated portion
3215 let mut ld = Mat::zeros(full_l_rows, ne);
3216
3217 // L = [L11; L21]
3218 let mut full_l = Mat::zeros(full_l_rows, ne);
3219 for i in 0..ne {
3220 for j in 0..ne {
3221 full_l[(i, j)] = ff.l11[(i, j)];
3222 }
3223 }
3224 for i in 0..r {
3225 for j in 0..ne {
3226 full_l[(ne + i, j)] = ff.l21[(i, j)];
3227 }
3228 }
3229
3230 // LD = L * D (handling 1x1 and 2x2)
3231 let mut col = 0;
3232 while col < ne {
3233 match ff.d11.pivot_type(col) {
3234 PivotType::OneByOne => {
3235 let d = ff.d11.diagonal_1x1(col);
3236 for i in 0..full_l_rows {
3237 ld[(i, col)] = full_l[(i, col)] * d;
3238 }
3239 col += 1;
3240 }
3241 PivotType::TwoByTwo { .. } => {
3242 let block = ff.d11.diagonal_2x2(col);
3243 for i in 0..full_l_rows {
3244 let l0 = full_l[(i, col)];
3245 let l1 = full_l[(i, col + 1)];
3246 ld[(i, col)] = l0 * block.a + l1 * block.b;
3247 ld[(i, col + 1)] = l0 * block.b + l1 * block.c;
3248 }
3249 col += 2;
3250 }
3251 PivotType::Delayed => {
3252 col += 1;
3253 }
3254 }
3255 }
3256
3257 // reconstructed = LD * L^T
3258 let mut reconstructed = Mat::zeros(full_l_rows, full_l_rows);
3259 for i in 0..full_l_rows {
3260 for j in 0..=i {
3261 let mut val = 0.0;
3262 for p in 0..ne {
3263 val += ld[(i, p)] * full_l[(j, p)];
3264 }
3265 reconstructed[(i, j)] = val;
3266 reconstructed[(j, i)] = val;
3267 }
3268 }
3269
3270 // Build permuted original: row_order = col_indices ++ row_indices
3271 let mut perm_rows = Vec::with_capacity(full_l_rows);
3272 perm_rows.extend_from_slice(ff.col_indices());
3273 perm_rows.extend_from_slice(ff.row_indices());
3274
3275 // Check that L*D*L^T[0..ne, 0..ne] ≈ A_perm[0..ne, 0..ne]
3276 let mut diff_norm = 0.0f64;
3277 let mut orig_norm = 0.0f64;
3278 for i in 0..ne {
3279 for j in 0..=i {
3280 let gi = perm_rows[i];
3281 let gj = perm_rows[j];
3282 let orig_val = if gi >= gj {
3283 a_full[(gi, gj)]
3284 } else {
3285 a_full[(gj, gi)]
3286 };
3287 let d = reconstructed[(i, j)] - orig_val;
3288 diff_norm += d * d;
3289 orig_norm += orig_val * orig_val;
3290 }
3291 }
3292 let rel_err = diff_norm.sqrt() / orig_norm.sqrt().max(1e-15);
3293 assert!(
3294 rel_err < 1e-10,
3295 "Full L*D*L^T reconstruction error: {:.2e} (ne={}, delays={}, L21_rows={})",
3296 rel_err,
3297 ne,
3298 num_delayed,
3299 r
3300 );
3301 }
3302
    /// Test scatter_original_entries_multi with non-contiguous owned_ranges.
    ///
    /// After amalgamation, a merged supernode may own columns from multiple
    /// non-contiguous ranges (e.g., [0..2, 4..6] when child columns 2,3 were
    /// NOT merged). This test verifies:
    ///
    /// 1. Entries from all owned ranges are scattered into the frontal matrix
    /// 2. Upper-triangle deduplication works correctly across non-contiguous
    ///    ranges (an entry with both row and col in owned ranges is only counted once)
    /// 3. Entries where one index is owned and the other is a non-fully-summed
    ///    row are scattered correctly
    #[test]
    fn test_scatter_original_entries_multi_non_contiguous() {
        use faer::sparse::{SparseColMat, Triplet};

        // Build a small 6×6 symmetric matrix with identity permutation.
        // Owned ranges: [0..2, 4..6] (columns 0,1,4,5 owned; 2,3 not owned).
        //
        // The frontal has 6 rows: locals 0..4 are owned (4 total_owned),
        // no delayed children, rows 4..6 are non-fully-summed (rows for
        // global indices 2 and 3).
        //
        // Matrix (lower triangle values, symmetric):
        //   col:  0     1     2     3     4     5
        //   0: [10.0                              ]
        //   1: [ 1.0  11.0                        ]
        //   2: [ 2.0   0.0  12.0                  ]
        //   3: [ 0.0   3.0   0.0  13.0            ]
        //   4: [ 4.0   0.0   0.0   0.0  14.0      ]
        //   5: [ 0.0   5.0   6.0   0.0   7.0  15.0]
        let triplets = vec![
            // Diagonal
            Triplet::new(0, 0, 10.0),
            Triplet::new(1, 1, 11.0),
            Triplet::new(2, 2, 12.0),
            Triplet::new(3, 3, 13.0),
            Triplet::new(4, 4, 14.0),
            Triplet::new(5, 5, 15.0),
            // Off-diagonal (both triangles for full CSC)
            Triplet::new(1, 0, 1.0),
            Triplet::new(0, 1, 1.0), // mirror
            Triplet::new(2, 0, 2.0),
            Triplet::new(0, 2, 2.0), // mirror
            Triplet::new(3, 1, 3.0),
            Triplet::new(1, 3, 3.0), // mirror
            Triplet::new(4, 0, 4.0),
            Triplet::new(0, 4, 4.0), // mirror
            Triplet::new(5, 1, 5.0),
            Triplet::new(1, 5, 5.0), // mirror
            Triplet::new(5, 2, 6.0),
            Triplet::new(2, 5, 6.0), // mirror
            Triplet::new(5, 4, 7.0),
            Triplet::new(4, 5, 7.0), // mirror
        ];
        let matrix = SparseColMat::try_new_from_triplets(6, 6, &triplets).expect("valid CSC");

        // Identity permutation (forward and inverse coincide)
        let perm_fwd: Vec<usize> = (0..6).collect();
        let perm_inv: Vec<usize> = (0..6).collect();

        // Frontal rows: owned columns first (0,1,4,5), then non-owned rows (2,3)
        let frontal_rows = vec![0, 1, 4, 5, 2, 3];
        let total_owned = 4; // columns 0,1,4,5
        let m = frontal_rows.len();
        let k = total_owned; // num_fully_summed = total_owned (no delayed children)

        // Build global_to_local mapping (sentinel NOT_IN_FRONT for absent rows)
        let mut global_to_local = vec![NOT_IN_FRONT; 6];
        for (local, &global) in frontal_rows.iter().enumerate() {
            global_to_local[global] = local;
        }

        let mut frontal_data = Mat::zeros(m, m);

        // Non-contiguous owned ranges (the key case)
        let owned_ranges = vec![0..2, 4..6];

        // Scope limits the mutable borrow of frontal_data to the scatter call.
        {
            let mut frontal = FrontalMatrix {
                data: frontal_data.as_mut(),
                row_indices: &frontal_rows,
                num_fully_summed: k,
            };

            scatter_original_entries_multi(
                &mut frontal,
                &matrix,
                &perm_fwd,
                &perm_inv,
                &global_to_local,
                &owned_ranges,
                None,
            );
        }

        // Verify scattered values.
        // Local indices: 0→global 0, 1→global 1, 2→global 4, 3→global 5, 4→global 2, 5→global 3
        //
        // Expected entries in the frontal (lower triangle):
        //   (0,0)=10.0  diagonal of global 0
        //   (1,0)=1.0   global(1,0) → local(1,0)
        //   (1,1)=11.0  diagonal of global 1
        //   (2,0)=4.0   global(4,0) → local(2,0)
        //   (2,2)=14.0  diagonal of global 4
        //   (3,1)=5.0   global(5,1) → local(3,1)
        //   (3,2)=7.0   global(5,4) → local(3,2)
        //   (3,3)=15.0  diagonal of global 5
        //   (4,0)=2.0   global(2,0) → local(4,0) [non-owned row, owned col]
        //   (4,3)=6.0   global(2,5) → local(4,3) [non-owned row, owned col]
        //   (5,1)=3.0   global(3,1) → local(5,1) [non-owned row, owned col]
        //
        // Upper-triangle dedup: global(0,1) should be skipped because both
        // 0 and 1 are owned. Similarly global(0,4), global(1,5), global(4,5)
        // are upper-triangle entries between owned cols and should be skipped.

        let f = &frontal_data;

        // Diagonal entries
        assert_eq!(f[(0, 0)], 10.0, "diag global 0");
        assert_eq!(f[(1, 1)], 11.0, "diag global 1");
        assert_eq!(f[(2, 2)], 14.0, "diag global 4");
        assert_eq!(f[(3, 3)], 15.0, "diag global 5");

        // Off-diagonal between owned cols (lower triangle only)
        assert_eq!(f[(1, 0)], 1.0, "global(1,0) → local(1,0)");
        assert_eq!(f[(2, 0)], 4.0, "global(4,0) → local(2,0)");
        assert_eq!(f[(3, 1)], 5.0, "global(5,1) → local(3,1)");
        assert_eq!(f[(3, 2)], 7.0, "global(5,4) → local(3,2)");

        // Non-owned rows (global 2 → local 4, global 3 → local 5)
        assert_eq!(f[(4, 0)], 2.0, "global(2,0) → local(4,0)");
        assert_eq!(f[(4, 3)], 6.0, "global(2,5) → local(4,3)");
        assert_eq!(f[(5, 1)], 3.0, "global(3,1) → local(5,1)");

        // Verify no spurious double-counting: the (0,1) position should be 1.0,
        // not 2.0 (which would happen if upper-triangle entry was also scattered)
        assert_eq!(
            f[(1, 0)],
            1.0,
            "upper-triangle dedup: (1,0) should be 1.0, not 2.0"
        );

        // Cross-range upper-triangle dedup: global(4,0) upper entry global(0,4)
        // should NOT be scattered separately, so local(2,0) should be 4.0
        assert_eq!(
            f[(2, 0)],
            4.0,
            "cross-range upper-triangle dedup: (2,0) should be 4.0, not 8.0"
        );

        // Verify zeros where no entries exist
        assert_eq!(f[(4, 1)], 0.0, "no global(2,1) entry");
        assert_eq!(f[(4, 2)], 0.0, "no global(2,4) entry");
        assert_eq!(f[(5, 0)], 0.0, "no global(3,0) entry");
        assert_eq!(f[(5, 2)], 0.0, "no global(3,4) entry");
        assert_eq!(f[(5, 3)], 0.0, "no global(3,5) entry");
    }
3460
3461 // -----------------------------------------------------------------------
3462 // extend_add_mapped: precomputed-map scatter (zero-delay fast path)
3463 // -----------------------------------------------------------------------
3464
3465 /// Test that `extend_add_mapped` correctly scatters a child contribution
3466 /// into a parent frontal matrix using a precomputed u32 row mapping,
3467 /// handling the lower-triangle symmetry swap when child ordering differs
3468 /// from parent ordering (non-monotonic map from amalgamated supernodes).
3469 #[test]
3470 fn test_extend_add_mapped_hand_constructed() {
3471 // Parent frontal: 5×5, initially zero (represents assembled parent).
3472 let mut parent_data = Mat::<f64>::zeros(5, 5);
3473 let parent_row_indices = vec![10, 20, 30, 40, 50]; // global indices (unused by mapped path)
3474 let mut parent = FrontalMatrix {
3475 data: parent_data.as_mut(),
3476 row_indices: &parent_row_indices,
3477 num_fully_summed: 3,
3478 };
3479
3480 // Child contribution: 3×3 lower triangle.
3481 // [1.0 ]
3482 // [2.0 3.0 ]
3483 // [4.0 5.0 6.0]
3484 let mut child_data = Mat::<f64>::zeros(3, 3);
3485 child_data[(0, 0)] = 1.0;
3486 child_data[(1, 0)] = 2.0;
3487 child_data[(1, 1)] = 3.0;
3488 child_data[(2, 0)] = 4.0;
3489 child_data[(2, 1)] = 5.0;
3490 child_data[(2, 2)] = 6.0;
3491
3492 let child = ContributionBlock {
3493 data: child_data,
3494 row_indices: vec![30, 10, 40], // global indices
3495 num_delayed: 0,
3496 };
3497
3498 // Precomputed map: child row 0 → parent local 2 (global 30),
3499 // child row 1 → parent local 0 (global 10),
3500 // child row 2 → parent local 3 (global 40).
3501 // Note: child row 1 maps to a LOWER parent index than child row 0,
3502 // so extend_add_mapped must handle the li < lj swap case.
3503 let ea_row_map: Vec<u32> = vec![2, 0, 3];
3504
3505 let recycled = extend_add_mapped(&mut parent, child, &ea_row_map);
3506
3507 // Verify buffer is returned for recycling
3508 assert_eq!(recycled.nrows(), 3);
3509 assert_eq!(recycled.ncols(), 3);
3510
3511 // Expected scatter (lower triangle of parent):
3512 //
3513 // child(0,0)=1.0: li=2, lj=2 → parent(2,2) += 1.0
3514 // child(1,0)=2.0: li=0, lj=2 → li < lj, swap → parent(2,0) += 2.0
3515 // child(1,1)=3.0: li=0, lj=0 → parent(0,0) += 3.0
3516 // child(2,0)=4.0: li=3, lj=2 → parent(3,2) += 4.0
3517 // child(2,1)=5.0: li=3, lj=0 → parent(3,0) += 5.0
3518 // child(2,2)=6.0: li=3, lj=3 → parent(3,3) += 6.0
3519
3520 let p = parent.data.as_ref();
3521 assert_eq!(p[(0, 0)], 3.0, "child(1,1)=3.0 → parent(0,0)");
3522 assert_eq!(p[(2, 0)], 2.0, "child(1,0)=2.0 swapped → parent(2,0)");
3523 assert_eq!(p[(2, 2)], 1.0, "child(0,0)=1.0 → parent(2,2)");
3524 assert_eq!(p[(3, 0)], 5.0, "child(2,1)=5.0 → parent(3,0)");
3525 assert_eq!(p[(3, 2)], 4.0, "child(2,0)=4.0 → parent(3,2)");
3526 assert_eq!(p[(3, 3)], 6.0, "child(2,2)=6.0 → parent(3,3)");
3527
3528 // Verify no spurious entries (untouched positions remain zero)
3529 assert_eq!(p[(1, 0)], 0.0, "row 1 untouched");
3530 assert_eq!(p[(1, 1)], 0.0, "row 1 untouched");
3531 assert_eq!(p[(4, 0)], 0.0, "row 4 untouched");
3532 assert_eq!(p[(4, 4)], 0.0, "row 4 untouched");
3533 }
3534
3535 /// Test column-oriented `extend_add_mapped` with a larger 10×10
3536 /// contribution (monotonic map), comparing against a naive reference.
3537 #[test]
3538 fn test_extend_add_mapped_10x10_vs_reference() {
3539 let cb_size = 10;
3540 let parent_size = 15;
3541
3542 // Monotonically increasing map: child rows map to
3543 // parent rows [1, 3, 4, 5, 7, 8, 10, 11, 12, 14].
3544 let ea_row_map: Vec<u32> = vec![1, 3, 4, 5, 7, 8, 10, 11, 12, 14];
3545
3546 // Build a child contribution with known lower-triangle values.
3547 // Use value = (i+1)*100 + (j+1) for easy identification.
3548 let mut child_data = Mat::<f64>::zeros(cb_size, cb_size);
3549 for j in 0..cb_size {
3550 for i in j..cb_size {
3551 child_data[(i, j)] = (i as f64 + 1.0) * 100.0 + (j as f64 + 1.0);
3552 }
3553 }
3554
3555 // Reference: naive element-by-element scatter with lower-triangle swap
3556 let mut ref_parent = Mat::<f64>::zeros(parent_size, parent_size);
3557 for j in 0..cb_size {
3558 let lj = ea_row_map[j] as usize;
3559 for i in j..cb_size {
3560 let li = ea_row_map[i] as usize;
3561 if li >= lj {
3562 ref_parent[(li, lj)] += child_data[(i, j)];
3563 } else {
3564 ref_parent[(lj, li)] += child_data[(i, j)];
3565 }
3566 }
3567 }
3568
3569 // Actual: column-oriented extend_add_mapped
3570 let mut parent_data = Mat::<f64>::zeros(parent_size, parent_size);
3571 let parent_row_indices: Vec<usize> = (0..parent_size).collect();
3572 let mut parent = FrontalMatrix {
3573 data: parent_data.as_mut(),
3574 row_indices: &parent_row_indices,
3575 num_fully_summed: 5,
3576 };
3577 let child = ContributionBlock {
3578 data: child_data,
3579 row_indices: (0..cb_size).collect(),
3580 num_delayed: 0,
3581 };
3582 let _ = extend_add_mapped(&mut parent, child, &ea_row_map);
3583
3584 // Compare every element
3585 for j in 0..parent_size {
3586 for i in 0..parent_size {
3587 assert_eq!(
3588 parent.data[(i, j)],
3589 ref_parent[(i, j)],
3590 "mismatch at ({}, {})",
3591 i,
3592 j
3593 );
3594 }
3595 }
3596 }
3597
3598 /// Test column-oriented `extend_add_mapped` with a non-monotonic map
3599 /// (simulating post-amalgamation parent with non-contiguous owned ranges).
3600 #[test]
3601 fn test_extend_add_mapped_non_monotonic() {
3602 let cb_size = 4;
3603 let parent_size = 8;
3604
3605 // Non-monotonic map: simulates child pattern rows mapping into a
3606 // parent whose owned_ranges are interleaved with pattern rows.
3607 let ea_row_map: Vec<u32> = vec![5, 2, 7, 1];
3608
3609 // Build child contribution with lower-triangle values
3610 let mut child_data = Mat::<f64>::zeros(cb_size, cb_size);
3611 for j in 0..cb_size {
3612 for i in j..cb_size {
3613 child_data[(i, j)] = (i * 10 + j + 1) as f64;
3614 }
3615 }
3616
3617 // Reference: naive scatter with swap
3618 let mut ref_parent = Mat::<f64>::zeros(parent_size, parent_size);
3619 for j in 0..cb_size {
3620 let lj = ea_row_map[j] as usize;
3621 for i in j..cb_size {
3622 let li = ea_row_map[i] as usize;
3623 if li >= lj {
3624 ref_parent[(li, lj)] += child_data[(i, j)];
3625 } else {
3626 ref_parent[(lj, li)] += child_data[(i, j)];
3627 }
3628 }
3629 }
3630
3631 // Actual
3632 let mut parent_data = Mat::<f64>::zeros(parent_size, parent_size);
3633 let parent_row_indices: Vec<usize> = (0..parent_size).collect();
3634 let mut parent = FrontalMatrix {
3635 data: parent_data.as_mut(),
3636 row_indices: &parent_row_indices,
3637 num_fully_summed: 4,
3638 };
3639 let child = ContributionBlock {
3640 data: child_data,
3641 row_indices: (0..cb_size).collect(),
3642 num_delayed: 0,
3643 };
3644 let _ = extend_add_mapped(&mut parent, child, &ea_row_map);
3645
3646 // Compare every element
3647 for j in 0..parent_size {
3648 for i in 0..parent_size {
3649 assert_eq!(
3650 parent.data[(i, j)],
3651 ref_parent[(i, j)],
3652 "mismatch at ({}, {})",
3653 i,
3654 j
3655 );
3656 }
3657 }
3658 }
3659
3660 /// Test that `extend_add_mapped` accumulates contributions from multiple
3661 /// children into the same parent.
3662 #[test]
3663 fn test_extend_add_mapped_accumulates() {
3664 let mut parent_data = Mat::<f64>::zeros(4, 4);
3665 let parent_row_indices = vec![0, 1, 2, 3];
3666 let mut parent = FrontalMatrix {
3667 data: parent_data.as_mut(),
3668 row_indices: &parent_row_indices,
3669 num_fully_summed: 2,
3670 };
3671
3672 // First child: 2×2, maps to parent rows [1, 3]
3673 let mut c1_data = Mat::<f64>::zeros(2, 2);
3674 c1_data[(0, 0)] = 10.0;
3675 c1_data[(1, 0)] = 20.0;
3676 c1_data[(1, 1)] = 30.0;
3677 let child1 = ContributionBlock {
3678 data: c1_data,
3679 row_indices: vec![1, 3],
3680 num_delayed: 0,
3681 };
3682 let map1: Vec<u32> = vec![1, 3];
3683
3684 let _ = extend_add_mapped(&mut parent, child1, &map1);
3685
3686 // Second child: 2×2, also maps to parent rows [1, 3]
3687 let mut c2_data = Mat::<f64>::zeros(2, 2);
3688 c2_data[(0, 0)] = 5.0;
3689 c2_data[(1, 0)] = 7.0;
3690 c2_data[(1, 1)] = 9.0;
3691 let child2 = ContributionBlock {
3692 data: c2_data,
3693 row_indices: vec![1, 3],
3694 num_delayed: 0,
3695 };
3696 let map2: Vec<u32> = vec![1, 3];
3697
3698 let _ = extend_add_mapped(&mut parent, child2, &map2);
3699
3700 let p = parent.data.as_ref();
3701 assert_eq!(p[(1, 1)], 15.0, "10 + 5 accumulated");
3702 assert_eq!(p[(3, 1)], 27.0, "20 + 7 accumulated");
3703 assert_eq!(p[(3, 3)], 39.0, "30 + 9 accumulated");
3704 }
3705
3706 // -----------------------------------------------------------------------
3707 // Small-leaf subtree classification tests
3708 // -----------------------------------------------------------------------
3709
3710 /// Helper: create a SupernodeInfo with given parameters for classification tests.
3711 /// `ncols` = number of owned columns, `pattern_len` = number of off-diagonal rows.
3712 /// front_size = ncols + pattern_len.
3713 #[allow(clippy::single_range_in_vec_init)]
3714 fn make_snode(
3715 col_begin: usize,
3716 ncols: usize,
3717 pattern_len: usize,
3718 parent: Option<usize>,
3719 ) -> SupernodeInfo {
3720 SupernodeInfo {
3721 col_begin,
3722 col_end: col_begin + ncols,
3723 pattern: (0..pattern_len).map(|i| 1000 + col_begin + i).collect(),
3724 parent,
3725 owned_ranges: vec![col_begin..col_begin + ncols],
3726 in_small_leaf: false,
3727 }
3728 }
3729
3730 /// Linear chain of 5 small supernodes — all should be in_small_leaf.
3731 #[test]
3732 fn test_classify_all_small() {
3733 // Chain: 0 → 1 → 2 → 3 → 4 (each has 2 owned cols, 3 pattern → front_size=5)
3734 let mut supernodes = vec![
3735 make_snode(0, 2, 3, Some(1)),
3736 make_snode(2, 2, 3, Some(2)),
3737 make_snode(4, 2, 3, Some(3)),
3738 make_snode(6, 2, 3, Some(4)),
3739 make_snode(8, 2, 3, None),
3740 ];
3741 let children = build_children_map(&supernodes);
3742
3743 let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3744
3745 // All 5 nodes should be in_small_leaf
3746 for sn in &supernodes {
3747 assert!(
3748 sn.in_small_leaf,
3749 "snode {} should be in_small_leaf",
3750 sn.col_begin
3751 );
3752 }
3753 // One subtree with all 5 nodes in postorder
3754 assert_eq!(subtrees.len(), 1);
3755 assert_eq!(subtrees[0].nodes.len(), 5);
3756 assert_eq!(subtrees[0].nodes, vec![0, 1, 2, 3, 4]);
3757 assert_eq!(subtrees[0].root, 4);
3758 assert_eq!(subtrees[0].parent_of_root, None);
3759 }
3760
3761 /// Mixed tree — small leaves, large root.
3762 #[test]
3763 fn test_classify_mixed_tree() {
3764 // Tree: leaves 0,1 → parent 2 (large)
3765 // Nodes 0,1: front_size=5 (small)
3766 // Node 2: front_size=300 (large, >= 256)
3767 let mut supernodes = vec![
3768 make_snode(0, 2, 3, Some(2)), // front=5, small
3769 make_snode(2, 2, 3, Some(2)), // front=5, small
3770 make_snode(4, 100, 200, None), // front=300, large
3771 ];
3772 let children = build_children_map(&supernodes);
3773
3774 let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3775
3776 // Node 2 is large → not in_small_leaf
3777 assert!(!supernodes[2].in_small_leaf);
3778 // Nodes 0,1 are small leaves but they don't form a subtree with ≥2 nodes
3779 // that has a common root — they are separate single-leaf subtrees
3780 // Each leaf alone is 1 node which is < 2, so no subtrees
3781 assert!(!supernodes[0].in_small_leaf, "single node excluded");
3782 assert!(!supernodes[1].in_small_leaf, "single node excluded");
3783 assert_eq!(subtrees.len(), 0);
3784 }
3785
3786 /// Single isolated small leaves — no subtree formed (min 2 nodes).
3787 #[test]
3788 fn test_classify_single_node_excluded() {
3789 // Three independent small leaves (no parent linkage between them)
3790 let mut supernodes = vec![
3791 make_snode(0, 2, 3, None), // root, small, standalone
3792 make_snode(2, 2, 3, None), // root, small, standalone
3793 make_snode(4, 2, 3, None), // root, small, standalone
3794 ];
3795 let children = build_children_map(&supernodes);
3796
3797 let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3798
3799 // Each is a single-node subtree → excluded
3800 assert_eq!(subtrees.len(), 0);
3801 // in_small_leaf should be false for single-node subtrees (cleared by filter)
3802 for sn in &supernodes {
3803 assert!(!sn.in_small_leaf);
3804 }
3805 }
3806
3807 /// Threshold boundary — front_size == 256 excluded, 255 included.
3808 #[test]
3809 fn test_classify_threshold_boundary() {
3810 // Node 0: front_size=255 (qualifies), child of node 1
3811 // Node 1: front_size=256 (does NOT qualify — strict less-than)
3812 let mut supernodes = vec![
3813 make_snode(0, 100, 155, Some(1)), // front=255
3814 make_snode(100, 128, 128, None), // front=256
3815 ];
3816 let children = build_children_map(&supernodes);
3817
3818 let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3819
3820 assert!(
3821 !supernodes[1].in_small_leaf,
3822 "front_size=256 must NOT qualify"
3823 );
3824 // Node 0 is a small leaf but its parent is NOT small → single-node subtree → excluded
3825 assert!(!supernodes[0].in_small_leaf, "single node excluded");
3826 assert_eq!(subtrees.len(), 0);
3827
3828 // Now with front_size=255 for the parent too
3829 let mut supernodes2 = vec![
3830 make_snode(0, 100, 155, Some(1)), // front=255
3831 make_snode(100, 127, 128, None), // front=255
3832 ];
3833 let children2 = build_children_map(&supernodes2);
3834 let subtrees2 = classify_small_leaf_subtrees(&mut supernodes2, &children2, 256);
3835
3836 assert!(supernodes2[0].in_small_leaf);
3837 assert!(supernodes2[1].in_small_leaf);
3838 assert_eq!(subtrees2.len(), 1);
3839 assert_eq!(subtrees2[0].nodes, vec![0, 1]);
3840 }
3841
3842 /// Disabled when threshold=0.
3843 #[test]
3844 fn test_classify_disabled() {
3845 let mut supernodes = vec![make_snode(0, 2, 3, Some(1)), make_snode(2, 2, 3, None)];
3846 let children = build_children_map(&supernodes);
3847
3848 let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 0);
3849
3850 assert_eq!(subtrees.len(), 0);
3851 assert!(!supernodes[0].in_small_leaf);
3852 assert!(!supernodes[1].in_small_leaf);
3853 }
3854
3855 /// Two independent small-leaf subtrees.
3856 #[test]
3857 fn test_classify_multiple_subtrees() {
3858 // Tree:
3859 // 0 → 2 (subtree A)
3860 // 1 → 2 (subtree A)
3861 // 2 → 5 (large root)
3862 // 3 → 4 (subtree B)
3863 // 4 → 5 (subtree B is just 3→4)
3864 // 5 = large root
3865 let mut supernodes = vec![
3866 make_snode(0, 2, 3, Some(2)), // 0: front=5, small
3867 make_snode(2, 2, 3, Some(2)), // 1: front=5, small
3868 make_snode(4, 2, 3, Some(5)), // 2: front=5, small (parent is large)
3869 make_snode(6, 2, 3, Some(4)), // 3: front=5, small
3870 make_snode(8, 2, 3, Some(5)), // 4: front=5, small (parent is large)
3871 make_snode(10, 100, 200, None), // 5: front=300, large
3872 ];
3873 let children = build_children_map(&supernodes);
3874
3875 let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3876
3877 // Two subtrees: {0,1,2} and {3,4}
3878 assert_eq!(subtrees.len(), 2);
3879
3880 // Subtree A: root=2, nodes=[0,1,2] in postorder
3881 assert_eq!(subtrees[0].root, 2);
3882 assert_eq!(subtrees[0].nodes, vec![0, 1, 2]);
3883 assert_eq!(subtrees[0].parent_of_root, Some(5));
3884
3885 // Subtree B: root=4, nodes=[3,4] in postorder
3886 assert_eq!(subtrees[1].root, 4);
3887 assert_eq!(subtrees[1].nodes, vec![3, 4]);
3888 assert_eq!(subtrees[1].parent_of_root, Some(5));
3889
3890 // Large root is not in_small_leaf
3891 assert!(!supernodes[5].in_small_leaf);
3892 // All others are
3893 for (s, sn) in supernodes.iter().enumerate().take(5) {
3894 assert!(sn.in_small_leaf, "snode {} should be in_small_leaf", s);
3895 }
3896 }
3897}