// rivrs_sparse/symmetric/numeric.rs

1//! Multifrontal numeric factorization for sparse symmetric indefinite matrices.
2//!
3//! Implements the multifrontal method with A Posteriori Threshold Pivoting (APTP)
4//! for computing P^T A P = L D L^T factorizations of sparse symmetric indefinite
5//! matrices. The factorization traverses the assembly tree in postorder, assembling
6//! dense frontal matrices and factoring them using the [`aptp_factor_in_place`]
7//! kernel.
8//!
9//! # Algorithm Overview
10//!
11//! For each supernode in assembly-tree postorder:
12//! 1. **Assemble** a dense frontal matrix by scattering original sparse entries
13//!    and extend-adding child contribution blocks
14//! 2. **Factor** the fully-summed portion using APTP (1x1/2x2 pivoting with delays)
15//! 3. **Extract** per-supernode factors (L11, D11, L21) and contribution block
16//! 4. **Propagate** contribution block (including delayed columns) to parent
17//!
18//! # Key Types
19//!
20//! - [`AptpNumeric`] — complete factorization result (public API)
21//! - [`FrontFactors`] — per-supernode factors (public, used by triangular solve)
22//! - [`FactorizationStats`] — aggregate statistics (public)
23//! - `SupernodeInfo` — unified supernode descriptor (internal)
24//! - `FrontalMatrix` — temporary dense assembly matrix (internal)
25//! - `ContributionBlock` — Schur complement passed to parent (internal)
26//!
27//! # References
28//!
29//! - Duff & Reid (1983), "The multifrontal solution of indefinite sparse symmetric
30//!   linear equations" — foundational multifrontal method
31//! - Liu (1992), "The Multifrontal Method for Sparse Matrix Solution: Theory and
32//!   Practice" — frontal matrices, elimination trees, extend-add
33//! - Hogg, Duff & Lopez (2020), "A New Sparse LDL^T Solver Using A Posteriori
34//!   Threshold Pivoting" — APTP algorithm integration with multifrontal framework
35
36use faer::sparse::SparseColMat;
37use faer::sparse::linalg::cholesky::SymbolicCholeskyRaw;
38use faer::{Mat, MatMut, Par};
39
40use super::diagonal::{MixedDiagonal, PivotEntry};
41use super::factor::{
42    AptpFactorResult, AptpOptions, aptp_factor_in_place, compute_contribution_gemm,
43    compute_contribution_gemm_rect, extract_contribution_rect, extract_front_factors_rect,
44    tpp_factor_rect,
45};
46use super::pivot::{Block2x2, PivotType};
47use super::symbolic::AptpSymbolic;
48use crate::error::SparseError;
49
50#[cfg(feature = "diagnostic")]
51use crate::profiling::{FinishedSession, ProfileSession};
52
/// Sentinel value indicating a global column is not part of the current front.
/// Stored in `global_to_local` maps; `usize::MAX` can never be a valid local
/// index because fronts are bounded by the matrix dimension.
const NOT_IN_FRONT: usize = usize::MAX;
55
56// ---------------------------------------------------------------------------
57// Workspace types
58// ---------------------------------------------------------------------------
59
/// Pre-allocated reusable buffers for the per-supernode factorization loop.
///
/// Eliminates per-supernode heap allocations for the frontal matrix, row indices,
/// delayed columns buffer, and global-to-local mapping. Sized to the maximum
/// front dimension from symbolic analysis and reused across supernodes.
///
/// # Invariant
///
/// Never shared between concurrent supernodes. In sequential mode, passed as
/// `&mut`. In parallel mode, each rayon worker has its own instance via
/// `Cell`-based move semantics.
///
/// # References
///
/// - Duff, Hogg & Lopez (2020), Section 5: workspace reuse strategy
/// - Liu (1992), Section 4: multifrontal workspace management
pub(crate) struct FactorizationWorkspace {
    /// Dense buffer, max_front × max_front. Reused as each supernode's frontal matrix.
    frontal_data: Mat<f64>,
    /// Buffer for frontal matrix row indices, capacity max_front.
    frontal_row_indices: Vec<usize>,
    /// Buffer for collecting delayed columns from children.
    delayed_cols_buf: Vec<usize>,
    /// Global-to-local row index mapping, length n (matrix dimension).
    /// Every entry is `NOT_IN_FRONT` except while one front is being assembled.
    global_to_local: Vec<usize>,
    /// Pre-allocated contribution buffer, max_front × max_front. Receives the
    /// deferred contribution GEMM output. Reused across supernodes via swap protocol:
    /// final GEMM writes here → buffer moved into ContributionBlock → parent's
    /// extend-add recycles buffer back.
    contrib_buffer: Mat<f64>,
    /// Pre-allocated L·D product workspace for `compute_contribution_gemm`.
    /// Sized nfs × ne (grows on demand). Eliminates per-supernode allocation
    /// in the hot loop.
    ld_workspace: Mat<f64>,
}
95
impl Default for FactorizationWorkspace {
    /// Delegates to [`FactorizationWorkspace::empty`]: an all-empty workspace,
    /// suitable as the initial value for `Cell`-based thread-local storage.
    fn default() -> Self {
        Self::empty()
    }
}
101
102impl FactorizationWorkspace {
103    /// Create a new workspace sized for the given maximum front dimension.
104    ///
105    /// # Arguments
106    ///
107    /// - `max_front`: Maximum frontal matrix dimension across all supernodes
108    /// - `n`: Matrix dimension (for global-to-local mapping)
109    pub(crate) fn new(max_front: usize, n: usize) -> Self {
110        if max_front == 0 {
111            return Self {
112                frontal_data: Mat::zeros(0, 0),
113                frontal_row_indices: Vec::new(),
114                delayed_cols_buf: Vec::new(),
115                global_to_local: vec![NOT_IN_FRONT; n],
116                contrib_buffer: Mat::zeros(0, 0),
117                ld_workspace: Mat::new(),
118            };
119        }
120        Self {
121            frontal_data: Mat::zeros(max_front, max_front),
122            frontal_row_indices: Vec::with_capacity(max_front),
123            delayed_cols_buf: Vec::with_capacity(max_front),
124            global_to_local: vec![NOT_IN_FRONT; n],
125            // Lazily allocated on first use in factor_single_supernode.
126            // Recycled via extend_add buffer return, so typically allocates once.
127            contrib_buffer: Mat::new(),
128            ld_workspace: Mat::new(),
129        }
130    }
131
132    /// Empty workspace for `Cell` default in thread-local storage.
133    const fn empty() -> Self {
134        Self {
135            frontal_data: Mat::new(),
136            frontal_row_indices: Vec::new(),
137            delayed_cols_buf: Vec::new(),
138            global_to_local: Vec::new(),
139            contrib_buffer: Mat::new(),
140            ld_workspace: Mat::new(),
141        }
142    }
143
144    /// Zero the lower triangle of the m × m subregion of `frontal_data`.
145    ///
146    /// Called after the front size `m` is known and row indices are populated.
147    /// Does NOT clear `frontal_row_indices` or `delayed_cols_buf` — those
148    /// are managed by the caller before determining `m`.
149    ///
150    /// # Panics
151    ///
152    /// Panics if `m > self.frontal_data.nrows()`.
153    fn zero_frontal(&mut self, m: usize) {
154        assert!(
155            m <= self.frontal_data.nrows(),
156            "front size {} exceeds workspace capacity {}",
157            m,
158            self.frontal_data.nrows()
159        );
160        // Zero the lower triangle of the m × m subregion via per-column fill.
161        // col_as_slice_mut gives a contiguous &mut [f64] for each column of the
162        // owned Mat, so fill(0.0) compiles to memset — much faster than indexed writes.
163        for j in 0..m {
164            self.frontal_data.col_as_slice_mut(j)[j..m].fill(0.0);
165        }
166    }
167
168    /// Ensure `global_to_local` mapping is at least `n` entries.
169    ///
170    /// Used by the parallel path where frontal_data grows on demand in
171    /// `factor_single_supernode` (avoiding eager max_front² allocation
172    /// per thread that causes OOM on large-front matrices like H2O).
173    fn ensure_g2l(&mut self, n: usize) {
174        if self.global_to_local.len() < n {
175            self.global_to_local.resize(n, NOT_IN_FRONT);
176        }
177    }
178}
179
/// Front dimension below which intra-node BLAS uses `Par::Seq` regardless of
/// the user-supplied parallelism setting. Fronts smaller than this threshold
/// do not benefit from parallel BLAS (overhead exceeds computation).
// NOTE(review): 256 appears to be an empirically chosen cutoff — confirm
// against benchmarks before tuning.
pub(crate) const INTRA_NODE_THRESHOLD: usize = 256;
184
185/// Compute an upper bound on the maximum front size across all supernodes.
186///
187/// The front size for a supernode is `owned_cols + delayed_cols + pattern.len()`.
188/// Since delayed columns depend on the actual factorization, we estimate using
189/// `owned_cols + pattern.len()` (front size without delays) and add a safety
190/// factor. If delayed columns cause a front to exceed this estimate, the
191/// workspace will be resized at that point.
192fn estimate_max_front_size(supernodes: &[SupernodeInfo]) -> usize {
193    supernodes
194        .iter()
195        .map(|sn| {
196            let owned: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
197            owned + sn.pattern.len()
198        })
199        .max()
200        .unwrap_or(0)
201}
202
203// ---------------------------------------------------------------------------
204// Assembly maps (precomputed scatter indices)
205// ---------------------------------------------------------------------------
206
/// Precomputed index mappings for assembly operations.
///
/// Eliminates per-entry index arithmetic in `scatter_original_entries_multi`
/// and `extend_add` by storing source→destination index pairs computed from
/// the symbolic structure. Computed once after amalgamation at the start of
/// factorization and read-only during the assembly loop.
///
/// Uses `u32` indices to halve memory (frontal matrices are bounded by
/// max_front_size, well within u32 range).
///
/// # Layout
///
/// Both `amap_entries` and `ea_map` use CSC-style compressed storage with
/// per-supernode/per-child offset arrays for O(1) slice access.
///
/// # References
///
/// - SPRAL `assemble.hxx:51-80` (`add_a_block`): amap-based assembly pattern
pub(crate) struct AssemblyMaps {
    /// Flattened tuples for original matrix scatter. Each entry is 4 u32 values:
    /// `[src_csc_index, dest_frontal_linear_index, scale_row, scale_col]` where
    /// `src_csc_index` is the position in the CSC values array, `dest_frontal_linear_index`
    /// is `col * m + row` in column-major layout, and `scale_row`/`scale_col` are
    /// the permuted indices for MC64 scaling lookup (`scaling[scale_row] * scaling[scale_col]`).
    /// Length: 4 * total_entries across all supernodes.
    pub amap_entries: Vec<u32>,
    /// Per-supernode start offset into `amap_entries` (in entry units; each entry
    /// is 4 u32 values). `amap_entries[amap_offsets[s]*4 .. amap_offsets[s+1]*4]`
    /// gives the flattened entries for supernode s.
    /// Length: num_supernodes + 1.
    pub amap_offsets: Vec<usize>,

    /// Flattened child→parent row index mappings for extend-add.
    /// For each child's contribution row, stores the local row index in the
    /// parent's frontal matrix (assuming zero delays).
    pub ea_map: Vec<u32>,
    /// Per-child start offset into `ea_map`. Indexed by the flattened child
    /// enumeration (all children of all supernodes in tree order).
    /// Length: total_children + 1.
    pub ea_offsets: Vec<usize>,
    /// Per-supernode start offset into the children enumeration.
    /// `ea_snode_child_begin[s]..ea_snode_child_begin[s+1]` gives the range
    /// in `ea_offsets` for supernode s's children.
    /// Length: num_supernodes + 1.
    pub ea_snode_child_begin: Vec<usize>,
}
253
/// Build precomputed assembly maps from post-amalgamation supernode structure.
///
/// Walks every supernode once, reconstructing its no-delay frontal row
/// structure, and records u32 scatter (amap) and extend-add (ea) indices into
/// the flat arrays described on [`AssemblyMaps`].
///
/// # Arguments
///
/// - `supernodes`: Post-amalgamation supernode descriptors.
/// - `children`: Children map (children[s] = list of child supernode indices).
/// - `matrix`: Original sparse matrix (for CSC structure).
/// - `perm_fwd`: Forward permutation (perm_fwd[new] = old).
/// - `perm_inv`: Inverse permutation (perm_inv[old] = new).
/// - `n`: Matrix dimension.
fn build_assembly_maps(
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    n: usize,
) -> AssemblyMaps {
    let n_supernodes = supernodes.len();
    let symbolic = matrix.symbolic();
    let col_ptrs = symbolic.col_ptr();
    let row_indices_csc = symbolic.row_idx();

    // ---- Build amap (original entry scatter maps) ----
    // Each entry is 4 u32: [src_csc_index, dest_frontal_linear, scale_row, scale_col]
    // where scale_row = perm_inv[orig_row], scale_col = perm_inv[orig_col]
    // (precomputed for efficient MC64 scaling at assembly time).

    let mut amap_entries = Vec::new();
    let mut amap_offsets = vec![0usize; n_supernodes + 1];

    // Temporary global-to-local mapping, reused per supernode.
    // Invariant: all entries are NOT_IN_FRONT between supernode iterations.
    let mut g2l = vec![NOT_IN_FRONT; n];

    for (s, sn) in supernodes.iter().enumerate() {
        // Compute front structure for this supernode (without delays):
        // owned columns first, then the off-diagonal pattern rows.
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let mut frontal_rows: Vec<usize> = Vec::with_capacity(sn_ncols + sn.pattern.len());
        for range in &sn.owned_ranges {
            frontal_rows.extend(range.clone());
        }
        frontal_rows.extend_from_slice(&sn.pattern);
        let m = frontal_rows.len();

        // Build global-to-local
        for (i, &global) in frontal_rows.iter().enumerate() {
            g2l[global] = i;
        }

        // Owned columns occupy local indices 0..total_owned by construction.
        let total_owned = sn_ncols;

        // Iterate owned columns and build amap entries
        for range in &sn.owned_ranges {
            for pj in range.clone() {
                let orig_col = perm_fwd[pj];
                let local_col = g2l[pj];

                let start = col_ptrs[orig_col];
                let end = col_ptrs[orig_col + 1];
                // take(end).skip(start) walks exactly CSC positions start..end of
                // column orig_col (skip on a slice iterator is O(1), not a scan).
                for (idx, &orig_row) in row_indices_csc.iter().enumerate().take(end).skip(start) {
                    // Upper-triangle dedup: skip if both are owned supernode cols
                    // (otherwise the symmetric mirror entry would be scattered twice).
                    if orig_row < orig_col {
                        let perm_row = perm_inv[orig_row];
                        let local_peer = g2l[perm_row];
                        if local_peer != NOT_IN_FRONT && local_peer < total_owned {
                            continue;
                        }
                    }
                    let global_row = perm_inv[orig_row];
                    let local_row = g2l[global_row];
                    if local_row == NOT_IN_FRONT {
                        // Entry does not fall in this front (handled by an ancestor).
                        continue;
                    }

                    // Determine destination in lower triangle
                    let (dest_row, dest_col) = if local_row >= local_col {
                        (local_row, local_col)
                    } else {
                        (local_col, local_row)
                    };
                    let dest_linear = dest_col * m + dest_row;

                    amap_entries.push(idx as u32);
                    amap_entries.push(dest_linear as u32);
                    amap_entries.push(global_row as u32); // perm_inv[orig_row]
                    amap_entries.push(pj as u32); // perm_inv[orig_col] = pj
                }
            }
        }

        // Reset g2l so the next supernode starts from a clean mapping.
        for &global in &frontal_rows {
            g2l[global] = NOT_IN_FRONT;
        }

        // Offsets are stored in entry units (4 u32 values per entry).
        let entry_end = amap_entries.len() / 4;
        amap_offsets[s + 1] = entry_end;
    }

    // ---- Build extend-add maps ----

    let mut ea_map = Vec::new();
    let mut ea_offsets = vec![0usize];
    let mut ea_snode_child_begin = vec![0usize; n_supernodes + 1];

    for (s, sn) in supernodes.iter().enumerate() {
        // Build parent's frontal row structure (without delays)
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let mut parent_rows: Vec<usize> = Vec::with_capacity(sn_ncols + sn.pattern.len());
        for range in &sn.owned_ranges {
            parent_rows.extend(range.clone());
        }
        parent_rows.extend_from_slice(&sn.pattern);

        // Build g2l for parent
        for (i, &global) in parent_rows.iter().enumerate() {
            g2l[global] = i;
        }

        for &c in &children[s] {
            let child_sn = &supernodes[c];

            // Child's contribution rows = child's pattern (off-diagonal rows).
            // Without delays, the contribution block rows are exactly the pattern.
            // With delays, additional delayed rows appear — the precomputed map
            // won't cover those (fallback to g2l at factorization time).
            for &child_row in &child_sn.pattern {
                let parent_local = g2l[child_row];
                debug_assert!(
                    parent_local != NOT_IN_FRONT,
                    "child pattern row {} not in parent supernode {}",
                    child_row,
                    s
                );
                ea_map.push(parent_local as u32);
            }
            ea_offsets.push(ea_map.len());
        }

        // Reset g2l
        for &global in &parent_rows {
            g2l[global] = NOT_IN_FRONT;
        }

        // ea_offsets.len() - 1 == number of children enumerated so far,
        // i.e. the begin index of the NEXT supernode's children.
        ea_snode_child_begin[s + 1] = ea_offsets.len() - 1;
    }

    AssemblyMaps {
        amap_entries,
        amap_offsets,
        ea_map,
        ea_offsets,
        ea_snode_child_begin,
    }
}
409
410// ---------------------------------------------------------------------------
411// Internal types
412// ---------------------------------------------------------------------------
413
/// Precomputed supernode descriptor unifying faer's supernodal and simplicial
/// decompositions.
///
/// Built once from [`AptpSymbolic`] before the factorization loop via
/// [`build_supernode_info`]. For supernodal decompositions, maps directly to
/// faer's supernode structure. For simplicial decompositions, each column
/// becomes a trivial 1-column supernode.
///
/// # Invariants
///
/// - `col_end > col_begin`
/// - `parent.map_or(true, |p| p > s)` for supernode index `s` (postorder)
#[derive(Clone)]
pub(crate) struct SupernodeInfo {
    /// First column index of this supernode (inclusive).
    pub col_begin: usize,
    /// Past-the-end column index (exclusive).
    pub col_end: usize,
    /// Off-diagonal row indices (global permuted column indices of non-fully-summed rows).
    pub pattern: Vec<usize>,
    /// Parent supernode index in assembly tree, or `None` for root.
    pub parent: Option<usize>,
    /// Column ranges actually owned by this supernode. After amalgamation of
    /// non-adjacent supernodes, `col_begin..col_end` may span columns belonging
    /// to other supernodes. `owned_ranges` tracks the true owned columns for
    /// scatter. Default: `[col_begin..col_end]`.
    pub owned_ranges: Vec<std::ops::Range<usize>>,
    /// True if this supernode belongs to a classified small-leaf subtree.
    /// Set by `classify_small_leaf_subtrees()` after amalgamation.
    pub in_small_leaf: bool,
}

// Reference: Liu (1992), Section 3 — supernode definitions and assembly trees.
449
/// A classified small-leaf subtree eligible for the fast-path factorization.
///
/// Produced by [`classify_small_leaf_subtrees`] after amalgamation and consumed
/// by `factor_small_leaf_subtree` during the pre-pass before the main level-set
/// loop. All supernodes in the subtree have front_size < `small_leaf_threshold`,
/// enabling a streamlined code path with a single small reusable workspace.
///
/// # Invariants
///
/// - `nodes.len() >= 2` (single-node "subtrees" are excluded)
/// - `*nodes.last().unwrap() == root`
/// - All nodes have `in_small_leaf = true`
/// - `max_front_size < small_leaf_threshold`
/// - Nodes are in postorder: children appear before parents
///
/// # References
///
/// - SPRAL `SymbolicSubtree.hxx:57-84` (BSD-3): subtree classification
/// - SPRAL `SmallLeafNumericSubtree.hxx:187-446` (BSD-3): fast-path factorization
pub(crate) struct SmallLeafSubtree {
    /// Supernode ID of the subtree root (topmost node in the subtree).
    /// Used by tests for invariant verification; always equals `*nodes.last()`.
    #[allow(dead_code)]
    pub root: usize,
    /// Supernode IDs in postorder (leaves first, root last).
    pub nodes: Vec<usize>,
    /// Maximum front size across all nodes in the subtree.
    pub max_front_size: usize,
    /// Parent supernode outside the subtree that receives the root's contribution,
    /// or `None` if the subtree root is also a tree root.
    pub parent_of_root: Option<usize>,
}
482
483// No methods needed — fields accessed directly within this module.
484
/// Temporary dense matrix for assembling and factoring one supernode.
///
/// References workspace buffers during the factorization loop, or owns its
/// data in diagnostic contexts (`export_assembled_frontal`).
///
/// The matrix is partitioned into:
/// - `F11 = data[0..k, 0..k]` — fully-summed block (factored by APTP)
/// - `F21 = data[k..m, 0..k]` — subdiagonal block
/// - `F22 = data[k..m, k..m]` — contribution block (Schur complement)
///
/// where `k = num_fully_summed` and `m = data.nrows()`.
pub(crate) struct FrontalMatrix<'a> {
    /// Dense m × m storage (lower triangle used). Borrows from workspace.
    pub data: MatMut<'a, f64>,
    /// Global permuted column indices for each local row (length m).
    /// Used by callers that need index mapping (e.g., scatter maps).
    #[allow(dead_code)]
    pub row_indices: &'a [usize],
    /// Number of fully-summed rows/columns (supernode cols + delayed from children).
    pub num_fully_summed: usize,
}
506
/// Schur complement and delayed columns from a factored supernode.
///
/// Created by [`extract_contribution`] and consumed by [`extend_add`] into
/// the parent supernode's frontal matrix.
///
/// # Structure
///
/// - Positions `0..num_delayed`: delayed columns (become fully-summed at parent)
/// - Positions `num_delayed..size`: non-fully-summed rows with Schur complement
pub(crate) struct ContributionBlock {
    /// Dense (m - ne) × (m - ne) trailing submatrix from factored frontal matrix,
    /// where ne is the number of columns eliminated at the child.
    pub data: Mat<f64>,
    /// Global permuted column indices for rows/columns of the contribution.
    pub row_indices: Vec<usize>,
    /// Number of delayed columns at the start of this block.
    pub num_delayed: usize,
}
524
525// ---------------------------------------------------------------------------
526// Public types
527// ---------------------------------------------------------------------------
528
/// Per-supernode factorization result.
///
/// Stores the L11, D11, and L21 blocks extracted from the factored frontal
/// matrix, along with permutation and index information needed by the
/// triangular solve. Notation: `ne` = columns eliminated, `k` = fully-summed
/// columns presented to the kernel (ne ≤ k), `r` = number of L21 rows.
///
/// # Invariants
///
/// - `l11.nrows() == l11.ncols() == num_eliminated`
/// - `d11.dimension() == num_eliminated`
/// - `l21.ncols() == num_eliminated`
/// - `l21.nrows() == row_indices.len()`
/// - `col_indices.len() == num_eliminated`
#[derive(Debug)]
pub struct FrontFactors {
    /// Unit lower triangular factor (ne × ne).
    pub(crate) l11: Mat<f64>,
    /// Block diagonal with 1x1 and 2x2 pivots (ne entries).
    pub(crate) d11: MixedDiagonal,
    /// Subdiagonal factor block (r × ne).
    pub(crate) l21: Mat<f64>,
    /// APTP local pivot permutation (length k). Maps factored position to
    /// original front-local column.
    pub(crate) local_perm: Vec<usize>,
    /// Number of columns successfully eliminated (ne ≤ k).
    pub(crate) num_eliminated: usize,
    /// Global permuted column indices for the eliminated columns (length ne).
    pub(crate) col_indices: Vec<usize>,
    /// Global permuted row indices for L21 rows (length r).
    pub(crate) row_indices: Vec<usize>,
}
560
impl FrontFactors {
    // Read-only accessors over the private fields; see the struct-level
    // invariants for the dimension relationships between blocks.

    /// Unit lower triangular factor L11 (ne × ne).
    pub fn l11(&self) -> &Mat<f64> {
        &self.l11
    }

    /// Block diagonal D11 with mixed 1x1/2x2 pivots.
    pub fn d11(&self) -> &MixedDiagonal {
        &self.d11
    }

    /// Subdiagonal factor block L21 (r × ne).
    pub fn l21(&self) -> &Mat<f64> {
        &self.l21
    }

    /// APTP local pivot permutation within this front.
    pub fn local_perm(&self) -> &[usize] {
        &self.local_perm
    }

    /// Number of successfully eliminated columns.
    pub fn num_eliminated(&self) -> usize {
        self.num_eliminated
    }

    /// Global permuted column indices for eliminated columns.
    pub fn col_indices(&self) -> &[usize] {
        &self.col_indices
    }

    /// Global permuted row indices for L21 rows.
    pub fn row_indices(&self) -> &[usize] {
        &self.row_indices
    }
}
597
/// Per-supernode diagnostic statistics.
///
/// Collected during [`AptpNumeric::factor`] for each supernode processed.
/// Provides visibility into per-front behavior for analysis and comparison
/// with reference solvers (e.g., SPRAL). Timing fields exist only under the
/// `diagnostic` feature.
#[derive(Debug, Clone)]
pub struct PerSupernodeStats {
    /// Supernode index in postorder.
    pub snode_id: usize,
    /// Frontal matrix dimension (m).
    pub front_size: usize,
    /// Number of fully-summed rows/columns (k = supernode cols + delayed from children).
    pub num_fully_summed: usize,
    /// Number of columns successfully eliminated (ne <= k).
    pub num_eliminated: usize,
    /// Number of delayed columns (k - ne).
    pub num_delayed: usize,
    /// Number of 1x1 pivots accepted at this supernode.
    pub num_1x1: usize,
    /// Number of 2x2 pivot pairs accepted at this supernode.
    pub num_2x2: usize,
    /// Maximum absolute L entry at this supernode (stability metric).
    pub max_l_entry: f64,
    /// Wall-clock time for scatter + extend-add assembly.
    #[cfg(feature = "diagnostic")]
    pub assembly_time: std::time::Duration,
    /// Wall-clock time for dense APTP kernel.
    #[cfg(feature = "diagnostic")]
    pub kernel_time: std::time::Duration,
    /// Wall-clock time for front factor extraction.
    #[cfg(feature = "diagnostic")]
    pub extraction_time: std::time::Duration,
    // -- Sub-phase timing (finer-grained diagnostic breakdown) --
    /// Wall-clock time for zeroing the frontal matrix (memset-equivalent).
    #[cfg(feature = "diagnostic")]
    pub zero_time: std::time::Duration,
    /// Wall-clock time for building the global-to-local mapping.
    #[cfg(feature = "diagnostic")]
    pub g2l_time: std::time::Duration,
    /// Wall-clock time for scattering original CSC entries into the frontal matrix.
    #[cfg(feature = "diagnostic")]
    pub scatter_time: std::time::Duration,
    /// Wall-clock time for extend-add (merging child contributions).
    #[cfg(feature = "diagnostic")]
    pub extend_add_time: std::time::Duration,
    /// Wall-clock time for extracting L11, D11, L21 from the factored frontal matrix.
    #[cfg(feature = "diagnostic")]
    pub extract_factors_time: std::time::Duration,
    /// Wall-clock time for extracting the contribution block.
    #[cfg(feature = "diagnostic")]
    pub extract_contrib_time: std::time::Duration,
    /// Wall-clock time for the deferred contribution GEMM.
    #[cfg(feature = "diagnostic")]
    pub contrib_gemm_time: std::time::Duration,
    /// Wall-clock time for resetting the global-to-local mapping after factoring.
    #[cfg(feature = "diagnostic")]
    pub g2l_reset_time: std::time::Duration,
}
656
/// Aggregate statistics from multifrontal factorization.
///
/// Reports pivot counts, delay events, and front sizes across all supernodes.
/// The `total_*_time` fields (feature `diagnostic`) are sums of the matching
/// fields in [`PerSupernodeStats`].
#[derive(Debug, Clone)]
pub struct FactorizationStats {
    /// Total 1x1 pivots across all supernodes.
    pub total_1x1_pivots: usize,
    /// Total 2x2 pivot pairs across all supernodes.
    pub total_2x2_pivots: usize,
    /// Total delay events across all supernodes.
    pub total_delayed: usize,
    /// Columns that could not be eliminated at root supernodes (zero pivots).
    /// These represent rank deficiency in the matrix. A nonzero value means
    /// the matrix is numerically singular (or nearly so) — the solve phase
    /// must handle this appropriately.
    pub zero_pivots: usize,
    /// Largest frontal matrix dimension encountered.
    pub max_front_size: usize,
    /// Number of supernodes before amalgamation.
    pub supernodes_before_amalgamation: usize,
    /// Number of supernodes after amalgamation.
    pub supernodes_after_amalgamation: usize,
    /// Number of merge operations performed during amalgamation.
    pub merges_performed: usize,
    /// Sum of assembly times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_assembly_time: std::time::Duration,
    /// Sum of kernel times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_kernel_time: std::time::Duration,
    /// Sum of extraction times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extraction_time: std::time::Duration,
    // -- Sub-phase totals --
    /// Sum of zeroing times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_zero_time: std::time::Duration,
    /// Sum of g2l build times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_g2l_time: std::time::Duration,
    /// Sum of scatter times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_scatter_time: std::time::Duration,
    /// Sum of extend-add times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extend_add_time: std::time::Duration,
    /// Sum of factor extraction times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extract_factors_time: std::time::Duration,
    /// Sum of contribution extraction times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_extract_contrib_time: std::time::Duration,
    /// Sum of deferred contribution GEMM times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_contrib_gemm_time: std::time::Duration,
    /// Sum of g2l reset times across all supernodes.
    #[cfg(feature = "diagnostic")]
    pub total_g2l_reset_time: std::time::Duration,
    /// Number of small-leaf subtrees identified by classification.
    pub small_leaf_subtrees: usize,
    /// Number of supernodes processed via the small-leaf fast path.
    pub small_leaf_nodes: usize,
}
720
/// Assembled frontal matrix exported for diagnostic comparison.
///
/// Owns a snapshot of one front after assembly (before factorization);
/// only available with the `diagnostic` feature.
#[cfg(feature = "diagnostic")]
pub struct ExportedFrontal {
    /// Dense m x m frontal matrix (lower triangle populated).
    pub data: Mat<f64>,
    /// Total front size.
    pub front_size: usize,
    /// Number of fully-summed columns.
    pub num_fully_summed: usize,
    /// Global permuted column indices for each local row.
    pub row_indices: Vec<usize>,
}
733
/// Complete multifrontal numeric factorization result.
///
/// Contains per-supernode factors and aggregate statistics. Created by
/// [`AptpNumeric::factor`] and used by the triangular solve to
/// perform forward/backward substitution.
///
/// # Usage
///
/// ```
/// use faer::sparse::{SparseColMat, Triplet};
/// use faer::sparse::linalg::cholesky::SymmetricOrdering;
/// use rivrs_sparse::symmetric::{AptpSymbolic, AptpOptions, AptpNumeric};
///
/// // Analyze
/// # let triplets = vec![Triplet::new(0, 0, 1.0)];
/// # let matrix = SparseColMat::try_new_from_triplets(1, 1, &triplets).unwrap();
/// let symbolic = AptpSymbolic::analyze(matrix.symbolic(), SymmetricOrdering::Amd).unwrap();
///
/// // Factor
/// let numeric = AptpNumeric::factor(&symbolic, &matrix, &AptpOptions::default(), None).unwrap();
/// println!("Stats: {:?}", numeric.stats());
/// ```
pub struct AptpNumeric {
    /// Per-supernode factors, indexed by supernode ID (postorder).
    front_factors: Vec<FrontFactors>,
    /// Aggregate factorization statistics (pivot/delay counts, timings).
    stats: FactorizationStats,
    /// Per-supernode diagnostic statistics (same order as `front_factors`).
    per_supernode_stats: Vec<PerSupernodeStats>,
    /// Matrix dimension `n` (square matrices only).
    n: usize,
    /// Profiling session from factorization (Chrome Trace export).
    /// `Some(..)` whenever built via `factor` with the `diagnostic` feature.
    #[cfg(feature = "diagnostic")]
    profile_session: Option<FinishedSession>,
}
769
770impl std::fmt::Debug for AptpNumeric {
771    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
772        f.debug_struct("AptpNumeric")
773            .field("n", &self.n)
774            .field("stats", &self.stats)
775            .field("front_factors_count", &self.front_factors.len())
776            .finish()
777    }
778}
779
780impl AptpNumeric {
781    /// Per-supernode factors, indexed by supernode ID.
782    pub fn front_factors(&self) -> &[FrontFactors] {
783        &self.front_factors
784    }
785
    /// Aggregate factorization statistics.
    ///
    /// Pivot/delay counts, amalgamation summary, max front size, and (with
    /// the `diagnostic` feature) per-phase timing totals.
    pub fn stats(&self) -> &FactorizationStats {
        &self.stats
    }
790
791    /// Per-supernode diagnostic statistics.
792    ///
793    /// One entry per supernode in postorder, matching [`front_factors`](Self::front_factors).
794    pub fn per_supernode_stats(&self) -> &[PerSupernodeStats] {
795        &self.per_supernode_stats
796    }
797
    /// Matrix dimension.
    ///
    /// Equals `symbolic.nrows()` of the analysis this factorization was built from.
    pub fn n(&self) -> usize {
        self.n
    }
802
    /// Profiling session from factorization, for Chrome Trace export.
    ///
    /// Returns `None` if factorization was not profiled.
    /// Only available with `diagnostic` feature.
    ///
    /// NOTE(review): `factor` always stores `Some(..)` under `diagnostic`, so
    /// `None` is only reachable through other construction paths — confirm if
    /// any exist.
    #[cfg(feature = "diagnostic")]
    pub fn profile_session(&self) -> Option<&FinishedSession> {
        self.profile_session.as_ref()
    }
811
    /// Factor a sparse symmetric matrix using the multifrontal method with APTP.
    ///
    /// Traverses the assembly tree in postorder, assembling and factoring dense
    /// frontal matrices at each supernode using the dense APTP kernel.
    ///
    /// # Arguments
    ///
    /// - `symbolic`: Symbolic analysis result from [`AptpSymbolic::analyze`]
    /// - `matrix`: Sparse symmetric matrix (lower triangle stored). Dimensions
    ///   must match `symbolic.nrows()`
    /// - `options`: APTP configuration (threshold, fallback strategy)
    /// - `scaling`: Optional symmetric scaling factors; when present each
    ///   assembled entry is multiplied by the two scale factors of its row
    ///   and column during frontal assembly
    ///
    /// # Errors
    ///
    /// - [`SparseError::DimensionMismatch`] if matrix dimensions don't match symbolic
    /// - [`SparseError::AnalysisFailure`] if symbolic analysis is inconsistent
    ///
    /// # Zero Pivots
    ///
    /// If a root supernode has columns that cannot be eliminated (all pivots
    /// delayed to root but still fail the threshold), these are recorded as
    /// zero pivots in [`FactorizationStats::zero_pivots`] rather than returning
    /// an error. The factorization succeeds;
    /// the solve phase must handle the rank deficiency.
    ///
    /// # References
    ///
    /// - Duff & Reid (1983), "The multifrontal solution of indefinite sparse
    ///   symmetric linear equations"
    /// - Liu (1992), "The Multifrontal Method for Sparse Matrix Solution"
    /// - Hogg, Duff & Lopez (2020), "A New Sparse LDL^T Solver Using A Posteriori
    ///   Threshold Pivoting"
    pub fn factor(
        symbolic: &AptpSymbolic,
        matrix: &SparseColMat<usize, f64>,
        options: &AptpOptions,
        scaling: Option<&[f64]>,
    ) -> Result<Self, SparseError> {
        let nemin = options.nemin;
        let n = symbolic.nrows();

        // Validate dimensions
        if matrix.nrows() != n || matrix.ncols() != n {
            return Err(SparseError::DimensionMismatch {
                expected: (n, n),
                got: (matrix.nrows(), matrix.ncols()),
                context: "Matrix dimensions must match symbolic analysis".to_string(),
            });
        }

        // Build supernode info (unified abstraction over supernodal/simplicial)
        let supernodes = build_supernode_info(symbolic);
        let n_supernodes_before = supernodes.len();
        let mut supernodes = super::amalgamation::amalgamate(supernodes, nemin);
        let n_supernodes = supernodes.len();
        // Amalgamation only merges nodes, so the count can only shrink.
        let merges_performed = n_supernodes_before - n_supernodes;
        let children = build_children_map(&supernodes);

        // Classify small-leaf subtrees for the fast-path pre-pass
        let small_leaf_subtrees =
            classify_small_leaf_subtrees(&mut supernodes, &children, options.small_leaf_threshold);

        // Get fill-reducing permutation
        let (perm_fwd, perm_inv) = symbolic.perm_vecs();

        // Precompute assembly index maps (scatter + extend-add) for the
        // zero-delay case. Supernodes with delayed children fall back to
        // per-entry index arithmetic.
        let assembly_maps =
            build_assembly_maps(&supernodes, &children, matrix, &perm_fwd, &perm_inv, n);

        // Profiling session (diagnostic only)
        #[cfg(feature = "diagnostic")]
        let session = ProfileSession::new();
        #[cfg(feature = "diagnostic")]
        let _factor_loop_guard = session.enter_section("factor_loop");

        // Iterative level-set factorization: process all ready (leaf) nodes first,
        // then propagate upward. No recursion — avoids stack overflow on deep
        // elimination trees (e.g., c-71 with 76K supernodes on rayon's 2MB stacks).
        let all_node_results = factor_tree_levelset(
            &supernodes,
            &children,
            matrix,
            &perm_fwd,
            &perm_inv,
            options,
            scaling,
            n,
            &assembly_maps,
            &small_leaf_subtrees,
        )?;

        #[cfg(feature = "diagnostic")]
        drop(_factor_loop_guard);

        // Scatter returned results into supernode-indexed vectors
        // (each result tuple carries its own supernode index).
        let mut front_factors_vec: Vec<Option<FrontFactors>> =
            (0..n_supernodes).map(|_| None).collect();
        let mut per_sn_stats_vec: Vec<Option<PerSupernodeStats>> =
            (0..n_supernodes).map(|_| None).collect();
        for (idx, ff, stats) in all_node_results {
            front_factors_vec[idx] = Some(ff);
            per_sn_stats_vec[idx] = Some(stats);
        }

        // A gap here means the level-set traversal skipped a node — an
        // internal invariant violation, hence panic rather than Err.
        let front_factors: Vec<FrontFactors> = front_factors_vec
            .into_iter()
            .enumerate()
            .map(|(i, opt)| opt.unwrap_or_else(|| panic!("supernode {} not factored", i)))
            .collect();
        let per_sn_stats: Vec<PerSupernodeStats> = per_sn_stats_vec
            .into_iter()
            .enumerate()
            .map(|(i, opt)| opt.unwrap_or_else(|| panic!("supernode {} missing stats", i)))
            .collect();

        // Compute aggregate stats from per-supernode stats
        let mut stats = FactorizationStats {
            total_1x1_pivots: 0,
            total_2x2_pivots: 0,
            total_delayed: 0,
            zero_pivots: 0,
            max_front_size: 0,
            supernodes_before_amalgamation: n_supernodes_before,
            supernodes_after_amalgamation: n_supernodes,
            merges_performed,
            #[cfg(feature = "diagnostic")]
            total_assembly_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_kernel_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extraction_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_zero_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_g2l_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_scatter_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extend_add_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extract_factors_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_extract_contrib_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_contrib_gemm_time: std::time::Duration::ZERO,
            #[cfg(feature = "diagnostic")]
            total_g2l_reset_time: std::time::Duration::ZERO,
            small_leaf_subtrees: small_leaf_subtrees.len(),
            small_leaf_nodes: small_leaf_subtrees.iter().map(|s| s.nodes.len()).sum(),
        };
        for sn_stat in &per_sn_stats {
            stats.total_1x1_pivots += sn_stat.num_1x1;
            stats.total_2x2_pivots += sn_stat.num_2x2;
            stats.total_delayed += sn_stat.num_delayed;
            if sn_stat.front_size > stats.max_front_size {
                stats.max_front_size = sn_stat.front_size;
            }
            #[cfg(feature = "diagnostic")]
            {
                stats.total_assembly_time += sn_stat.assembly_time;
                stats.total_kernel_time += sn_stat.kernel_time;
                stats.total_extraction_time += sn_stat.extraction_time;
                stats.total_zero_time += sn_stat.zero_time;
                stats.total_g2l_time += sn_stat.g2l_time;
                stats.total_scatter_time += sn_stat.scatter_time;
                stats.total_extend_add_time += sn_stat.extend_add_time;
                stats.total_extract_factors_time += sn_stat.extract_factors_time;
                stats.total_extract_contrib_time += sn_stat.extract_contrib_time;
                stats.total_contrib_gemm_time += sn_stat.contrib_gemm_time;
                stats.total_g2l_reset_time += sn_stat.g2l_reset_time;
            }
        }

        // Count zero pivots from front factors
        // NOTE(review): exact `== 0.0` compare — assumes the APTP kernel
        // records failed pivots as exactly 0.0; confirm in aptp_factor_in_place.
        for ff in &front_factors {
            for (_, entry) in ff.d11().iter_pivots() {
                if let PivotEntry::OneByOne(val) = entry {
                    if val == 0.0 {
                        stats.zero_pivots += 1;
                    }
                }
            }
        }
        // Also count unresolved delays at root supernodes: a root has no
        // parent to delay to, so uneliminated fully-summed columns are
        // rank deficiency, reported as zero pivots (see doc above).
        for (s, sn) in supernodes.iter().enumerate() {
            if sn.parent.is_none() {
                let sn_stat = &per_sn_stats[s];
                let m = sn_stat.front_size;
                let ne = sn_stat.num_eliminated;
                if ne < m {
                    let k = sn_stat.num_fully_summed;
                    let n_unresolved = k - ne;
                    stats.zero_pivots += n_unresolved;
                }
            }
        }

        // Finish profiling session
        #[cfg(feature = "diagnostic")]
        let finished_session = session.finish();

        Ok(AptpNumeric {
            front_factors,
            stats,
            per_supernode_stats: per_sn_stats,
            n,
            #[cfg(feature = "diagnostic")]
            profile_session: Some(finished_session),
        })
    }
1024
    /// Export the assembled (pre-factorization) frontal matrix for a specific supernode.
    ///
    /// Runs the multifrontal assembly up to the target supernode, factoring all
    /// preceding supernodes to generate their contributions, then returns the
    /// assembled frontal matrix for the target **before** APTP factoring.
    ///
    /// # Selecting the Target Supernode
    ///
    /// If `target_snode` is `None`, the supernode with the largest front size is used.
    /// Otherwise, the specified supernode index is used.
    ///
    /// # Use Case
    ///
    /// This is a diagnostic tool for comparing our APTP dense kernel with SPRAL's
    /// `ldlt_app_factor` on the same assembled input. The exported matrix can be
    /// written to a file and fed to a SPRAL driver.
    ///
    /// NOTE(review): unlike [`AptpNumeric::factor`], this path performs no
    /// amalgamation and no small-leaf classification, so supernode indices
    /// refer to the un-amalgamated tree — confirm this matches callers' intent.
    #[cfg(feature = "diagnostic")]
    pub fn export_assembled_frontal(
        symbolic: &AptpSymbolic,
        matrix: &SparseColMat<usize, f64>,
        options: &AptpOptions,
        scaling: Option<&[f64]>,
        target_snode: Option<usize>,
    ) -> Result<ExportedFrontal, SparseError> {
        let n = symbolic.nrows();

        // Dimensions must agree with the symbolic analysis.
        if matrix.nrows() != n || matrix.ncols() != n {
            return Err(SparseError::DimensionMismatch {
                expected: (n, n),
                got: (matrix.nrows(), matrix.ncols()),
                context: "Matrix dimensions must match symbolic analysis".to_string(),
            });
        }

        let supernodes = build_supernode_info(symbolic);
        let n_supernodes = supernodes.len();
        let children = build_children_map(&supernodes);
        let (perm_fwd, perm_inv) = symbolic.perm_vecs();

        // Determine target supernode
        let target = match target_snode {
            Some(t) => {
                if t >= n_supernodes {
                    return Err(SparseError::AnalysisFailure {
                        reason: format!(
                            "Target supernode {} out of range (n_supernodes={})",
                            t, n_supernodes
                        ),
                    });
                }
                t
            }
            None => {
                // Find supernode with largest front
                let mut best = 0;
                let mut best_m = 0;
                for (s, sn) in supernodes.iter().enumerate() {
                    let sn_ncols = sn.col_end - sn.col_begin;
                    // Approximate: delayed cols not counted here, just pattern + cols
                    let m_approx = sn_ncols + sn.pattern.len();
                    if m_approx > best_m {
                        best_m = m_approx;
                        best = s;
                    }
                }
                best
            }
        };

        let mut global_to_local = vec![NOT_IN_FRONT; n];
        let mut contributions: Vec<Option<ContributionBlock>> =
            (0..n_supernodes).map(|_| None).collect();

        // Process supernodes in postorder up to and including target
        // (postorder guarantees every child of `target` is processed first).
        for s in 0..=target {
            let sn = &supernodes[s];

            // 1. Collect delayed columns from children
            let mut delayed_cols: Vec<usize> = Vec::new();
            for &c in &children[s] {
                if let Some(ref cb) = contributions[c] {
                    delayed_cols.extend_from_slice(&cb.row_indices[..cb.num_delayed]);
                }
            }

            // 2. Compute frontal matrix structure
            //    Front row order: owned columns, then delayed columns, then pattern.
            let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
            let k = sn_ncols + delayed_cols.len();
            let mut frontal_rows: Vec<usize> = Vec::with_capacity(k + sn.pattern.len());
            for range in &sn.owned_ranges {
                frontal_rows.extend(range.clone());
            }
            frontal_rows.extend_from_slice(&delayed_cols);
            frontal_rows.extend_from_slice(&sn.pattern);
            let m = frontal_rows.len();

            // 3. Build global-to-local mapping
            for (i, &global) in frontal_rows.iter().enumerate() {
                global_to_local[global] = i;
            }

            // 4. Assemble frontal matrix (owned data — diagnostic path)
            let mut frontal_data = Mat::zeros(m, m);
            {
                let mut frontal = FrontalMatrix {
                    data: frontal_data.as_mut(),
                    row_indices: &frontal_rows,
                    num_fully_summed: k,
                };

                scatter_original_entries_multi(
                    &mut frontal,
                    matrix,
                    &perm_fwd,
                    &perm_inv,
                    &global_to_local,
                    &sn.owned_ranges,
                    scaling,
                );

                for &c in &children[s] {
                    if let Some(cb) = contributions[c].take() {
                        // Recycled buffer is deliberately discarded: this
                        // diagnostic path does not reuse workspace allocations.
                        let _ = extend_add(&mut frontal, cb, &global_to_local);
                    }
                }
            }

            if s == target {
                // Return the assembled frontal matrix BEFORE factoring
                // Clean up global_to_local
                for &global in &frontal_rows {
                    global_to_local[global] = NOT_IN_FRONT;
                }
                return Ok(ExportedFrontal {
                    data: frontal_data,
                    front_size: m,
                    num_fully_summed: k,
                    row_indices: frontal_rows,
                });
            }

            // Factor this supernode (needed for contribution propagation)
            let result = aptp_factor_in_place(frontal_data.as_mut(), k, options)?;
            let ne = result.num_eliminated;

            if sn.parent.is_some() && ne < m {
                let nfs = m - k;
                let mut cb = Mat::zeros(m, m);
                if nfs > 0 {
                    // Scratch for the GEMM; ne.max(1) keeps at least one column.
                    let mut ld_ws = Mat::zeros(m, ne.max(1));
                    compute_contribution_gemm(
                        &frontal_data,
                        k,
                        ne,
                        m,
                        &result.d,
                        &mut cb,
                        &mut ld_ws,
                        Par::Seq,
                    );
                }
                contributions[s] = Some(extract_contribution(
                    &frontal_data,
                    m,
                    k,
                    &frontal_rows,
                    &result,
                    cb,
                ));
            }

            // Cleanup
            for &global in &frontal_rows {
                global_to_local[global] = NOT_IN_FRONT;
            }
        }

        // Should not reach here — we return inside the loop at s == target
        unreachable!("Target supernode {} not reached in postorder loop", target)
    }
1205}
1206
1207// ---------------------------------------------------------------------------
1208// Tree-level factorization functions
1209// ---------------------------------------------------------------------------
1210
/// Result of factoring a single supernode.
struct SupernodeResult {
    /// Extracted per-supernode factors (L11, D11, L21).
    ff: FrontFactors,
    /// Schur-complement block for the parent front; `None` for root
    /// supernodes and for fully-eliminated fronts.
    contribution: Option<ContributionBlock>,
    /// Per-supernode diagnostic statistics.
    stats: PerSupernodeStats,
}
1217
/// Factor a single supernode given its children's contribution blocks.
///
/// Assembles the frontal matrix from sparse entries and child contributions,
/// factors it with APTP, and extracts the front factors and contribution block.
///
/// Uses the pre-allocated `workspace` for the frontal matrix buffer, row indices,
/// delayed columns, and global-to-local mapping to avoid per-supernode allocations.
///
/// Returns factors plus the parent contribution (only when the node has a
/// parent and is not fully eliminated); errors from the dense APTP kernel
/// propagate.
#[allow(clippy::too_many_arguments)]
fn factor_single_supernode(
    s: usize,
    sn: &SupernodeInfo,
    child_contributions: Vec<Option<ContributionBlock>>,
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    workspace: &mut FactorizationWorkspace,
    assembly_maps: &AssemblyMaps,
) -> Result<SupernodeResult, SparseError> {
    // 1. Collect delayed columns from children into workspace buffer
    workspace.delayed_cols_buf.clear();
    for cb in child_contributions.iter().flatten() {
        workspace
            .delayed_cols_buf
            .extend_from_slice(&cb.row_indices[..cb.num_delayed]);
    }

    // 2. Compute frontal matrix structure
    // Use owned_ranges (not col_begin..col_end) to enumerate fully-summed columns.
    // After amalgamation of non-adjacent supernodes, col_begin..col_end may span
    // columns belonging to other supernodes. Only owned columns should be in the front.
    let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
    let k = sn_ncols + workspace.delayed_cols_buf.len(); // num_fully_summed

    // Front row order: owned columns, then delayed columns, then pattern rows.
    workspace.frontal_row_indices.clear();
    workspace.frontal_row_indices.reserve(k + sn.pattern.len());
    for range in &sn.owned_ranges {
        workspace.frontal_row_indices.extend(range.clone());
    }
    workspace
        .frontal_row_indices
        .extend_from_slice(&workspace.delayed_cols_buf);
    workspace.frontal_row_indices.extend_from_slice(&sn.pattern);
    let m = workspace.frontal_row_indices.len();

    // 3. Ensure workspace capacity and zero frontal data for this supernode
    // If delayed columns make this front larger than estimated, grow the workspace
    if m > workspace.frontal_data.nrows() {
        workspace.frontal_data = Mat::zeros(m, m);
    }

    #[cfg(feature = "diagnostic")]
    let zero_start = std::time::Instant::now();

    workspace.zero_frontal(m);

    #[cfg(feature = "diagnostic")]
    let zero_time = zero_start.elapsed();

    // 4. Build global-to-local mapping
    #[cfg(feature = "diagnostic")]
    let g2l_start = std::time::Instant::now();

    for (i, &global) in workspace.frontal_row_indices.iter().enumerate() {
        workspace.global_to_local[global] = i;
    }

    #[cfg(feature = "diagnostic")]
    let g2l_time = g2l_start.elapsed();

    // 5. Assemble frontal matrix using workspace buffer
    #[cfg(feature = "diagnostic")]
    let assembly_start = std::time::Instant::now();

    let ndelay_in = workspace.delayed_cols_buf.len();

    let mut frontal = FrontalMatrix {
        data: workspace.frontal_data.as_mut().submatrix_mut(0, 0, m, m),
        row_indices: &workspace.frontal_row_indices,
        num_fully_summed: k,
    };

    // 5a. Scatter original CSC entries
    #[cfg(feature = "diagnostic")]
    let scatter_start = std::time::Instant::now();

    // Use precomputed amap when no delays shift row positions
    if ndelay_in == 0 {
        // Fast path: scatter using precomputed index tuples (4 u32 per entry)
        let amap_start = assembly_maps.amap_offsets[s];
        let amap_end = assembly_maps.amap_offsets[s + 1];
        let values = matrix.val();
        // Entry layout: [value index, column-major dest (col * m + row),
        // scale row, scale col].
        let amap = &assembly_maps.amap_entries[amap_start * 4..amap_end * 4];

        if let Some(sc) = scaling {
            for entry in amap.chunks_exact(4) {
                let src_idx = entry[0] as usize;
                let dest_linear = entry[1] as usize;
                let scale_row = entry[2] as usize;
                let scale_col = entry[3] as usize;
                // Symmetric scaling: entry scaled by both row and column factors.
                let val = values[src_idx] * sc[scale_row] * sc[scale_col];
                let dest_col = dest_linear / m;
                let dest_row = dest_linear % m;
                frontal.data[(dest_row, dest_col)] += val;
            }
        } else {
            for entry in amap.chunks_exact(4) {
                let src_idx = entry[0] as usize;
                let dest_linear = entry[1] as usize;
                let dest_col = dest_linear / m;
                let dest_row = dest_linear % m;
                frontal.data[(dest_row, dest_col)] += values[src_idx];
            }
        }
    } else {
        // Fallback: delayed columns shift positions, use per-entry assembly
        scatter_original_entries_multi(
            &mut frontal,
            matrix,
            perm_fwd,
            perm_inv,
            &workspace.global_to_local,
            &sn.owned_ranges,
            scaling,
        );
    }

    #[cfg(feature = "diagnostic")]
    let scatter_time = scatter_start.elapsed();

    // 5b. Extend-add: merge child contributions into parent frontal matrix
    #[cfg(feature = "diagnostic")]
    let ea_start_time = std::time::Instant::now();

    let ea_children_begin = assembly_maps.ea_snode_child_begin[s];
    let mut ea_child_idx = ea_children_begin;

    for opt_cb in child_contributions {
        if let Some(cb) = opt_cb {
            // Check if this child had delays — if so, fall back to g2l
            let recycled = if cb.num_delayed > 0 || ndelay_in > 0 {
                extend_add(&mut frontal, cb, &workspace.global_to_local)
            } else {
                // Fast path: use precomputed row mapping
                let ea_start = assembly_maps.ea_offsets[ea_child_idx];
                let ea_end = assembly_maps.ea_offsets[ea_child_idx + 1];
                let ea_row_map = &assembly_maps.ea_map[ea_start..ea_end];
                extend_add_mapped(&mut frontal, cb, ea_row_map)
            };
            // Recycle the returned buffer into workspace if it's larger than
            // what we currently have (or if ours was moved out).
            if recycled.nrows() >= workspace.contrib_buffer.nrows() {
                workspace.contrib_buffer = recycled;
            }
        }
        // Always increment: ea_offsets indexes all children, including those
        // with no contribution (fully eliminated, ne == m).
        ea_child_idx += 1;
    }

    #[cfg(feature = "diagnostic")]
    let extend_add_time = ea_start_time.elapsed();

    #[cfg(feature = "diagnostic")]
    let assembly_time = assembly_start.elapsed();

    // 6. Factor the frontal matrix
    #[cfg(feature = "diagnostic")]
    let kernel_start = std::time::Instant::now();

    // Small fronts run sequentially; parallelism only pays off above threshold.
    let effective_par = if m < INTRA_NODE_THRESHOLD {
        Par::Seq
    } else {
        options.par
    };
    let per_sn_options = AptpOptions {
        par: effective_par,
        ..options.clone()
    };
    let result = aptp_factor_in_place(
        workspace.frontal_data.as_mut().submatrix_mut(0, 0, m, m),
        k,
        &per_sn_options,
    )?;
    let ne = result.num_eliminated;

    #[cfg(feature = "diagnostic")]
    let kernel_time = kernel_start.elapsed();

    // 6b. Deferred contribution GEMM: compute NFS×NFS Schur complement directly
    //     into workspace.contrib_buffer. This replaces the per-block NFS×NFS
    //     updates that were skipped during the blocking loop.
    #[cfg(feature = "diagnostic")]
    let contrib_gemm_start = std::time::Instant::now();

    let nfs = m - k;
    if sn.parent.is_some() && ne < m && nfs > 0 {
        // Ensure contrib_buffer is large enough (rare: delayed cascades may exceed estimate)
        if workspace.contrib_buffer.nrows() < m || workspace.contrib_buffer.ncols() < m {
            workspace.contrib_buffer = Mat::zeros(m, m);
        }
        compute_contribution_gemm(
            &workspace.frontal_data,
            k,
            ne,
            m,
            &result.d,
            &mut workspace.contrib_buffer,
            &mut workspace.ld_workspace,
            effective_par,
        );
    }

    #[cfg(feature = "diagnostic")]
    let contrib_gemm_time = contrib_gemm_start.elapsed();

    // 7. Extract front factors (reads from workspace frontal data)
    #[cfg(feature = "diagnostic")]
    let extraction_start = std::time::Instant::now();
    #[cfg(feature = "diagnostic")]
    let extract_factors_start = std::time::Instant::now();

    // Use owned Mat<f64> directly for col_as_slice-based bulk extraction
    let ff = extract_front_factors(
        &workspace.frontal_data,
        m,
        k,
        &workspace.frontal_row_indices,
        &result,
    );

    #[cfg(feature = "diagnostic")]
    let extract_factors_time = extract_factors_start.elapsed();

    // 8. Prepare contribution for parent (if not root and not fully eliminated)
    //    NFS×NFS data is in workspace.contrib_buffer (from deferred GEMM).
    //    Delayed-column data (if any) is still in workspace.frontal_data.
    #[cfg(feature = "diagnostic")]
    let extract_contrib_start = std::time::Instant::now();

    let contribution = if sn.parent.is_some() && ne < m {
        Some(extract_contribution(
            &workspace.frontal_data,
            m,
            k,
            &workspace.frontal_row_indices,
            &result,
            // Move the buffer out; it may come back via the recycling path above.
            std::mem::replace(&mut workspace.contrib_buffer, Mat::new()),
        ))
    } else {
        None
    };

    #[cfg(feature = "diagnostic")]
    let extract_contrib_time = extract_contrib_start.elapsed();

    #[cfg(feature = "diagnostic")]
    let extraction_time = extraction_start.elapsed();

    let stats = PerSupernodeStats {
        snode_id: s,
        front_size: m,
        num_fully_summed: k,
        num_eliminated: ne,
        num_delayed: k - ne,
        num_1x1: result.stats.num_1x1,
        num_2x2: result.stats.num_2x2,
        max_l_entry: result.stats.max_l_entry,
        #[cfg(feature = "diagnostic")]
        assembly_time,
        #[cfg(feature = "diagnostic")]
        kernel_time,
        #[cfg(feature = "diagnostic")]
        extraction_time,
        #[cfg(feature = "diagnostic")]
        zero_time,
        #[cfg(feature = "diagnostic")]
        g2l_time,
        #[cfg(feature = "diagnostic")]
        scatter_time,
        #[cfg(feature = "diagnostic")]
        extend_add_time,
        #[cfg(feature = "diagnostic")]
        extract_factors_time,
        #[cfg(feature = "diagnostic")]
        extract_contrib_time,
        #[cfg(feature = "diagnostic")]
        contrib_gemm_time,
        #[cfg(feature = "diagnostic")]
        g2l_reset_time: std::time::Duration::ZERO, // filled below after reset
    };

    // Reset global_to_local entries for reuse (O(m), not O(n))
    #[cfg(feature = "diagnostic")]
    let g2l_reset_start = std::time::Instant::now();

    for &global in &workspace.frontal_row_indices[..m] {
        workspace.global_to_local[global] = NOT_IN_FRONT;
    }

    #[cfg(feature = "diagnostic")]
    let g2l_reset_time = g2l_reset_start.elapsed();
    #[cfg(feature = "diagnostic")]
    let stats = {
        let mut s = stats;
        s.g2l_reset_time = g2l_reset_time;
        s
    };

    Ok(SupernodeResult {
        ff,
        contribution,
        stats,
    })
}
1534
1535/// Factor a small-leaf subtree using rectangular L storage.
1536///
1537/// Replaces the general `factor_single_supernode` path for classified small-leaf
1538/// subtrees with a streamlined per-node loop that:
1539/// 1. Uses rectangular m×k L storage instead of square m×m frontal matrix
1540/// 2. Factors via `tpp_factor_rect` (rectangular TPP kernel)
1541/// 3. Computes contribution GEMM from rectangular storage
1542/// 4. Reuses a single small workspace across all nodes in the subtree
1543///
1544/// # Arguments
1545///
1546/// - `subtree`: Small-leaf subtree descriptor (nodes in postorder).
1547/// - `supernodes`: All supernode descriptors.
1548/// - `children`: Children map.
1549/// - `matrix`: Original sparse matrix.
1550/// - `perm_fwd`/`perm_inv`: Forward/inverse permutations.
1551/// - `options`: APTP options.
1552/// - `scaling`: Optional MC64 scaling factors.
1553/// - `n`: Matrix dimension.
1554/// - `contributions`: Global contribution storage (read children, write results).
1555/// - `assembly_maps`: Precomputed scatter maps.
1556///
1557/// # Returns
1558///
1559/// Vec of (node_id, FrontFactors, PerSupernodeStats) for each node in the subtree.
1560// SPRAL Equivalent: `SmallLeafNumericSubtree::factor()` in `SmallLeafNumericSubtree.hxx:187-446` (BSD-3).
#[allow(clippy::too_many_arguments)]
fn factor_small_leaf_subtree(
    subtree: &SmallLeafSubtree,
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    n: usize,
    contributions: &mut [Option<ContributionBlock>],
    assembly_maps: &AssemblyMaps,
) -> Result<Vec<(usize, FrontFactors, PerSupernodeStats)>, SparseError> {
    let max_front = subtree.max_front_size;
    let mut results = Vec::with_capacity(subtree.nodes.len());

    // Shared workspace for the subtree:
    // - l_storage: rectangular m×k buffer, reused per node (max_front × max_front covers all)
    // - global_to_local: length n, reused per node
    // - contrib_buffer: for contribution GEMM output, recycled
    // - ld_workspace: for L·D product in contribution GEMM
    // - frontal_row_indices: reusable buffer for building front structure
    // - delayed_cols_buf: reusable buffer for collecting child delays
    let mut l_storage = Mat::<f64>::zeros(max_front, max_front);
    let mut global_to_local = vec![NOT_IN_FRONT; n];
    let mut contrib_buffer = Mat::<f64>::new();
    let mut ld_workspace = Mat::<f64>::new();
    let mut frontal_row_indices = Vec::<usize>::with_capacity(max_front);
    let mut delayed_cols_buf = Vec::<usize>::new();

    let symbolic_matrix = matrix.symbolic();
    let col_ptrs = symbolic_matrix.col_ptr();
    let row_indices_csc = symbolic_matrix.row_idx();
    let values = matrix.val();

    // `subtree.nodes` is in postorder (see doc comment), so every child's
    // contribution block is present in `contributions` before its parent runs.
    for &node_id in &subtree.nodes {
        let sn = &supernodes[node_id];

        // 1. Collect delayed columns from children
        delayed_cols_buf.clear();
        for &c in &children[node_id] {
            if let Some(ref cb) = contributions[c] {
                delayed_cols_buf.extend_from_slice(&cb.row_indices[..cb.num_delayed]);
            }
        }

        // 2. Compute frontal matrix structure
        //    Front row layout: [owned columns | delayed columns | pattern rows],
        //    so local indices < k are fully-summed (FS) and >= k are NFS rows.
        let sn_ncols: usize = sn.owned_ranges.iter().map(|r| r.len()).sum();
        let k = sn_ncols + delayed_cols_buf.len(); // num_fully_summed

        frontal_row_indices.clear();
        frontal_row_indices.reserve(k + sn.pattern.len());
        for range in &sn.owned_ranges {
            frontal_row_indices.extend(range.clone());
        }
        frontal_row_indices.extend_from_slice(&delayed_cols_buf);
        frontal_row_indices.extend_from_slice(&sn.pattern);
        let m = frontal_row_indices.len();

        // 3. Ensure l_storage capacity and zero rectangular m×k region
        #[cfg(feature = "diagnostic")]
        let zero_start = std::time::Instant::now();

        // Growth only triggers when delayed columns push m or k beyond the
        // pre-allocated max_front — the common case reuses the buffer.
        if m > l_storage.nrows() || k > l_storage.ncols() {
            l_storage = Mat::zeros(m, k.max(l_storage.ncols()));
        }
        // Zero only the m×k rectangle (smaller than m×m for NFS nodes)
        for j in 0..k {
            l_storage.col_as_slice_mut(j)[0..m].fill(0.0);
        }

        #[cfg(feature = "diagnostic")]
        let zero_time = zero_start.elapsed();

        // 4. Build global-to-local mapping
        #[cfg(feature = "diagnostic")]
        let g2l_start = std::time::Instant::now();

        for (i, &global) in frontal_row_indices.iter().enumerate() {
            global_to_local[global] = i;
        }

        #[cfg(feature = "diagnostic")]
        let g2l_time = g2l_start.elapsed();

        // 5. Split assembly: scatter into l_storage (FS columns) and contrib_buffer (NFS×NFS)
        #[cfg(feature = "diagnostic")]
        let assembly_start = std::time::Instant::now();
        #[cfg(feature = "diagnostic")]
        let scatter_start = std::time::Instant::now();

        let ndelay_in = delayed_cols_buf.len();
        let total_owned = sn_ncols;
        let nfs = m - k;

        // Pre-allocate and zero contrib_buffer if there will be NFS rows.
        // This allows both original matrix scatter and child extend-add to
        // deposit NFS×NFS entries directly, avoiding a re-scatter pass.
        let has_nfs = nfs > 0 && sn.parent.is_some();
        if has_nfs {
            if contrib_buffer.nrows() < nfs || contrib_buffer.ncols() < nfs {
                contrib_buffer = Mat::zeros(
                    nfs.max(contrib_buffer.nrows()),
                    nfs.max(contrib_buffer.ncols()),
                );
            }
            for j in 0..nfs {
                contrib_buffer.col_as_slice_mut(j)[0..nfs].fill(0.0);
            }
        }

        // 5a. Scatter original CSC entries — FS columns → l_storage, NFS×NFS → contrib_buffer
        if ndelay_in == 0 {
            // Fast path: with no delayed columns, local positions match the
            // precomputed assembly map. Each map entry is a packed 4-tuple:
            // [src value index, dest_linear (column-major index into an m×m
            // front, i.e. col*m + row), scale row index, scale col index].
            let amap_start = assembly_maps.amap_offsets[node_id];
            let amap_end = assembly_maps.amap_offsets[node_id + 1];
            let amap = &assembly_maps.amap_entries[amap_start * 4..amap_end * 4];

            if let Some(sc) = scaling {
                for entry in amap.chunks_exact(4) {
                    let src_idx = entry[0] as usize;
                    let dest_linear = entry[1] as usize;
                    let scale_row = entry[2] as usize;
                    let scale_col = entry[3] as usize;
                    let val = values[src_idx] * sc[scale_row] * sc[scale_col];
                    let amap_col = dest_linear / m;
                    let amap_row = dest_linear % m;
                    if amap_col < k {
                        l_storage[(amap_row, amap_col)] += val;
                    } else if has_nfs && amap_row >= k {
                        // NFS×NFS entry → contrib_buffer
                        contrib_buffer[(amap_row - k, amap_col - k)] += val;
                    }
                    // FS×NFS cross-terms (amap_col >= k, amap_row < k) are handled
                    // by the symmetric counterpart where amap_col < k.
                }
            } else {
                // Unscaled variant of the loop above.
                for entry in amap.chunks_exact(4) {
                    let src_idx = entry[0] as usize;
                    let dest_linear = entry[1] as usize;
                    let amap_col = dest_linear / m;
                    let amap_row = dest_linear % m;
                    if amap_col < k {
                        l_storage[(amap_row, amap_col)] += values[src_idx];
                    } else if has_nfs && amap_row >= k {
                        contrib_buffer[(amap_row - k, amap_col - k)] += values[src_idx];
                    }
                }
            }
        } else {
            // Fallback: delayed columns shift positions, use per-entry g2l scatter
            for range in &sn.owned_ranges {
                for pj in range.clone() {
                    let orig_col = perm_fwd[pj];
                    let local_col = global_to_local[pj];

                    let start = col_ptrs[orig_col];
                    let end = col_ptrs[orig_col + 1];
                    for idx in start..end {
                        let orig_row = row_indices_csc[idx];
                        if orig_row < orig_col {
                            // Strict-upper-triangle entry whose mirror row is
                            // also an owned column of this front: skip —
                            // presumably the value is deposited when that
                            // column's own CSC entries are scattered, avoiding
                            // double-counting. TODO confirm against the
                            // AssemblyMaps construction.
                            let perm_row = perm_inv[orig_row];
                            let local_peer = global_to_local[perm_row];
                            if local_peer != NOT_IN_FRONT && local_peer < total_owned {
                                continue;
                            }
                        }
                        let global_row = perm_inv[orig_row];
                        let local_row = global_to_local[global_row];
                        if local_row == NOT_IN_FRONT {
                            continue;
                        }
                        // Local rows in [total_owned, k) are delayed columns;
                        // NOTE(review): these appear to arrive pre-assembled in
                        // the child's contribution block, so original-matrix
                        // entries are skipped here — verify with extend-add.
                        if local_row >= total_owned && local_row < k {
                            continue;
                        }
                        let mut val = values[idx];
                        if let Some(s) = scaling {
                            val *= s[perm_inv[orig_row]] * s[perm_inv[orig_col]];
                        }
                        // Mirror the entry into the lower triangle of the front.
                        let (r, c) = if local_row >= local_col {
                            (local_row, local_col)
                        } else {
                            (local_col, local_row)
                        };
                        if c < k {
                            l_storage[(r, c)] += val;
                        } else if has_nfs && r >= k {
                            contrib_buffer[(r - k, c - k)] += val;
                        }
                    }
                }
            }
        }

        #[cfg(feature = "diagnostic")]
        let scatter_time = scatter_start.elapsed();

        // 5b. Extend-add child contributions
        //     FS columns → l_storage, NFS×NFS → contrib_buffer
        #[cfg(feature = "diagnostic")]
        let extend_add_start = std::time::Instant::now();

        for &c in &children[node_id] {
            if let Some(cb) = contributions[c].take() {
                let cb_size = cb.row_indices.len();
                // Only the lower triangle (j <= i) of cb.data is read; each
                // entry is mirrored into the parent's lower triangle below.
                for i in 0..cb_size {
                    let gi = cb.row_indices[i];
                    let li = global_to_local[gi];
                    debug_assert!(li != NOT_IN_FRONT, "child row {} not in parent", gi);
                    for j in 0..=i {
                        let gj = cb.row_indices[j];
                        let lj = global_to_local[gj];
                        debug_assert!(lj != NOT_IN_FRONT, "child col {} not in parent", gj);
                        let val = cb.data[(i, j)];
                        if val != 0.0 {
                            let (li, lj) = if li >= lj { (li, lj) } else { (lj, li) };
                            if lj < k {
                                // FS column → l_storage
                                l_storage[(li, lj)] += val;
                            } else if has_nfs {
                                // NFS×NFS → contrib_buffer
                                contrib_buffer[(li - k, lj - k)] += val;
                            }
                        }
                    }
                }
                // Only recycle buffer if we're NOT using contrib_buffer for NFS assembly
                if !has_nfs && cb.data.nrows() >= contrib_buffer.nrows() {
                    contrib_buffer = cb.data;
                }
            }
        }

        #[cfg(feature = "diagnostic")]
        let extend_add_time = extend_add_start.elapsed();
        #[cfg(feature = "diagnostic")]
        let assembly_time = assembly_start.elapsed();

        // 6. Factor using rectangular TPP kernel
        #[cfg(feature = "diagnostic")]
        let kernel_start = std::time::Instant::now();

        // `?` aborts the whole subtree on a kernel failure.
        let result = tpp_factor_rect(l_storage.as_mut().submatrix_mut(0, 0, m, k), k, options)?;
        let ne = result.num_eliminated;

        #[cfg(feature = "diagnostic")]
        let kernel_time = kernel_start.elapsed();

        // 7. Build contribution block
        #[cfg(feature = "diagnostic")]
        let mut contrib_gemm_time = std::time::Duration::ZERO;
        #[cfg(feature = "diagnostic")]
        let mut extract_contrib_time = std::time::Duration::ZERO;
        // Produced only for non-root nodes with uneliminated rows
        // (NFS rows and/or delayed pivots, i.e. ne < m).
        let contribution = if sn.parent.is_some() && ne < m {
            // Ensure contrib_buffer is large enough for extract_contribution_rect
            // which may need to assemble delayed + NFS into a single buffer
            if contrib_buffer.nrows() < m || contrib_buffer.ncols() < m {
                // Grow if delayed columns expanded the need
                let new_size = m.max(contrib_buffer.nrows());
                let mut new_buf = Mat::zeros(new_size, new_size);
                // Preserve NFS×NFS data already assembled
                if nfs > 0 {
                    // Copy the lower triangle column by column.
                    for j in 0..nfs {
                        let col_len = nfs - j;
                        let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
                        new_buf.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
                    }
                }
                contrib_buffer = new_buf;
            }

            // Apply contribution GEMM: updates NFS×NFS with -L21_NFS * D * L21_NFS^T
            #[cfg(feature = "diagnostic")]
            let gemm_start = std::time::Instant::now();

            if nfs > 0 && ne > 0 {
                compute_contribution_gemm_rect(
                    &l_storage,
                    k,
                    ne,
                    m,
                    &result.d,
                    &mut contrib_buffer,
                    &mut ld_workspace,
                    Par::Seq,
                );
            }

            #[cfg(feature = "diagnostic")]
            {
                contrib_gemm_time = gemm_start.elapsed();
            }

            #[cfg(feature = "diagnostic")]
            let extract_contrib_start = std::time::Instant::now();

            // Hand contrib_buffer ownership to the contribution block; the
            // workspace keeps an empty Mat, re-grown on the next iteration.
            let cb = extract_contribution_rect(
                &l_storage,
                m,
                k,
                &frontal_row_indices,
                &result,
                std::mem::replace(&mut contrib_buffer, Mat::new()),
            );

            #[cfg(feature = "diagnostic")]
            {
                extract_contrib_time = extract_contrib_start.elapsed();
            }

            Some(cb)
        } else {
            None
        };

        // 8. Extract FrontFactors from rectangular L storage
        #[cfg(feature = "diagnostic")]
        let extract_factors_start = std::time::Instant::now();

        let ff = extract_front_factors_rect(&l_storage, m, k, &frontal_row_indices, &result);

        #[cfg(feature = "diagnostic")]
        let extract_factors_time = extract_factors_start.elapsed();

        let stats = PerSupernodeStats {
            snode_id: node_id,
            front_size: m,
            num_fully_summed: k,
            num_eliminated: ne,
            num_delayed: k - ne,
            num_1x1: result.stats.num_1x1,
            num_2x2: result.stats.num_2x2,
            max_l_entry: result.stats.max_l_entry,
            #[cfg(feature = "diagnostic")]
            assembly_time,
            #[cfg(feature = "diagnostic")]
            kernel_time,
            #[cfg(feature = "diagnostic")]
            extraction_time: extract_factors_time + extract_contrib_time,
            #[cfg(feature = "diagnostic")]
            zero_time,
            #[cfg(feature = "diagnostic")]
            g2l_time,
            #[cfg(feature = "diagnostic")]
            scatter_time,
            #[cfg(feature = "diagnostic")]
            extend_add_time,
            #[cfg(feature = "diagnostic")]
            extract_factors_time,
            #[cfg(feature = "diagnostic")]
            extract_contrib_time,
            #[cfg(feature = "diagnostic")]
            contrib_gemm_time,
            #[cfg(feature = "diagnostic")]
            g2l_reset_time: std::time::Duration::ZERO, // filled below
        };

        // 9. Reset g2l
        #[cfg(feature = "diagnostic")]
        let g2l_reset_start = std::time::Instant::now();

        // Clear only the m entries this front touched (O(m), not O(n)).
        for &global in &frontal_row_indices[..m] {
            global_to_local[global] = NOT_IN_FRONT;
        }

        #[cfg(feature = "diagnostic")]
        let stats = {
            let mut s = stats;
            s.g2l_reset_time = g2l_reset_start.elapsed();
            s
        };

        contributions[node_id] = contribution;
        results.push((node_id, ff, stats));
    }

    Ok(results)
}
1939
1940/// Iterative level-set factorization of the assembly tree.
1941///
1942/// Processes supernodes bottom-up: first all leaves (in parallel if enabled),
1943/// then all nodes whose children are complete, and so on up to the roots.
1944/// This avoids deep recursion that overflows rayon's 2MB worker stacks on
1945/// matrices with deep elimination trees (e.g., c-71 with 76K supernodes).
1946///
1947/// The parallelism is "level-set" style: at each wave, all ready nodes are
1948/// independent (their children are already factored) and can be processed
1949/// concurrently. This provides the same tree-level parallelism as the
1950/// recursive approach but without stack depth concerns.
1951///
1952/// # References
1953///
1954/// - Liu (1992), Section 5: level-set scheduling for multifrontal methods
1955/// - Duff & Reid (1983): assembly tree parallelism
#[allow(clippy::too_many_arguments)]
fn factor_tree_levelset(
    supernodes: &[SupernodeInfo],
    children: &[Vec<usize>],
    matrix: &SparseColMat<usize, f64>,
    perm_fwd: &[usize],
    perm_inv: &[usize],
    options: &AptpOptions,
    scaling: Option<&[f64]>,
    n: usize,
    assembly_maps: &AssemblyMaps,
    small_leaf_subtrees: &[SmallLeafSubtree],
) -> Result<Vec<(usize, FrontFactors, PerSupernodeStats)>, SparseError> {
    let n_supernodes = supernodes.len();

    // Track contributions from completed children
    let mut contributions: Vec<Option<ContributionBlock>> =
        (0..n_supernodes).map(|_| None).collect();

    // Track how many children remain unprocessed for each supernode
    let mut remaining_children: Vec<usize> = children.iter().map(|c| c.len()).collect();

    // Collect all results
    let mut all_results: Vec<(usize, FrontFactors, PerSupernodeStats)> =
        Vec::with_capacity(n_supernodes);

    // --- Small-leaf subtree pre-pass (rectangular workspace fast path) ---
    // Factor all classified small-leaf subtrees before the main level-set loop.
    // Each subtree uses a rectangular m×k L storage kernel instead of the general
    // m×m frontal matrix, eliminating the large square allocation, frontal zeroing,
    // and extract_front_factors copy for every node.
    for subtree in small_leaf_subtrees {
        let subtree_results = factor_small_leaf_subtree(
            subtree,
            supernodes,
            children,
            matrix,
            perm_fwd,
            perm_inv,
            options,
            scaling,
            n,
            &mut contributions,
            assembly_maps,
        )?;
        all_results.extend(subtree_results);

        // Decrement remaining_children for the subtree root's parent
        // (the whole subtree counts as one completed child of that parent).
        if let Some(parent) = subtree.parent_of_root {
            remaining_children[parent] -= 1;
        }
    }

    let is_parallel = !matches!(options.par, Par::Seq);
    let max_front = estimate_max_front_size(supernodes);

    // Sequential mode: pre-allocate a single workspace at max_front to avoid
    // repeated reallocations as the postorder traversal encounters progressively
    // larger fronts. No pool needed — only one workspace exists.
    //
    // Parallel mode: use a pool of grow-on-demand workspaces. Pre-allocating
    // a separate shared_workspace alongside the pool would waste max_front²×8
    // bytes (685 MB for H2O) during parallel waves when only the pool is used.
    // Single-node batches (sequential fallback near the tree root) pop from
    // the pool instead.
    //
    // The pool is dropped when factor_tree_levelset returns, freeing all
    // workspace memory (unlike thread_local! which leaks for the lifetime
    // of the rayon thread pool).
    let mut seq_workspace = if !is_parallel {
        FactorizationWorkspace::new(max_front, n)
    } else {
        FactorizationWorkspace::empty()
    };
    let workspace_pool: std::sync::Mutex<Vec<FactorizationWorkspace>> =
        std::sync::Mutex::new(Vec::new());

    // Initial ready set: all leaf supernodes (no children) that are NOT in small-leaf subtrees
    let mut ready: Vec<usize> = (0..n_supernodes)
        .filter(|&s| remaining_children[s] == 0 && !supernodes[s].in_small_leaf)
        .collect();

    while !ready.is_empty() {
        // Collect child contributions for each ready node. Children of different
        // ready nodes are disjoint (tree structure), so each contribution is
        // taken exactly once.
        let mut batch_inputs: Vec<(usize, Vec<Option<ContributionBlock>>)> =
            Vec::with_capacity(ready.len());
        for &s in &ready {
            let child_contribs: Vec<Option<ContributionBlock>> = children[s]
                .iter()
                .map(|&c| contributions[c].take())
                .collect();
            batch_inputs.push((s, child_contribs));
        }

        // Factor all ready nodes (parallel if enabled and batch > 1)
        let batch_results: Vec<SupernodeResult> = if !is_parallel {
            // Pure sequential: use the pre-allocated workspace (no pool
            // overhead, no reallocations during postorder traversal).
            batch_inputs
                .into_iter()
                .map(|(s, contribs)| {
                    factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut seq_workspace,
                        assembly_maps,
                    )
                })
                .collect::<Result<_, _>>()?
        } else if batch_inputs.len() == 1 {
            // Parallel mode, single-node batch (near tree root): pop a
            // workspace from the pool. Drop remaining pool workspaces to
            // free memory before processing large supernodes.
            let mut pool = workspace_pool.lock().unwrap();
            let mut ws = pool.pop().unwrap_or_else(FactorizationWorkspace::empty);
            pool.clear();
            drop(pool);
            ws.ensure_g2l(n);
            let results = batch_inputs
                .into_iter()
                .map(|(s, contribs)| {
                    factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut ws,
                        assembly_maps,
                    )
                })
                .collect::<Result<_, _>>();
            // Return the workspace before propagating any error so it can be
            // reused by later waves.
            workspace_pool.lock().unwrap().push(ws);
            results?
        } else {
            // Parallel: take workspaces from the caller-owned pool so each
            // rayon worker gets exclusive ownership during factorization.
            // The pool is reused across level-set waves and dropped when
            // factor_tree_levelset returns (unlike thread_local! which leaks
            // for the lifetime of the rayon thread pool).
            //
            // If rayon work-steals another par_iter task onto the same thread
            // (due to nested Par::rayon BLAS), that task takes a separate
            // workspace from the pool — no aliasing.
            use rayon::iter::{IntoParallelIterator, ParallelIterator};
            batch_inputs
                .into_par_iter()
                .map(|(s, contribs)| {
                    // Poisoned mutex means a worker panicked — already fatal.
                    let mut ws = workspace_pool
                        .lock()
                        .unwrap()
                        .pop()
                        .unwrap_or_else(FactorizationWorkspace::empty);
                    // Only ensure global_to_local is sized; frontal_data
                    // grows on demand in factor_single_supernode. Avoids
                    // eagerly allocating max_front² per thread (OOM on
                    // large-front matrices like H2O: 9258² × 8 = 685 MB).
                    ws.ensure_g2l(n);
                    let result = factor_single_supernode(
                        s,
                        &supernodes[s],
                        contribs,
                        matrix,
                        perm_fwd,
                        perm_inv,
                        options,
                        scaling,
                        &mut ws,
                        assembly_maps,
                    );
                    // Poisoned mutex means a worker panicked — already fatal.
                    workspace_pool.lock().unwrap().push(ws);
                    result
                })
                .collect::<Result<_, _>>()?
        };

        // Store results and determine next ready set.
        // batch_results is index-aligned with `ready`: the sequential maps
        // preserve order, and rayon's indexed collect into Vec also preserves
        // input order, so the zip below pairs each result with its supernode.
        let mut next_ready = Vec::new();
        for (result, &s) in batch_results.into_iter().zip(ready.iter()) {
            contributions[s] = result.contribution;
            all_results.push((s, result.ff, result.stats));

            if let Some(parent) = supernodes[s].parent {
                remaining_children[parent] -= 1;
                if remaining_children[parent] == 0 {
                    next_ready.push(parent);
                }
            }
        }

        ready = next_ready;
    }

    Ok(all_results)
}
2164
2165// ---------------------------------------------------------------------------
2166// Internal functions
2167// ---------------------------------------------------------------------------
2168
2169/// Build unified supernode descriptors from symbolic analysis.
2170///
2171/// For supernodal decompositions, maps directly from faer's supernode structure.
2172/// For simplicial decompositions, each column becomes a trivial 1-column supernode
2173/// with pattern derived from the elimination tree and L structure.
2174///
2175/// # Postcondition
2176///
2177/// `info[s].parent.map_or(true, |p| p > s)` for all `s` (postorder).
2178#[allow(clippy::single_range_in_vec_init)]
2179fn build_supernode_info(symbolic: &AptpSymbolic) -> Vec<SupernodeInfo> {
2180    match symbolic.raw() {
2181        SymbolicCholeskyRaw::Supernodal(sn) => {
2182            let ns = sn.n_supernodes();
2183            let begin = sn.supernode_begin();
2184            let end = sn.supernode_end();
2185            (0..ns)
2186                .map(|s| {
2187                    let pattern = sn.supernode(s).pattern().to_vec();
2188                    let parent = symbolic.supernode_parent(s);
2189                    SupernodeInfo {
2190                        col_begin: begin[s],
2191                        col_end: end[s],
2192                        pattern,
2193                        parent,
2194                        owned_ranges: vec![begin[s]..end[s]],
2195                        in_small_leaf: false,
2196                    }
2197                })
2198                .collect()
2199        }
2200        SymbolicCholeskyRaw::Simplicial(simp) => {
2201            let n = symbolic.nrows();
2202            let etree = symbolic.etree();
2203            // Get L structure from simplicial symbolic factorization
2204            let l_symbolic = simp.factor();
2205            let col_ptr = l_symbolic.col_ptr();
2206            let row_idx = l_symbolic.row_idx();
2207            (0..n)
2208                .map(|j| {
2209                    // Pattern = row indices in column j of L that are > j
2210                    // (off-diagonal entries below the diagonal)
2211                    let start = col_ptr[j];
2212                    let end = col_ptr[j + 1];
2213                    let pattern: Vec<usize> = row_idx[start..end]
2214                        .iter()
2215                        .copied()
2216                        .filter(|&r| r > j)
2217                        .collect();
2218                    // Parent from etree
2219                    let parent = if etree[j] < 0 {
2220                        None
2221                    } else {
2222                        Some(etree[j] as usize)
2223                    };
2224                    SupernodeInfo {
2225                        col_begin: j,
2226                        col_end: j + 1,
2227                        pattern,
2228                        parent,
2229                        owned_ranges: vec![j..j + 1],
2230                        in_small_leaf: false,
2231                    }
2232                })
2233                .collect()
2234        }
2235    }
2236}
2237
2238/// Build a map from each supernode to its children in the assembly tree.
2239fn build_children_map(infos: &[SupernodeInfo]) -> Vec<Vec<usize>> {
2240    let n = infos.len();
2241    let mut children = vec![Vec::new(); n];
2242    for (s, info) in infos.iter().enumerate() {
2243        if let Some(p) = info.parent {
2244            children[p].push(s);
2245        }
2246    }
2247    children
2248}
2249
2250/// Classify post-amalgamation supernodes into small-leaf subtrees for the fast path.
2251///
2252/// Performs a single O(n_supernodes) bottom-up pass to identify contiguous leaf
2253/// subtrees where every supernode has `front_size < threshold`. Sets `in_small_leaf`
2254/// on qualifying supernodes and returns a `Vec<SmallLeafSubtree>` describing each
2255/// identified subtree (with at least 2 nodes).
2256///
2257/// # Arguments
2258///
2259/// - `supernodes`: Post-amalgamation supernode descriptors (mutated: `in_small_leaf` set).
2260/// - `children_map`: Children map (`children[s]` = list of child supernode indices).
2261/// - `threshold`: Front-size threshold. Supernodes with `front_size >= threshold` are
2262///   excluded. If `threshold == 0`, returns empty (fast path disabled).
2263///
2264/// # Algorithm
2265///
2266/// 1. Compute `front_size[s] = sum(owned_ranges[s].len()) + pattern[s].len()`.
2267/// 2. Bottom-up (postorder): mark `in_small_leaf[s] = true` iff `front_size[s] < threshold`
2268///    AND (s is a leaf OR all children have `in_small_leaf = true`).
2269/// 3. Identify subtree roots: `in_small_leaf[s] = true` and parent has `in_small_leaf = false`
2270///    (or no parent).
2271/// 4. Collect descendant nodes in postorder via iterative stack-based traversal.
2272/// 5. Filter subtrees with < 2 nodes.
2273///
2274/// # References
2275///
2276/// - SPRAL `SymbolicSubtree.hxx:57-84` (BSD-3): bottom-up classification
2277/// - Duff, Hogg & Lopez (2020), Algorithm 6.1: find_subtree_partition
2278fn classify_small_leaf_subtrees(
2279    supernodes: &mut [SupernodeInfo],
2280    children_map: &[Vec<usize>],
2281    threshold: usize,
2282) -> Vec<SmallLeafSubtree> {
2283    if threshold == 0 {
2284        return Vec::new();
2285    }
2286
2287    let n_supernodes = supernodes.len();
2288
2289    // Step 1-2: Bottom-up classification (supernodes are in postorder: s=0 is leftmost leaf)
2290    for s in 0..n_supernodes {
2291        let owned_cols: usize = supernodes[s].owned_ranges.iter().map(|r| r.len()).sum();
2292        let front_size = owned_cols + supernodes[s].pattern.len();
2293
2294        if front_size >= threshold {
2295            supernodes[s].in_small_leaf = false;
2296            continue;
2297        }
2298
2299        if children_map[s].is_empty() {
2300            // Leaf node with small front
2301            supernodes[s].in_small_leaf = true;
2302        } else if children_map[s].iter().all(|&c| supernodes[c].in_small_leaf) {
2303            // Interior node: all children are in_small_leaf and this node is small
2304            supernodes[s].in_small_leaf = true;
2305        } else {
2306            supernodes[s].in_small_leaf = false;
2307        }
2308    }
2309
2310    // Step 3: Identify subtree roots
2311    let mut subtrees = Vec::new();
2312    for s in 0..n_supernodes {
2313        if !supernodes[s].in_small_leaf {
2314            continue;
2315        }
2316        let is_root = match supernodes[s].parent {
2317            None => true,
2318            Some(p) => !supernodes[p].in_small_leaf,
2319        };
2320        if !is_root {
2321            continue;
2322        }
2323
2324        // Step 4: Collect descendant nodes in postorder via iterative DFS.
2325        // Two-stack approach: first stack drives DFS, second collects in reverse postorder.
2326        let mut nodes = Vec::new();
2327        let mut stack = vec![s];
2328        let mut visit_stack = Vec::new();
2329        while let Some(node) = stack.pop() {
2330            visit_stack.push(node);
2331            // Push children (they'll be processed before parent due to stack LIFO)
2332            for &c in &children_map[node] {
2333                if supernodes[c].in_small_leaf {
2334                    stack.push(c);
2335                }
2336            }
2337        }
2338        // visit_stack is in reverse postorder — reverse to get postorder
2339        while let Some(node) = visit_stack.pop() {
2340            nodes.push(node);
2341        }
2342
2343        // Step 5: Filter subtrees with < 2 nodes
2344        if nodes.len() < 2 {
2345            // Single-node "subtree" — not worth the fast path overhead
2346            supernodes[s].in_small_leaf = false;
2347            continue;
2348        }
2349
2350        // Compute max_front_size across all nodes in the subtree
2351        let max_front_size = nodes
2352            .iter()
2353            .map(|&node| {
2354                let owned: usize = supernodes[node].owned_ranges.iter().map(|r| r.len()).sum();
2355                owned + supernodes[node].pattern.len()
2356            })
2357            .max()
2358            .unwrap_or(0);
2359
2360        subtrees.push(SmallLeafSubtree {
2361            root: s,
2362            nodes,
2363            max_front_size,
2364            parent_of_root: supernodes[s].parent,
2365        });
2366    }
2367
2368    subtrees
2369}
2370
2371/// Scatter original sparse matrix entries into a frontal matrix from multiple owned ranges.
2372///
2373/// After amalgamation, a supernode may own multiple non-contiguous column ranges.
2374/// This function iterates all owned columns and correctly handles upper-triangle
2375/// deduplication across ranges.
2376fn scatter_original_entries_multi(
2377    frontal: &mut FrontalMatrix<'_>,
2378    matrix: &SparseColMat<usize, f64>,
2379    perm_fwd: &[usize],
2380    perm_inv: &[usize],
2381    global_to_local: &[usize],
2382    owned_ranges: &[std::ops::Range<usize>],
2383    scaling: Option<&[f64]>,
2384) {
2385    let symbolic = matrix.symbolic();
2386    let col_ptrs = symbolic.col_ptr();
2387    let row_indices_csc = symbolic.row_idx();
2388    let values = matrix.val();
2389    let total_owned: usize = owned_ranges.iter().map(|r| r.len()).sum();
2390    let k = frontal.num_fully_summed;
2391
2392    // Build a fast check for "is this permuted column an owned supernode column?"
2393    // We reuse global_to_local: if g2l[pj] < total_owned, it's an owned column
2394    // (since frontal_rows puts owned columns first). This works because owned
2395    // columns always occupy the first `total_owned` positions in frontal_rows.
2396
2397    for range in owned_ranges {
2398        for pj in range.clone() {
2399            let orig_col = perm_fwd[pj];
2400            let local_col = global_to_local[pj];
2401
2402            let start = col_ptrs[orig_col];
2403            let end = col_ptrs[orig_col + 1];
2404            for idx in start..end {
2405                let orig_row = row_indices_csc[idx];
2406                // Skip upper-triangle if both are owned supernode cols
2407                if orig_row < orig_col {
2408                    let perm_row = perm_inv[orig_row];
2409                    let local_peer = global_to_local[perm_row];
2410                    if local_peer != NOT_IN_FRONT && local_peer < total_owned {
2411                        continue; // both owned cols — avoid double-count
2412                    }
2413                }
2414                let global_row = perm_inv[orig_row];
2415                let local_row = global_to_local[global_row];
2416                if local_row == NOT_IN_FRONT {
2417                    continue;
2418                }
2419                // Skip delayed columns from children (indices total_owned..k)
2420                if local_row >= total_owned && local_row < k {
2421                    continue;
2422                }
2423                let mut val = values[idx];
2424                if let Some(s) = scaling {
2425                    val *= s[perm_inv[orig_row]] * s[perm_inv[orig_col]];
2426                }
2427                if local_row >= local_col {
2428                    frontal.data[(local_row, local_col)] += val;
2429                } else {
2430                    frontal.data[(local_col, local_row)] += val;
2431                }
2432            }
2433        }
2434    }
2435}
2436
2437/// Perform extend-add: merge a child's contribution block into the parent frontal matrix.
2438///
2439/// For each lower-triangle entry in the child contribution, maps global indices
2440/// to parent local positions and adds the value.
2441///
2442/// # References
2443///
2444/// - Liu (1992), Section 4.2: extend-add operator
2445pub(crate) fn extend_add(
2446    parent: &mut FrontalMatrix<'_>,
2447    child: ContributionBlock,
2448    global_to_local: &[usize],
2449) -> Mat<f64> {
2450    let cb_size = child.row_indices.len();
2451    // Column-oriented iteration: for each child column j, scatter the
2452    // lower-triangle entries (rows j..cb_size) into the parent. Source
2453    // reads are contiguous in column-major layout. We keep the zero-check
2454    // and symmetry swap since delayed columns can break monotonicity.
2455    for j in 0..cb_size {
2456        let gj = child.row_indices[j];
2457        let lj = global_to_local[gj];
2458        debug_assert!(
2459            lj != NOT_IN_FRONT,
2460            "extend_add: child col {} not in parent",
2461            gj
2462        );
2463        let child_col = &child.data.col_as_slice(j)[j..cb_size];
2464        let row_indices = &child.row_indices[j..cb_size];
2465
2466        for (&val, &gi) in child_col.iter().zip(row_indices) {
2467            if val != 0.0 {
2468                let li = global_to_local[gi];
2469                debug_assert!(
2470                    li != NOT_IN_FRONT,
2471                    "extend_add: child row {} not in parent",
2472                    gi
2473                );
2474                // Map to parent: ensure lower triangle (li >= lj)
2475                if li >= lj {
2476                    parent.data[(li, lj)] += val;
2477                } else {
2478                    parent.data[(lj, li)] += val;
2479                }
2480            }
2481        }
2482    }
2483    // Return the consumed data buffer for recycling into workspace.contrib_buffer
2484    child.data
2485}
2486
2487/// Extend-add using a precomputed child→parent row mapping.
2488///
2489/// Used when the child had zero delayed columns, so the contribution block
2490/// rows correspond exactly to the child's symbolic pattern and the precomputed
2491/// `ea_row_map` provides direct local index lookup without `global_to_local`.
2492///
2493/// Uses column-oriented processing: for each child
2494/// column `j`, scatters the lower-triangle entries `(j..cb_size, j)` into
2495/// the parent. Source reads are contiguous in column-major layout.
2496///
2497/// # References
2498///
2499/// - SPRAL `assemble.hxx:27-38` — `asm_col` column-oriented scatter (BSD-3)
2500///
2501/// # Arguments
2502///
2503/// - `parent`: Parent frontal matrix to accumulate into (lower triangle).
2504/// - `child`: Child contribution block.
2505/// - `ea_row_map`: Precomputed mapping from child contribution row index to
2506///   parent frontal local row index. Length equals the child's pattern size.
2507fn extend_add_mapped(
2508    parent: &mut FrontalMatrix<'_>,
2509    child: ContributionBlock,
2510    ea_row_map: &[u32],
2511) -> Mat<f64> {
2512    let cb_size = child.row_indices.len();
2513    debug_assert!(
2514        ea_row_map.len() >= cb_size,
2515        "extend_add_mapped: ea_row_map len {} < cb_size {}",
2516        ea_row_map.len(),
2517        cb_size
2518    );
2519
2520    // Column-oriented scatter.
2521    // For each child column j, scatter lower-triangle entries (rows j..cb_size)
2522    // into the parent. Source reads via col_as_slice are contiguous in column-
2523    // major layout; writes target a single parent column when li >= lj (common
2524    // case), with a symmetry swap for the rare li < lj case.
2525    for j in 0..cb_size {
2526        let lj = ea_row_map[j] as usize;
2527        let child_col = child.data.col_as_slice(j);
2528        let row_map = &ea_row_map[j..cb_size];
2529        let src = &child_col[j..cb_size];
2530
2531        // 4× unrolled inner loop for ILP
2532        let n = cb_size - j;
2533        let n4 = n / 4 * 4;
2534        let mut k = 0;
2535        while k < n4 {
2536            ea_scatter_one(&mut parent.data, row_map[k] as usize, lj, src[k]);
2537            ea_scatter_one(&mut parent.data, row_map[k + 1] as usize, lj, src[k + 1]);
2538            ea_scatter_one(&mut parent.data, row_map[k + 2] as usize, lj, src[k + 2]);
2539            ea_scatter_one(&mut parent.data, row_map[k + 3] as usize, lj, src[k + 3]);
2540            k += 4;
2541        }
2542        while k < n {
2543            ea_scatter_one(&mut parent.data, row_map[k] as usize, lj, src[k]);
2544            k += 1;
2545        }
2546    }
2547    // Return the consumed data buffer for recycling into workspace.contrib_buffer
2548    child.data
2549}
2550
2551/// Scatter a single extend-add value into the parent's lower triangle.
2552#[inline(always)]
2553fn ea_scatter_one(parent: &mut MatMut<'_, f64>, li: usize, lj: usize, val: f64) {
2554    if li >= lj {
2555        parent[(li, lj)] += val;
2556    } else {
2557        parent[(lj, li)] += val;
2558    }
2559}
2560
/// Extract per-supernode factors from a factored frontal matrix.
///
/// Copies L11, D11, L21, and permutation information from the in-place
/// factored frontal matrix into a persistent [`FrontFactors`] struct.
///
/// Uses per-column `col_as_slice` + `copy_from_slice` for L21 extraction and
/// per-column slice copies for L11 1x1-pivot columns, replacing element-by-element
/// indexing with bulk memory operations.
///
/// # Arguments
///
/// - `frontal_data`: The factored frontal matrix data (owned `Mat<f64>`).
///   Columns must be contiguous (guaranteed by `Mat` layout). Only the first
///   `m` rows and columns are used.
/// - `m`: Front size (number of rows/columns used in `frontal_data`).
/// - `k`: Number of fully-summed rows/columns.
/// - `frontal_row_indices`: Global permuted column indices for each local row (length >= m).
/// - `result`: APTP factorization result (pivot types, permutation, etc.).
pub(crate) fn extract_front_factors(
    frontal_data: &Mat<f64>,
    m: usize,
    k: usize,
    frontal_row_indices: &[usize],
    result: &AptpFactorResult,
) -> FrontFactors {
    // Number of pivots actually eliminated; the remaining k - ne fully-summed
    // columns were delayed and propagate to the parent.
    let ne = result.num_eliminated;

    // Extract L11 (ne × ne) — unit lower triangular part
    // For 2x2 pivots, a[(col+1, col)] is the D off-diagonal, NOT an L entry.
    // Uses per-column slice copies for 1x1 pivot columns where the L entries
    // form a contiguous segment.
    let l11 = if ne > 0 {
        let mut l = Mat::zeros(ne, ne);
        let mut col = 0;
        while col < ne {
            l[(col, col)] = 1.0; // unit diagonal
            match result.d.pivot_type(col) {
                PivotType::OneByOne => {
                    // L entries: rows (col+1)..ne — contiguous in both source and dest
                    let n_entries = ne - (col + 1);
                    if n_entries > 0 {
                        let src = &frontal_data.col_as_slice(col)[col + 1..ne];
                        l.col_as_slice_mut(col)[col + 1..ne].copy_from_slice(src);
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    // NOTE(review): the indexing below uses col + 1 directly, so
                    // this arm assumes the 2x2 partner is always the adjacent
                    // column (partner == col + 1). Appears guaranteed by the
                    // APTP kernel — confirm against aptp_factor_in_place.
                    l[(col + 1, col + 1)] = 1.0; // unit diagonal for partner
                    // a[(col+1, col)] is D off-diagonal, skip it
                    // L entries for both columns start at row col+2
                    let n_entries = ne - (col + 2);
                    if n_entries > 0 {
                        let src0 = &frontal_data.col_as_slice(col)[col + 2..ne];
                        l.col_as_slice_mut(col)[col + 2..ne].copy_from_slice(src0);
                        let src1 = &frontal_data.col_as_slice(col + 1)[col + 2..ne];
                        l.col_as_slice_mut(col + 1)[col + 2..ne].copy_from_slice(src1);
                    }
                    col += 2;
                }
                PivotType::TwoByTwo { .. } => {
                    // Second column of a 2x2 pair — already handled above
                    col += 1;
                }
                PivotType::Delayed => {
                    // Should not appear for columns 0..ne (they were eliminated)
                    unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
                }
            }
        }
        l
    } else {
        Mat::zeros(0, 0)
    };

    // Build truncated D11 (first ne pivots from result.d)
    let mut d11 = MixedDiagonal::new(ne);
    let mut col = 0;
    while col < ne {
        match result.d.pivot_type(col) {
            PivotType::OneByOne => {
                d11.set_1x1(col, result.d.diagonal_1x1(col));
                col += 1;
            }
            PivotType::TwoByTwo { partner: _ } => {
                let block = result.d.diagonal_2x2(col);
                d11.set_2x2(Block2x2 {
                    first_col: col,
                    a: block.a,
                    b: block.b,
                    c: block.c,
                });
                col += 2;
            }
            PivotType::Delayed => {
                // Should not happen for columns 0..ne (they were eliminated)
                unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
            }
        }
    }

    // Extract L21 (r × ne) where r = m - ne (includes delayed rows ne..k AND
    // non-fully-summed rows k..m). The factored frontal has valid L entries at rows
    // ne..m for columns 0..ne: rows ne..k are delayed fully-summed rows that received
    // TRSM updates before being delayed; rows k..m are non-fully-summed rows.
    //
    // Uses per-column copy_from_slice for bulk extraction.
    //
    // NOTE: extract_contribution also includes delayed rows (see lines below).
    // These two functions MUST be consistent in their row treatment.
    let r = m - ne;
    let l21 = if ne > 0 && r > 0 {
        let mut l = Mat::zeros(r, ne);
        for j in 0..ne {
            let src = &frontal_data.col_as_slice(j)[ne..m];
            l.col_as_slice_mut(j)[..r].copy_from_slice(src);
        }
        l
    } else {
        Mat::zeros(r, ne)
    };

    // Local permutation (maps factored position to original front-local column)
    let local_perm = result.perm[..k].to_vec();

    // Column indices: the global permuted indices of the eliminated columns
    // local_perm[0..ne] gives the front-local columns that were eliminated,
    // mapped through frontal_row_indices to get global permuted indices
    let col_indices: Vec<usize> = local_perm[..ne]
        .iter()
        .map(|&lp| frontal_row_indices[lp])
        .collect();

    // Row indices: global permuted indices for L21 rows.
    // First num_delayed entries: delayed fully-summed columns (ne..k), mapped through
    // result.perm and frontal_row_indices to get global permuted indices.
    // Remaining entries: non-fully-summed rows (k..m).
    // This matches extract_contribution's row_indices construction.
    let mut row_indices = Vec::with_capacity(m - ne);
    for &lp in &result.perm[ne..k] {
        row_indices.push(frontal_row_indices[lp]);
    }
    row_indices.extend_from_slice(&frontal_row_indices[k..m]);

    FrontFactors {
        l11,
        d11,
        l21,
        local_perm,
        num_eliminated: ne,
        col_indices,
        row_indices,
    }
}
2714
2715/// Extract the contribution block from a factored frontal matrix.
2716///
2717/// The NFS×NFS Schur complement data is already in `contrib_buffer` (written
2718/// by [`compute_contribution_gemm`]). This function adds any delayed-column
2719/// data from the workspace and builds the index metadata.
2720///
2721/// # Layout within returned `ContributionBlock.data`
2722///
2723/// ```text
2724/// [0..num_delayed, 0..num_delayed]     → delayed × delayed (from workspace)
2725/// [num_delayed..size, 0..num_delayed]  → NFS × delayed cross-terms (from workspace)
2726/// [num_delayed..size, num_delayed..size] → NFS × NFS Schur complement (from contrib_buffer)
2727/// ```
2728///
2729/// # Arguments
2730///
2731/// - `frontal_data`: The factored frontal matrix data (owned `Mat<f64>`).
2732/// - `m`: Front size (number of rows/columns used).
2733/// - `k`: Number of fully-summed rows/columns.
2734/// - `frontal_row_indices`: Global permuted column indices for each local row.
2735/// - `result`: APTP factorization result.
2736/// - `contrib_buffer`: Pre-allocated buffer already containing NFS×NFS data
2737///   in `[0..nfs, 0..nfs]` (from deferred GEMM). Moved in; becomes the
2738///   `ContributionBlock.data`.
2739pub(crate) fn extract_contribution(
2740    frontal_data: &Mat<f64>,
2741    m: usize,
2742    k: usize,
2743    frontal_row_indices: &[usize],
2744    result: &AptpFactorResult,
2745    mut contrib_buffer: Mat<f64>,
2746) -> ContributionBlock {
2747    let ne = result.num_eliminated;
2748    let size = m - ne;
2749    let num_delayed = k - ne;
2750    let nfs = m - k; // = size - num_delayed
2751
2752    // The NFS×NFS region (contrib_buffer[0..nfs, 0..nfs]) is already filled by
2753    // compute_contribution_gemm. We need to handle delayed columns (if any) by
2754    // shifting the NFS×NFS data to make room for the delayed portion.
2755    if num_delayed > 0 {
2756        // Delayed columns exist: we need to assemble the full (size × size) contribution.
2757        // Layout: [delayed × delayed | NFS × delayed | NFS × NFS]
2758        //
2759        // Strategy: allocate a new buffer of the correct size, copy delayed regions
2760        // from workspace, and NFS×NFS from contrib_buffer.
2761        let mut data = Mat::zeros(size, size);
2762
2763        // Copy delayed × delayed from workspace: frontal_data[ne..k, ne..k]
2764        // These positions are in the APTP-permuted order. The delayed columns
2765        // are at positions ne..k in the factored frontal matrix.
2766        for j in 0..num_delayed {
2767            let col_len = num_delayed - j;
2768            let src = &frontal_data.col_as_slice(ne + j)[ne + j..ne + j + col_len];
2769            data.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
2770        }
2771
2772        // Copy NFS × delayed cross-terms from workspace: frontal_data[k..m, ne..k]
2773        for j in 0..num_delayed {
2774            let src = &frontal_data.col_as_slice(ne + j)[k..m];
2775            data.col_as_slice_mut(j)[num_delayed..size].copy_from_slice(src);
2776        }
2777
2778        // Copy NFS × NFS from contrib_buffer[0..nfs, 0..nfs] into data[num_delayed..size, num_delayed..size]
2779        for j in 0..nfs {
2780            let col_len = nfs - j;
2781            let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
2782            data.col_as_slice_mut(num_delayed + j)[num_delayed + j..size].copy_from_slice(src);
2783        }
2784
2785        contrib_buffer = data;
2786    }
2787    // else: num_delayed == 0 → contrib_buffer already has the full contribution
2788    // (NFS×NFS = entire contribution), no copy needed. True zero-copy path.
2789
2790    // Build row indices:
2791    // - First num_delayed entries: delayed columns mapped through result.perm and frontal_row_indices
2792    // - Remaining entries: non-fully-summed rows from frontal_row_indices[k..m]
2793    let mut row_indices = Vec::with_capacity(size);
2794
2795    // Delayed columns: these are at positions ne..k in the APTP-permuted frontal matrix
2796    // result.perm[ne..k] gives the original front-local indices of delayed columns
2797    for &lp in &result.perm[ne..k] {
2798        row_indices.push(frontal_row_indices[lp]);
2799    }
2800
2801    // Non-fully-summed rows
2802    row_indices.extend_from_slice(&frontal_row_indices[k..m]);
2803
2804    ContributionBlock {
2805        data: contrib_buffer,
2806        row_indices,
2807        num_delayed,
2808    }
2809}
2810
2811// ---------------------------------------------------------------------------
2812// Tests
2813// ---------------------------------------------------------------------------
2814
2815#[cfg(test)]
2816mod tests {
2817    use super::*;
2818    use crate::symmetric::factor::AptpOptions;
2819
2820    /// Helper: build frontal matrix data from a dense lower-triangle matrix.
2821    ///
2822    /// Returns owned `(data, row_indices)` — construct a `FrontalMatrix` view from these.
2823    /// `k` = number of fully-summed columns.
2824    /// `row_indices` = global permuted indices for each local row (length m).
2825    fn make_frontal_data(
2826        lower: &Mat<f64>,
2827        k: usize,
2828        row_indices: Vec<usize>,
2829    ) -> (Mat<f64>, Vec<usize>, usize) {
2830        let m = lower.nrows();
2831        assert_eq!(lower.ncols(), m);
2832        assert_eq!(row_indices.len(), m);
2833        assert!(k <= m);
2834
2835        // Build full symmetric from lower triangle
2836        let mut data = Mat::zeros(m, m);
2837        for i in 0..m {
2838            for j in 0..=i {
2839                data[(i, j)] = lower[(i, j)];
2840                data[(j, i)] = lower[(i, j)];
2841            }
2842        }
2843
2844        (data, row_indices, k)
2845    }
2846
    /// Helper: build a symmetric indefinite matrix (lower triangle) that forces
    /// delays. Strategy from SPRAL's `cause_delays`: scale selected rows/cols
    /// by a large factor, making their off-diagonal entries dominate the diagonal
    /// so the threshold test fails.
    ///
    /// Returns the dense lower-triangle matrix (callers mirror it into a full
    /// symmetric matrix via `make_frontal_data`). Only entries of `delay_rows`
    /// that fall below `k` (the fully-summed region) are scaled; whether a
    /// delay actually occurs is decided by the APTP kernel at factor time.
    fn make_delay_matrix(m: usize, k: usize, delay_rows: &[usize]) -> Mat<f64> {
        let mut a = Mat::zeros(m, m);

        // Start with a diagonally dominant indefinite matrix
        for i in 0..m {
            // Alternate positive/negative diagonal for indefiniteness
            a[(i, i)] = if i % 2 == 0 { 4.0 } else { -4.0 };
            // Add some off-diagonal coupling (decays with distance from diagonal)
            for j in 0..i {
                let val = 0.5 / ((i - j) as f64 + 1.0);
                a[(i, j)] = val;
            }
        }

        // Scale delay_rows to create threshold failures:
        // Multiply row/col by 1000 but NOT the diagonal, so off-diag >> diag
        for &r in delay_rows {
            if r < k {
                // Scale off-diagonal entries in row r (lower-triangle part)
                for j in 0..r {
                    a[(r, j)] *= 1000.0;
                }
                // ... and in column r below the diagonal
                for i in (r + 1)..m {
                    a[(i, r)] *= 1000.0;
                }
                // DON'T scale diagonal — this makes |a_rr| << |a_ir| => threshold failure
            }
        }

        a
    }
2884
2885    /// Test that extract_front_factors produces correct L21 dimensions when
2886    /// delays are present: L21 should have (m - ne) rows, NOT (m - k) rows.
2887    /// This is the regression test for the bug fixed in this commit.
2888    #[test]
2889    fn test_extract_front_factors_l21_includes_delayed_rows() {
2890        let m = 12;
2891        let k = 8;
2892        let delay_rows = &[1, 5]; // Columns 1 and 5 should be delayed
2893
2894        let lower = make_delay_matrix(m, k, delay_rows);
2895        let row_indices: Vec<usize> = (100..100 + m).collect(); // arbitrary global indices
2896
2897        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
2898
2899        let options = AptpOptions::default();
2900        let result =
2901            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
2902
2903        let ne = result.num_eliminated;
2904        let num_delayed = k - ne;
2905
2906        // The bug: if delays occurred, the old code would make L21 have (m - k) rows
2907        // instead of (m - ne) rows, missing the delayed-row entries.
2908        if num_delayed > 0 {
2909            let ff = extract_front_factors(&data, m, k, &row_indices, &result);
2910
2911            // L21 must have (m - ne) rows = delayed_rows + non-fully-summed rows
2912            assert_eq!(
2913                ff.l21.nrows(),
2914                m - ne,
2915                "L21 should have m-ne={} rows (not m-k={}), ne={}, delays={}",
2916                m - ne,
2917                m - k,
2918                ne,
2919                num_delayed
2920            );
2921
2922            // row_indices must match L21 rows
2923            assert_eq!(
2924                ff.row_indices.len(),
2925                ff.l21.nrows(),
2926                "row_indices length must match L21 rows"
2927            );
2928
2929            // col_indices must have ne entries
2930            assert_eq!(ff.col_indices.len(), ne);
2931        } else {
2932            // No delays — still verify basic shapes
2933            let ff = extract_front_factors(&data, m, k, &row_indices, &result);
2934            assert_eq!(ff.l21.nrows(), m - ne);
2935            assert_eq!(ff.row_indices.len(), ff.l21.nrows());
2936        }
2937    }
2938
2939    /// Test that delayed rows in extract_front_factors and extract_contribution
2940    /// have consistent row_indices. Both functions MUST agree on which global
2941    /// indices correspond to delayed rows.
2942    #[test]
2943    fn test_extract_front_factors_contribution_row_indices_consistent() {
2944        let m = 10;
2945        let k = 6;
2946        let delay_rows = &[2]; // Force column 2 to be delayed
2947
2948        let lower = make_delay_matrix(m, k, delay_rows);
2949        let row_indices: Vec<usize> = (200..200 + m).collect();
2950
2951        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
2952
2953        let options = AptpOptions::default();
2954        let result =
2955            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
2956
2957        let ne = result.num_eliminated;
2958        let num_delayed = k - ne;
2959
2960        if num_delayed > 0 {
2961            let ff = extract_front_factors(&data, m, k, &row_indices, &result);
2962            let nfs = m - k;
2963            let mut contrib_buf = Mat::zeros(m, m);
2964            if nfs > 0 {
2965                let mut ld_ws = Mat::new();
2966                compute_contribution_gemm(
2967                    &data,
2968                    k,
2969                    ne,
2970                    m,
2971                    &result.d,
2972                    &mut contrib_buf,
2973                    &mut ld_ws,
2974                    Par::Seq,
2975                );
2976            }
2977            let cb = extract_contribution(&data, m, k, &row_indices, &result, contrib_buf);
2978
2979            // Both must have the same number of delayed rows
2980            assert_eq!(cb.num_delayed, num_delayed);
2981
2982            // The first num_delayed entries of row_indices must match
2983            let ff_delayed_indices = &ff.row_indices[..num_delayed];
2984            let cb_delayed_indices = &cb.row_indices[..num_delayed];
2985            assert_eq!(
2986                ff_delayed_indices, cb_delayed_indices,
2987                "Delayed row indices must be identical between extract_front_factors \
2988                 and extract_contribution.\n  front_factors: {:?}\n  contribution:  {:?}",
2989                ff_delayed_indices, cb_delayed_indices
2990            );
2991
2992            // The non-fully-summed row indices must also match
2993            let ff_nfs_indices = &ff.row_indices[num_delayed..];
2994            let cb_nfs_indices = &cb.row_indices[num_delayed..];
2995            assert_eq!(
2996                ff_nfs_indices, cb_nfs_indices,
2997                "Non-fully-summed row indices must match"
2998            );
2999        }
3000    }
3001
3002    /// Test that L21 entries at delayed row positions are non-zero.
3003    /// When APTP delays a column, the TRSM (apply_and_check) has already
3004    /// computed valid L entries at those row positions. These must be preserved
3005    /// in the extracted L21.
3006    #[test]
3007    fn test_extract_front_factors_delayed_l21_entries_populated() {
3008        let m = 12;
3009        let k = 8;
3010        // Use multiple delay candidates to increase chance of delays
3011        let delay_rows = &[0, 3, 5];
3012
3013        let lower = make_delay_matrix(m, k, delay_rows);
3014        let row_indices: Vec<usize> = (0..m).collect();
3015
3016        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);
3017
3018        let options = AptpOptions::default();
3019        let result =
3020            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");
3021
3022        let ne = result.num_eliminated;
3023        let num_delayed = k - ne;
3024
3025        if num_delayed > 0 && ne > 0 {
3026            let ff = extract_front_factors(&data, m, k, &row_indices, &result);
3027
3028            // Check that L21 entries in the delayed-row portion (rows 0..num_delayed)
3029            // are not all zero. These rows received TRSM updates before being delayed.
3030            let delayed_l21 = ff.l21.as_ref().subrows(0, num_delayed);
3031            let mut has_nonzero = false;
3032            for i in 0..delayed_l21.nrows() {
3033                for j in 0..delayed_l21.ncols() {
3034                    if delayed_l21[(i, j)].abs() > 1e-16 {
3035                        has_nonzero = true;
3036                    }
3037                }
3038            }
3039            assert!(
3040                has_nonzero,
3041                "L21 delayed rows should have non-zero entries from TRSM. \
3042                 ne={}, delays={}, L21 shape=({},{})",
3043                ne,
3044                num_delayed,
3045                ff.l21.nrows(),
3046                ff.l21.ncols()
3047            );
3048        }
3049    }
3050
    /// Reconstruction test for extract_front_factors with delays.
    /// Verify that the extracted L11, D11, L21 correctly reconstruct the
    /// factored portion of the frontal matrix: PAP^T[0..ne, 0..ne] = L11 * D11 * L11^T.
    #[test]
    fn test_extract_front_factors_reconstruction_with_delays() {
        let m = 16;
        let k = 10;
        // Matrix engineered so APTP delays pivots at these rows.
        let delay_rows = &[1, 4, 7];

        let lower = make_delay_matrix(m, k, delay_rows);
        let row_indices: Vec<usize> = (0..m).collect();

        // Save a copy of the original symmetric matrix before factoring
        // (the factorization overwrites the frontal buffer in place).
        let mut original = Mat::zeros(m, m);
        for i in 0..m {
            for j in 0..=i {
                original[(i, j)] = lower[(i, j)];
                original[(j, i)] = lower[(i, j)];
            }
        }

        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);

        let options = AptpOptions::default();
        let result =
            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");

        let ne = result.num_eliminated;
        if ne == 0 {
            return; // Nothing to reconstruct
        }

        let ff = extract_front_factors(&data, m, k, &row_indices, &result);

        // Compute L11 * D * L11^T manually
        // Step 1: LD = L11 * D (column-by-column, handling 1x1 and 2x2 pivots)
        let mut ld = Mat::zeros(ne, ne);
        let mut col = 0;
        while col < ne {
            match ff.d11.pivot_type(col) {
                PivotType::OneByOne => {
                    // Scalar pivot: scale the column by the diagonal entry.
                    let d = ff.d11.diagonal_1x1(col);
                    for i in 0..ne {
                        ld[(i, col)] = ff.l11[(i, col)] * d;
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { .. } => {
                    // 2x2 pivot: multiply the column pair by the symmetric
                    // block [[a, b], [b, c]], advancing two columns at once.
                    let block = ff.d11.diagonal_2x2(col);
                    for i in 0..ne {
                        let l0 = ff.l11[(i, col)];
                        let l1 = ff.l11[(i, col + 1)];
                        ld[(i, col)] = l0 * block.a + l1 * block.b;
                        ld[(i, col + 1)] = l0 * block.b + l1 * block.c;
                    }
                    col += 2;
                }
                PivotType::Delayed => {
                    // Delayed pivot: leave the corresponding ld column zero.
                    col += 1;
                }
            }
        }

        // Step 2: reconstructed = LD * L11^T (symmetric, fill both triangles)
        let mut reconstructed = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..=i {
                let mut val = 0.0;
                for p in 0..ne {
                    val += ld[(i, p)] * ff.l11[(j, p)];
                }
                reconstructed[(i, j)] = val;
                reconstructed[(j, i)] = val;
            }
        }

        // Build the permuted original submatrix for comparison
        let mut perm_original = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..=i {
                let oi = result.perm[i];
                let oj = result.perm[j];
                // Read from the stored lower triangle regardless of order.
                let val = if oi >= oj {
                    original[(oi, oj)]
                } else {
                    original[(oj, oi)]
                };
                perm_original[(i, j)] = val;
                perm_original[(j, i)] = val;
            }
        }

        // Check reconstruction: ||PAP^T[0..ne,0..ne] - L11*D11*L11^T|| / ||PAP^T||
        // (squared Frobenius norms accumulated over the lower triangle only)
        let mut diff_norm = 0.0f64;
        let mut orig_norm = 0.0f64;
        for i in 0..ne {
            for j in 0..=i {
                let d = reconstructed[(i, j)] - perm_original[(i, j)];
                diff_norm += d * d;
                orig_norm += perm_original[(i, j)] * perm_original[(i, j)];
            }
        }
        // The max(…, 1e-15) guards against division by zero for a zero block.
        let rel_err = diff_norm.sqrt() / orig_norm.sqrt().max(1e-15);

        assert!(
            rel_err < 1e-10,
            "L11*D11*L11^T reconstruction error: {:.2e} (ne={}, delays={})",
            rel_err,
            ne,
            k - ne
        );
    }
3163
    /// Verify that the extracted factors are consistent when delays are
    /// present, using a single dense frontal matrix (simulating one supernode).
    ///
    /// NOTE(review): despite the "solve_roundtrip" name, this test does not
    /// execute solve steps; it checks that the full-height factor
    /// L = [L11; L21] reconstructs the eliminated block of the permuted
    /// input. That is the data the forward/backward solve gather/scatter
    /// path consumes, so a broken extract_front_factors (e.g. wrong
    /// delayed-row L21 entries) fails here.
    #[test]
    fn test_extract_front_factors_solve_roundtrip_with_delays() {
        let m = 12;
        let k = 8; // fully-summed columns

        // Matrix engineered so APTP delays pivots at rows 0, 3, 5.
        let lower = make_delay_matrix(m, k, &[0, 3, 5]);
        let row_indices: Vec<usize> = (0..m).collect();

        // Build full symmetric matrix for reference (factoring is in-place).
        let mut a_full = Mat::zeros(m, m);
        for i in 0..m {
            for j in 0..=i {
                a_full[(i, j)] = lower[(i, j)];
                a_full[(j, i)] = lower[(i, j)];
            }
        }

        let (mut data, row_indices, k) = make_frontal_data(&lower, k, row_indices);

        let options = AptpOptions::default();
        let result =
            aptp_factor_in_place(data.as_mut(), k, &options).expect("factor should succeed");

        let ne = result.num_eliminated;
        let num_delayed = k - ne;

        // Only meaningful if we actually got some delays and some eliminations
        if num_delayed == 0 || ne == 0 {
            return;
        }

        let ff = extract_front_factors(&data, m, k, &row_indices, &result);

        // Check that L11, D11, L21 are internally consistent:
        // L * D * L^T (where L = [L11; L21]) should reconstruct the factored
        // portion of A when permuted.

        // Build full L = [L11; L21] and check L * D * L^T = P * A[0..m, 0..ne] * P^T
        let r = ff.l21.nrows();
        let full_l_rows = ne + r;
        assert_eq!(full_l_rows, m, "L should cover all rows");

        // Compute L*D*L^T for the eliminated portion
        let mut ld = Mat::zeros(full_l_rows, ne);

        // L = [L11; L21] stacked vertically into one dense matrix.
        let mut full_l = Mat::zeros(full_l_rows, ne);
        for i in 0..ne {
            for j in 0..ne {
                full_l[(i, j)] = ff.l11[(i, j)];
            }
        }
        for i in 0..r {
            for j in 0..ne {
                full_l[(ne + i, j)] = ff.l21[(i, j)];
            }
        }

        // LD = L * D (handling 1x1 and 2x2)
        let mut col = 0;
        while col < ne {
            match ff.d11.pivot_type(col) {
                PivotType::OneByOne => {
                    // Scalar pivot: scale the column by the diagonal entry.
                    let d = ff.d11.diagonal_1x1(col);
                    for i in 0..full_l_rows {
                        ld[(i, col)] = full_l[(i, col)] * d;
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { .. } => {
                    // 2x2 pivot: multiply the column pair by the symmetric
                    // block [[a, b], [b, c]], advancing two columns at once.
                    let block = ff.d11.diagonal_2x2(col);
                    for i in 0..full_l_rows {
                        let l0 = full_l[(i, col)];
                        let l1 = full_l[(i, col + 1)];
                        ld[(i, col)] = l0 * block.a + l1 * block.b;
                        ld[(i, col + 1)] = l0 * block.b + l1 * block.c;
                    }
                    col += 2;
                }
                PivotType::Delayed => {
                    // Delayed pivot: leave the corresponding ld column zero.
                    col += 1;
                }
            }
        }

        // reconstructed = LD * L^T (symmetric, fill both triangles)
        let mut reconstructed = Mat::zeros(full_l_rows, full_l_rows);
        for i in 0..full_l_rows {
            for j in 0..=i {
                let mut val = 0.0;
                for p in 0..ne {
                    val += ld[(i, p)] * full_l[(j, p)];
                }
                reconstructed[(i, j)] = val;
                reconstructed[(j, i)] = val;
            }
        }

        // Build permuted original: row_order = col_indices ++ row_indices
        let mut perm_rows = Vec::with_capacity(full_l_rows);
        perm_rows.extend_from_slice(ff.col_indices());
        perm_rows.extend_from_slice(ff.row_indices());

        // Check that L*D*L^T[0..ne, 0..ne] ≈ A_perm[0..ne, 0..ne]
        let mut diff_norm = 0.0f64;
        let mut orig_norm = 0.0f64;
        for i in 0..ne {
            for j in 0..=i {
                let gi = perm_rows[i];
                let gj = perm_rows[j];
                // Read from the stored lower triangle regardless of order.
                let orig_val = if gi >= gj {
                    a_full[(gi, gj)]
                } else {
                    a_full[(gj, gi)]
                };
                let d = reconstructed[(i, j)] - orig_val;
                diff_norm += d * d;
                orig_norm += orig_val * orig_val;
            }
        }
        // The max(…, 1e-15) guards against division by zero for a zero block.
        let rel_err = diff_norm.sqrt() / orig_norm.sqrt().max(1e-15);
        assert!(
            rel_err < 1e-10,
            "Full L*D*L^T reconstruction error: {:.2e} (ne={}, delays={}, L21_rows={})",
            rel_err,
            ne,
            num_delayed,
            r
        );
    }
3302
    /// Test scatter_original_entries_multi with non-contiguous owned_ranges.
    ///
    /// After amalgamation, a merged supernode may own columns from multiple
    /// non-contiguous ranges (e.g., [0..2, 4..6] when child columns 2,3 were
    /// NOT merged). This test verifies:
    ///
    /// 1. Entries from all owned ranges are scattered into the frontal matrix
    /// 2. Upper-triangle deduplication works correctly across non-contiguous
    ///    ranges (an entry with both row and col in owned ranges is only counted once)
    /// 3. Entries where one index is owned and the other is a non-fully-summed
    ///    row are scattered correctly
    #[test]
    fn test_scatter_original_entries_multi_non_contiguous() {
        use faer::sparse::{SparseColMat, Triplet};

        // Build a small 6×6 symmetric matrix with identity permutation.
        // Owned ranges: [0..2, 4..6] (columns 0,1,4,5 owned; 2,3 not owned).
        //
        // The frontal has 6 rows: locals 0..4 are owned (4 total_owned),
        // no delayed children, rows 4..6 are non-fully-summed (rows for
        // global indices 2 and 3).
        //
        // Matrix (lower triangle values, symmetric):
        //     col: 0    1    2    3    4    5
        // 0: [10.0                          ]
        // 1: [ 1.0  11.0                    ]
        // 2: [ 2.0   0.0  12.0              ]
        // 3: [ 0.0   3.0   0.0  13.0        ]
        // 4: [ 4.0   0.0   0.0   0.0  14.0  ]
        // 5: [ 0.0   5.0   6.0   0.0   7.0  15.0]
        let triplets = vec![
            // Diagonal
            Triplet::new(0, 0, 10.0),
            Triplet::new(1, 1, 11.0),
            Triplet::new(2, 2, 12.0),
            Triplet::new(3, 3, 13.0),
            Triplet::new(4, 4, 14.0),
            Triplet::new(5, 5, 15.0),
            // Off-diagonal (both triangles for full CSC)
            Triplet::new(1, 0, 1.0),
            Triplet::new(0, 1, 1.0), // mirror
            Triplet::new(2, 0, 2.0),
            Triplet::new(0, 2, 2.0), // mirror
            Triplet::new(3, 1, 3.0),
            Triplet::new(1, 3, 3.0), // mirror
            Triplet::new(4, 0, 4.0),
            Triplet::new(0, 4, 4.0), // mirror
            Triplet::new(5, 1, 5.0),
            Triplet::new(1, 5, 5.0), // mirror
            Triplet::new(5, 2, 6.0),
            Triplet::new(2, 5, 6.0), // mirror
            Triplet::new(5, 4, 7.0),
            Triplet::new(4, 5, 7.0), // mirror
        ];
        let matrix = SparseColMat::try_new_from_triplets(6, 6, &triplets).expect("valid CSC");

        // Identity permutation
        let perm_fwd: Vec<usize> = (0..6).collect();
        let perm_inv: Vec<usize> = (0..6).collect();

        // Frontal rows: owned columns first (0,1,4,5), then non-owned rows (2,3)
        let frontal_rows = vec![0, 1, 4, 5, 2, 3];
        let total_owned = 4; // columns 0,1,4,5
        let m = frontal_rows.len();
        let k = total_owned; // num_fully_summed = total_owned (no delayed children)

        // Build global_to_local mapping; NOT_IN_FRONT marks globals that do
        // not appear in this front.
        let mut global_to_local = vec![NOT_IN_FRONT; 6];
        for (local, &global) in frontal_rows.iter().enumerate() {
            global_to_local[global] = local;
        }

        let mut frontal_data = Mat::zeros(m, m);

        // Non-contiguous owned ranges (the key case)
        let owned_ranges = vec![0..2, 4..6];

        // Scoped so the FrontalMatrix's mutable borrow of frontal_data ends
        // before the verification reads below.
        {
            let mut frontal = FrontalMatrix {
                data: frontal_data.as_mut(),
                row_indices: &frontal_rows,
                num_fully_summed: k,
            };

            scatter_original_entries_multi(
                &mut frontal,
                &matrix,
                &perm_fwd,
                &perm_inv,
                &global_to_local,
                &owned_ranges,
                None,
            );
        }

        // Verify scattered values.
        // Local indices: 0→global 0, 1→global 1, 2→global 4, 3→global 5, 4→global 2, 5→global 3
        //
        // Expected entries in the frontal (lower triangle):
        //   (0,0)=10.0  diagonal of global 0
        //   (1,0)=1.0   global(1,0) → local(1,0)
        //   (1,1)=11.0  diagonal of global 1
        //   (2,0)=4.0   global(4,0) → local(2,0)
        //   (2,2)=14.0  diagonal of global 4
        //   (3,1)=5.0   global(5,1) → local(3,1)
        //   (3,2)=7.0   global(5,4) → local(3,2)
        //   (3,3)=15.0  diagonal of global 5
        //   (4,0)=2.0   global(2,0) → local(4,0)  [non-owned row, owned col]
        //   (4,3)=6.0   global(2,5) → local(4,3)  [non-owned row, owned col]
        //   (5,1)=3.0   global(3,1) → local(5,1)  [non-owned row, owned col]
        //
        // Upper-triangle dedup: global(0,1) should be skipped because both
        // 0 and 1 are owned. Similarly global(0,4), global(1,5), global(4,5)
        // are upper-triangle entries between owned cols and should be skipped.

        let f = &frontal_data;

        // Diagonal entries
        assert_eq!(f[(0, 0)], 10.0, "diag global 0");
        assert_eq!(f[(1, 1)], 11.0, "diag global 1");
        assert_eq!(f[(2, 2)], 14.0, "diag global 4");
        assert_eq!(f[(3, 3)], 15.0, "diag global 5");

        // Off-diagonal between owned cols (lower triangle only)
        assert_eq!(f[(1, 0)], 1.0, "global(1,0) → local(1,0)");
        assert_eq!(f[(2, 0)], 4.0, "global(4,0) → local(2,0)");
        assert_eq!(f[(3, 1)], 5.0, "global(5,1) → local(3,1)");
        assert_eq!(f[(3, 2)], 7.0, "global(5,4) → local(3,2)");

        // Non-owned rows (global 2 → local 4, global 3 → local 5)
        assert_eq!(f[(4, 0)], 2.0, "global(2,0) → local(4,0)");
        assert_eq!(f[(4, 3)], 6.0, "global(2,5) → local(4,3)");
        assert_eq!(f[(5, 1)], 3.0, "global(3,1) → local(5,1)");

        // Verify no spurious double-counting: the (0,1) position should be 1.0,
        // not 2.0 (which would happen if upper-triangle entry was also scattered)
        assert_eq!(
            f[(1, 0)],
            1.0,
            "upper-triangle dedup: (1,0) should be 1.0, not 2.0"
        );

        // Cross-range upper-triangle dedup: global(4,0) upper entry global(0,4)
        // should NOT be scattered separately, so local(2,0) should be 4.0
        assert_eq!(
            f[(2, 0)],
            4.0,
            "cross-range upper-triangle dedup: (2,0) should be 4.0, not 8.0"
        );

        // Verify zeros where no entries exist
        assert_eq!(f[(4, 1)], 0.0, "no global(2,1) entry");
        assert_eq!(f[(4, 2)], 0.0, "no global(2,4) entry");
        assert_eq!(f[(5, 0)], 0.0, "no global(3,0) entry");
        assert_eq!(f[(5, 2)], 0.0, "no global(3,4) entry");
        assert_eq!(f[(5, 3)], 0.0, "no global(3,5) entry");
    }
3460
3461    // -----------------------------------------------------------------------
3462    // extend_add_mapped: precomputed-map scatter (zero-delay fast path)
3463    // -----------------------------------------------------------------------
3464
3465    /// Test that `extend_add_mapped` correctly scatters a child contribution
3466    /// into a parent frontal matrix using a precomputed u32 row mapping,
3467    /// handling the lower-triangle symmetry swap when child ordering differs
3468    /// from parent ordering (non-monotonic map from amalgamated supernodes).
3469    #[test]
3470    fn test_extend_add_mapped_hand_constructed() {
3471        // Parent frontal: 5×5, initially zero (represents assembled parent).
3472        let mut parent_data = Mat::<f64>::zeros(5, 5);
3473        let parent_row_indices = vec![10, 20, 30, 40, 50]; // global indices (unused by mapped path)
3474        let mut parent = FrontalMatrix {
3475            data: parent_data.as_mut(),
3476            row_indices: &parent_row_indices,
3477            num_fully_summed: 3,
3478        };
3479
3480        // Child contribution: 3×3 lower triangle.
3481        //   [1.0          ]
3482        //   [2.0  3.0     ]
3483        //   [4.0  5.0  6.0]
3484        let mut child_data = Mat::<f64>::zeros(3, 3);
3485        child_data[(0, 0)] = 1.0;
3486        child_data[(1, 0)] = 2.0;
3487        child_data[(1, 1)] = 3.0;
3488        child_data[(2, 0)] = 4.0;
3489        child_data[(2, 1)] = 5.0;
3490        child_data[(2, 2)] = 6.0;
3491
3492        let child = ContributionBlock {
3493            data: child_data,
3494            row_indices: vec![30, 10, 40], // global indices
3495            num_delayed: 0,
3496        };
3497
3498        // Precomputed map: child row 0 → parent local 2 (global 30),
3499        //                  child row 1 → parent local 0 (global 10),
3500        //                  child row 2 → parent local 3 (global 40).
3501        // Note: child row 1 maps to a LOWER parent index than child row 0,
3502        // so extend_add_mapped must handle the li < lj swap case.
3503        let ea_row_map: Vec<u32> = vec![2, 0, 3];
3504
3505        let recycled = extend_add_mapped(&mut parent, child, &ea_row_map);
3506
3507        // Verify buffer is returned for recycling
3508        assert_eq!(recycled.nrows(), 3);
3509        assert_eq!(recycled.ncols(), 3);
3510
3511        // Expected scatter (lower triangle of parent):
3512        //
3513        // child(0,0)=1.0: li=2, lj=2 → parent(2,2) += 1.0
3514        // child(1,0)=2.0: li=0, lj=2 → li < lj, swap → parent(2,0) += 2.0
3515        // child(1,1)=3.0: li=0, lj=0 → parent(0,0) += 3.0
3516        // child(2,0)=4.0: li=3, lj=2 → parent(3,2) += 4.0
3517        // child(2,1)=5.0: li=3, lj=0 → parent(3,0) += 5.0
3518        // child(2,2)=6.0: li=3, lj=3 → parent(3,3) += 6.0
3519
3520        let p = parent.data.as_ref();
3521        assert_eq!(p[(0, 0)], 3.0, "child(1,1)=3.0 → parent(0,0)");
3522        assert_eq!(p[(2, 0)], 2.0, "child(1,0)=2.0 swapped → parent(2,0)");
3523        assert_eq!(p[(2, 2)], 1.0, "child(0,0)=1.0 → parent(2,2)");
3524        assert_eq!(p[(3, 0)], 5.0, "child(2,1)=5.0 → parent(3,0)");
3525        assert_eq!(p[(3, 2)], 4.0, "child(2,0)=4.0 → parent(3,2)");
3526        assert_eq!(p[(3, 3)], 6.0, "child(2,2)=6.0 → parent(3,3)");
3527
3528        // Verify no spurious entries (untouched positions remain zero)
3529        assert_eq!(p[(1, 0)], 0.0, "row 1 untouched");
3530        assert_eq!(p[(1, 1)], 0.0, "row 1 untouched");
3531        assert_eq!(p[(4, 0)], 0.0, "row 4 untouched");
3532        assert_eq!(p[(4, 4)], 0.0, "row 4 untouched");
3533    }
3534
3535    /// Test column-oriented `extend_add_mapped` with a larger 10×10
3536    /// contribution (monotonic map), comparing against a naive reference.
3537    #[test]
3538    fn test_extend_add_mapped_10x10_vs_reference() {
3539        let cb_size = 10;
3540        let parent_size = 15;
3541
3542        // Monotonically increasing map: child rows map to
3543        // parent rows [1, 3, 4, 5, 7, 8, 10, 11, 12, 14].
3544        let ea_row_map: Vec<u32> = vec![1, 3, 4, 5, 7, 8, 10, 11, 12, 14];
3545
3546        // Build a child contribution with known lower-triangle values.
3547        // Use value = (i+1)*100 + (j+1) for easy identification.
3548        let mut child_data = Mat::<f64>::zeros(cb_size, cb_size);
3549        for j in 0..cb_size {
3550            for i in j..cb_size {
3551                child_data[(i, j)] = (i as f64 + 1.0) * 100.0 + (j as f64 + 1.0);
3552            }
3553        }
3554
3555        // Reference: naive element-by-element scatter with lower-triangle swap
3556        let mut ref_parent = Mat::<f64>::zeros(parent_size, parent_size);
3557        for j in 0..cb_size {
3558            let lj = ea_row_map[j] as usize;
3559            for i in j..cb_size {
3560                let li = ea_row_map[i] as usize;
3561                if li >= lj {
3562                    ref_parent[(li, lj)] += child_data[(i, j)];
3563                } else {
3564                    ref_parent[(lj, li)] += child_data[(i, j)];
3565                }
3566            }
3567        }
3568
3569        // Actual: column-oriented extend_add_mapped
3570        let mut parent_data = Mat::<f64>::zeros(parent_size, parent_size);
3571        let parent_row_indices: Vec<usize> = (0..parent_size).collect();
3572        let mut parent = FrontalMatrix {
3573            data: parent_data.as_mut(),
3574            row_indices: &parent_row_indices,
3575            num_fully_summed: 5,
3576        };
3577        let child = ContributionBlock {
3578            data: child_data,
3579            row_indices: (0..cb_size).collect(),
3580            num_delayed: 0,
3581        };
3582        let _ = extend_add_mapped(&mut parent, child, &ea_row_map);
3583
3584        // Compare every element
3585        for j in 0..parent_size {
3586            for i in 0..parent_size {
3587                assert_eq!(
3588                    parent.data[(i, j)],
3589                    ref_parent[(i, j)],
3590                    "mismatch at ({}, {})",
3591                    i,
3592                    j
3593                );
3594            }
3595        }
3596    }
3597
3598    /// Test column-oriented `extend_add_mapped` with a non-monotonic map
3599    /// (simulating post-amalgamation parent with non-contiguous owned ranges).
3600    #[test]
3601    fn test_extend_add_mapped_non_monotonic() {
3602        let cb_size = 4;
3603        let parent_size = 8;
3604
3605        // Non-monotonic map: simulates child pattern rows mapping into a
3606        // parent whose owned_ranges are interleaved with pattern rows.
3607        let ea_row_map: Vec<u32> = vec![5, 2, 7, 1];
3608
3609        // Build child contribution with lower-triangle values
3610        let mut child_data = Mat::<f64>::zeros(cb_size, cb_size);
3611        for j in 0..cb_size {
3612            for i in j..cb_size {
3613                child_data[(i, j)] = (i * 10 + j + 1) as f64;
3614            }
3615        }
3616
3617        // Reference: naive scatter with swap
3618        let mut ref_parent = Mat::<f64>::zeros(parent_size, parent_size);
3619        for j in 0..cb_size {
3620            let lj = ea_row_map[j] as usize;
3621            for i in j..cb_size {
3622                let li = ea_row_map[i] as usize;
3623                if li >= lj {
3624                    ref_parent[(li, lj)] += child_data[(i, j)];
3625                } else {
3626                    ref_parent[(lj, li)] += child_data[(i, j)];
3627                }
3628            }
3629        }
3630
3631        // Actual
3632        let mut parent_data = Mat::<f64>::zeros(parent_size, parent_size);
3633        let parent_row_indices: Vec<usize> = (0..parent_size).collect();
3634        let mut parent = FrontalMatrix {
3635            data: parent_data.as_mut(),
3636            row_indices: &parent_row_indices,
3637            num_fully_summed: 4,
3638        };
3639        let child = ContributionBlock {
3640            data: child_data,
3641            row_indices: (0..cb_size).collect(),
3642            num_delayed: 0,
3643        };
3644        let _ = extend_add_mapped(&mut parent, child, &ea_row_map);
3645
3646        // Compare every element
3647        for j in 0..parent_size {
3648            for i in 0..parent_size {
3649                assert_eq!(
3650                    parent.data[(i, j)],
3651                    ref_parent[(i, j)],
3652                    "mismatch at ({}, {})",
3653                    i,
3654                    j
3655                );
3656            }
3657        }
3658    }
3659
3660    /// Test that `extend_add_mapped` accumulates contributions from multiple
3661    /// children into the same parent.
3662    #[test]
3663    fn test_extend_add_mapped_accumulates() {
3664        let mut parent_data = Mat::<f64>::zeros(4, 4);
3665        let parent_row_indices = vec![0, 1, 2, 3];
3666        let mut parent = FrontalMatrix {
3667            data: parent_data.as_mut(),
3668            row_indices: &parent_row_indices,
3669            num_fully_summed: 2,
3670        };
3671
3672        // First child: 2×2, maps to parent rows [1, 3]
3673        let mut c1_data = Mat::<f64>::zeros(2, 2);
3674        c1_data[(0, 0)] = 10.0;
3675        c1_data[(1, 0)] = 20.0;
3676        c1_data[(1, 1)] = 30.0;
3677        let child1 = ContributionBlock {
3678            data: c1_data,
3679            row_indices: vec![1, 3],
3680            num_delayed: 0,
3681        };
3682        let map1: Vec<u32> = vec![1, 3];
3683
3684        let _ = extend_add_mapped(&mut parent, child1, &map1);
3685
3686        // Second child: 2×2, also maps to parent rows [1, 3]
3687        let mut c2_data = Mat::<f64>::zeros(2, 2);
3688        c2_data[(0, 0)] = 5.0;
3689        c2_data[(1, 0)] = 7.0;
3690        c2_data[(1, 1)] = 9.0;
3691        let child2 = ContributionBlock {
3692            data: c2_data,
3693            row_indices: vec![1, 3],
3694            num_delayed: 0,
3695        };
3696        let map2: Vec<u32> = vec![1, 3];
3697
3698        let _ = extend_add_mapped(&mut parent, child2, &map2);
3699
3700        let p = parent.data.as_ref();
3701        assert_eq!(p[(1, 1)], 15.0, "10 + 5 accumulated");
3702        assert_eq!(p[(3, 1)], 27.0, "20 + 7 accumulated");
3703        assert_eq!(p[(3, 3)], 39.0, "30 + 9 accumulated");
3704    }
3705
3706    // -----------------------------------------------------------------------
3707    // Small-leaf subtree classification tests
3708    // -----------------------------------------------------------------------
3709
3710    /// Helper: create a SupernodeInfo with given parameters for classification tests.
3711    /// `ncols` = number of owned columns, `pattern_len` = number of off-diagonal rows.
3712    /// front_size = ncols + pattern_len.
3713    #[allow(clippy::single_range_in_vec_init)]
3714    fn make_snode(
3715        col_begin: usize,
3716        ncols: usize,
3717        pattern_len: usize,
3718        parent: Option<usize>,
3719    ) -> SupernodeInfo {
3720        SupernodeInfo {
3721            col_begin,
3722            col_end: col_begin + ncols,
3723            pattern: (0..pattern_len).map(|i| 1000 + col_begin + i).collect(),
3724            parent,
3725            owned_ranges: vec![col_begin..col_begin + ncols],
3726            in_small_leaf: false,
3727        }
3728    }
3729
3730    /// Linear chain of 5 small supernodes — all should be in_small_leaf.
3731    #[test]
3732    fn test_classify_all_small() {
3733        // Chain: 0 → 1 → 2 → 3 → 4 (each has 2 owned cols, 3 pattern → front_size=5)
3734        let mut supernodes = vec![
3735            make_snode(0, 2, 3, Some(1)),
3736            make_snode(2, 2, 3, Some(2)),
3737            make_snode(4, 2, 3, Some(3)),
3738            make_snode(6, 2, 3, Some(4)),
3739            make_snode(8, 2, 3, None),
3740        ];
3741        let children = build_children_map(&supernodes);
3742
3743        let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3744
3745        // All 5 nodes should be in_small_leaf
3746        for sn in &supernodes {
3747            assert!(
3748                sn.in_small_leaf,
3749                "snode {} should be in_small_leaf",
3750                sn.col_begin
3751            );
3752        }
3753        // One subtree with all 5 nodes in postorder
3754        assert_eq!(subtrees.len(), 1);
3755        assert_eq!(subtrees[0].nodes.len(), 5);
3756        assert_eq!(subtrees[0].nodes, vec![0, 1, 2, 3, 4]);
3757        assert_eq!(subtrees[0].root, 4);
3758        assert_eq!(subtrees[0].parent_of_root, None);
3759    }
3760
3761    /// Mixed tree — small leaves, large root.
3762    #[test]
3763    fn test_classify_mixed_tree() {
3764        // Tree: leaves 0,1 → parent 2 (large)
3765        // Nodes 0,1: front_size=5 (small)
3766        // Node 2: front_size=300 (large, >= 256)
3767        let mut supernodes = vec![
3768            make_snode(0, 2, 3, Some(2)),  // front=5, small
3769            make_snode(2, 2, 3, Some(2)),  // front=5, small
3770            make_snode(4, 100, 200, None), // front=300, large
3771        ];
3772        let children = build_children_map(&supernodes);
3773
3774        let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3775
3776        // Node 2 is large → not in_small_leaf
3777        assert!(!supernodes[2].in_small_leaf);
3778        // Nodes 0,1 are small leaves but they don't form a subtree with ≥2 nodes
3779        // that has a common root — they are separate single-leaf subtrees
3780        // Each leaf alone is 1 node which is < 2, so no subtrees
3781        assert!(!supernodes[0].in_small_leaf, "single node excluded");
3782        assert!(!supernodes[1].in_small_leaf, "single node excluded");
3783        assert_eq!(subtrees.len(), 0);
3784    }
3785
3786    /// Single isolated small leaves — no subtree formed (min 2 nodes).
3787    #[test]
3788    fn test_classify_single_node_excluded() {
3789        // Three independent small leaves (no parent linkage between them)
3790        let mut supernodes = vec![
3791            make_snode(0, 2, 3, None), // root, small, standalone
3792            make_snode(2, 2, 3, None), // root, small, standalone
3793            make_snode(4, 2, 3, None), // root, small, standalone
3794        ];
3795        let children = build_children_map(&supernodes);
3796
3797        let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3798
3799        // Each is a single-node subtree → excluded
3800        assert_eq!(subtrees.len(), 0);
3801        // in_small_leaf should be false for single-node subtrees (cleared by filter)
3802        for sn in &supernodes {
3803            assert!(!sn.in_small_leaf);
3804        }
3805    }
3806
3807    /// Threshold boundary — front_size == 256 excluded, 255 included.
3808    #[test]
3809    fn test_classify_threshold_boundary() {
3810        // Node 0: front_size=255 (qualifies), child of node 1
3811        // Node 1: front_size=256 (does NOT qualify — strict less-than)
3812        let mut supernodes = vec![
3813            make_snode(0, 100, 155, Some(1)), // front=255
3814            make_snode(100, 128, 128, None),  // front=256
3815        ];
3816        let children = build_children_map(&supernodes);
3817
3818        let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3819
3820        assert!(
3821            !supernodes[1].in_small_leaf,
3822            "front_size=256 must NOT qualify"
3823        );
3824        // Node 0 is a small leaf but its parent is NOT small → single-node subtree → excluded
3825        assert!(!supernodes[0].in_small_leaf, "single node excluded");
3826        assert_eq!(subtrees.len(), 0);
3827
3828        // Now with front_size=255 for the parent too
3829        let mut supernodes2 = vec![
3830            make_snode(0, 100, 155, Some(1)), // front=255
3831            make_snode(100, 127, 128, None),  // front=255
3832        ];
3833        let children2 = build_children_map(&supernodes2);
3834        let subtrees2 = classify_small_leaf_subtrees(&mut supernodes2, &children2, 256);
3835
3836        assert!(supernodes2[0].in_small_leaf);
3837        assert!(supernodes2[1].in_small_leaf);
3838        assert_eq!(subtrees2.len(), 1);
3839        assert_eq!(subtrees2[0].nodes, vec![0, 1]);
3840    }
3841
3842    /// Disabled when threshold=0.
3843    #[test]
3844    fn test_classify_disabled() {
3845        let mut supernodes = vec![make_snode(0, 2, 3, Some(1)), make_snode(2, 2, 3, None)];
3846        let children = build_children_map(&supernodes);
3847
3848        let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 0);
3849
3850        assert_eq!(subtrees.len(), 0);
3851        assert!(!supernodes[0].in_small_leaf);
3852        assert!(!supernodes[1].in_small_leaf);
3853    }
3854
3855    /// Two independent small-leaf subtrees.
3856    #[test]
3857    fn test_classify_multiple_subtrees() {
3858        // Tree:
3859        //   0 → 2 (subtree A)
3860        //   1 → 2 (subtree A)
3861        //   2 → 5 (large root)
3862        //   3 → 4 (subtree B)
3863        //   4 → 5 (subtree B is just 3→4)
3864        //   5 = large root
3865        let mut supernodes = vec![
3866            make_snode(0, 2, 3, Some(2)),   // 0: front=5, small
3867            make_snode(2, 2, 3, Some(2)),   // 1: front=5, small
3868            make_snode(4, 2, 3, Some(5)),   // 2: front=5, small (parent is large)
3869            make_snode(6, 2, 3, Some(4)),   // 3: front=5, small
3870            make_snode(8, 2, 3, Some(5)),   // 4: front=5, small (parent is large)
3871            make_snode(10, 100, 200, None), // 5: front=300, large
3872        ];
3873        let children = build_children_map(&supernodes);
3874
3875        let subtrees = classify_small_leaf_subtrees(&mut supernodes, &children, 256);
3876
3877        // Two subtrees: {0,1,2} and {3,4}
3878        assert_eq!(subtrees.len(), 2);
3879
3880        // Subtree A: root=2, nodes=[0,1,2] in postorder
3881        assert_eq!(subtrees[0].root, 2);
3882        assert_eq!(subtrees[0].nodes, vec![0, 1, 2]);
3883        assert_eq!(subtrees[0].parent_of_root, Some(5));
3884
3885        // Subtree B: root=4, nodes=[3,4] in postorder
3886        assert_eq!(subtrees[1].root, 4);
3887        assert_eq!(subtrees[1].nodes, vec![3, 4]);
3888        assert_eq!(subtrees[1].parent_of_root, Some(5));
3889
3890        // Large root is not in_small_leaf
3891        assert!(!supernodes[5].in_small_leaf);
3892        // All others are
3893        for (s, sn) in supernodes.iter().enumerate().take(5) {
3894            assert!(sn.in_small_leaf, "snode {} should be in_small_leaf", s);
3895        }
3896    }
3897}