// rivrs_sparse/symmetric/factor.rs
//! Dense APTP (A Posteriori Threshold Pivoting) factorization kernel.
//!
//! Implements the APTP algorithm for dense symmetric indefinite matrices —
//! the core numerical kernel for the SSIDS multifrontal solver. The kernel
//! factors a dense frontal matrix in place using an optimistic 1x1 pivot
//! strategy with a posteriori stability checking, falling back to 2x2
//! Bunch-Kaufman pivots or column delay when stability bounds are violated.
//!
//! # Algorithm
//!
//! Implements the single-level (column-by-column) APTP strategy from
//! Duff, Hogg & Lopez (2020), "A New Sparse LDL^T Solver Using A Posteriori
//! Threshold Pivoting", SIAM J. Sci. Comput. 42(4).
//!
//! For each column k (left-to-right):
//! 1. Attempt 1x1 pivot: compute L column, check stability bound |l_ij| < 1/u
//! 2. On pass: accept, apply rank-1 Schur complement update
//! 3. On fail: either attempt 2x2 Bunch-Kaufman pivot (with best partner) or
//!    immediately delay, depending on [`AptpFallback`] strategy
//! 4. After all columns: permute delayed columns to end of ordering
//!
//! # Complexity
//!
//! - Time: O(n^3) for full n x n factorization (dominated by Schur complement updates)
//! - Space: O(n^2) for the input matrix (factored in place) plus O(n) for D and permutation
//!
//! # References
//!
//! - Duff, Hogg & Lopez (2020), "A New Sparse LDL^T Solver Using A Posteriori
//!   Threshold Pivoting", SIAM J. Sci. Comput. 42(4)
//! - Bunch & Kaufman (1977), "Some Stable Methods for Calculating Inertia and
//!   Solving Symmetric Linear Systems", Math. Comp.

// SPRAL Equivalent: `LDLT<T, BLOCK_SIZE, Backup>::factor()` in
// `spral/src/ssids/cpu/kernels/ldlt_app.cxx` (BSD-3).

use faer::Par;
use faer::linalg::matmul::triangular::{self as tri_matmul, BlockStructure};
use faer::linalg::triangular_solve;
use faer::perm::Perm;
use faer::prelude::*;
use faer::{Accum, Conj, Mat, MatMut, MatRef};

use super::diagonal::MixedDiagonal;
use super::pivot::{Block2x2, PivotType};
use crate::error::SparseError;
use crate::ordering::perm_from_forward;
48
/// Bunch-Kaufman α parameter for 2×2 determinant condition: |det| ≥ α × |a₂₁|².
const BUNCH_KAUFMAN_ALPHA: f64 = 0.5;

/// Growth factor bound for complete pivoting: max |L_ij| ≤ COMPLETE_PIVOTING_GROWTH_BOUND.
/// From Algorithm 4.1 of Duff, Hogg & Lopez (2020).
/// Only referenced from the test module, hence the `cfg(test)` gate.
#[cfg(test)]
const COMPLETE_PIVOTING_GROWTH_BOUND: f64 = 4.0;
56
// ---------------------------------------------------------------------------
// Configuration types
// ---------------------------------------------------------------------------

/// Configuration for the APTP factorization kernel.
///
/// # Defaults
/// - `threshold`: 0.01 (growth factor bound of 100)
/// - `small`: 1e-20 (singularity detection)
/// - `fallback`: [`AptpFallback::BunchKaufman`]
/// - `outer_block_size`: 256 (two-level outer block)
/// - `inner_block_size`: 32 (two-level inner block)
///
/// # Block Size Parameters (Two-Level APTP)
///
/// For frontal matrices larger than `outer_block_size`, the kernel uses
/// a two-level blocked algorithm (Duff, Hogg & Lopez 2020, Section 3):
/// - Outer loop processes blocks of `outer_block_size` columns
/// - Inner loop processes sub-blocks of `inner_block_size` columns
/// - Innermost ib×ib diagonal blocks use complete pivoting (Algorithm 4.1)
///
/// For frontal matrices ≤ `outer_block_size`, the kernel processes the
/// entire fully-summed portion as a single inner block.
///
/// # References
///
/// - Duff, Hogg & Lopez (2020), Section 4: threshold parameter u
/// - Duff, Hogg & Lopez (2020), Section 3: two-level blocking with nb=256, ib=32
#[derive(Debug, Clone)]
pub struct AptpOptions {
    /// Stability threshold u: entries must satisfy |l_ij| < 1/u.
    /// Must be in (0.0, 1.0]. Default: 0.01.
    pub threshold: f64,
    /// Singularity detection: pivots with |d| < small treated as zero.
    /// Must be >= 0.0. Default: 1e-20.
    pub small: f64,
    /// Strategy when a 1x1 pivot fails the stability check.
    pub fallback: AptpFallback,
    /// Outer block size (nb) for two-level APTP. Default: 256.
    /// Must be > 0 and >= inner_block_size.
    pub outer_block_size: usize,
    /// Inner block size (ib) for two-level APTP. Default: 32.
    /// Must be > 0 and <= outer_block_size.
    pub inner_block_size: usize,
    /// Strategy for columns that APTP fails to eliminate. Default: Tpp.
    pub failed_pivot_method: FailedPivotMethod,
    /// Parallelism control for BLAS-3 operations (TRSM, GEMM). Default: `Par::Seq`.
    pub par: Par,
    /// Minimum supernode size for amalgamation. Supernodes with fewer than
    /// `nemin` eliminated columns may be merged with their parent. Default: 32.
    ///
    /// Setting `nemin = 1` disables amalgamation entirely.
    // SPRAL Equivalent: `options%nemin` (`datatypes.f90:21`).
    pub nemin: usize,
    /// Front-size threshold for small-leaf subtree fast path. Default: 256.
    /// Set to 0 to disable.
    pub small_leaf_threshold: usize,
}
115
116impl Default for AptpOptions {
117    fn default() -> Self {
118        Self {
119            threshold: 0.01,
120            small: 1e-20,
121            fallback: AptpFallback::BunchKaufman,
122            outer_block_size: 256,
123            inner_block_size: 32,
124            failed_pivot_method: FailedPivotMethod::Tpp,
125            par: Par::Seq,
126            nemin: 32,
127            small_leaf_threshold: 256,
128        }
129    }
130}
131
/// Fallback strategy when a 1x1 pivot fails the stability check.
///
/// # References
///
/// - Duff, Hogg & Lopez (2020), Algorithm 3.1: fallback strategies
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AptpFallback {
    /// Attempt 2x2 Bunch-Kaufman pivot; delay if that also fails.
    BunchKaufman,
    /// Immediately delay the column without attempting 2x2.
    Delay,
}
144
/// Strategy for handling columns that APTP fails to eliminate.
///
/// When APTP's block-scoped search cannot find acceptable pivots for some
/// columns, the `FailedPivotMethod` controls what happens next.
///
/// # References
///
/// - Duff, Hogg & Lopez (2020), Section 3: TPP fallback after APTP
/// - SPRAL `options%failed_pivot_method`: 0 = pass, 1 = tpp (default)
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailedPivotMethod {
    /// Retry failed columns with serial Threshold Partial Pivoting (default).
    ///
    /// TPP searches ALL remaining columns for acceptable pivots, which
    /// succeeds where APTP's block-scoped search fails.
    Tpp,
    /// Pass failed columns directly to parent as delays (no retry).
    Pass,
}
164
// ---------------------------------------------------------------------------
// Kernel workspace
// ---------------------------------------------------------------------------

/// Pre-allocated reusable buffers for the BLAS-3 inner loop.
///
/// Eliminates per-block heap allocations inside `factor_inner` and
/// `two_level_factor`. Sized to `max_front × inner_block_size` and reused
/// across all block iterations within and across supernodes.
///
/// # References
///
/// - SPRAL `NumericSubtree.hxx:75-81`: per-thread workspace pattern
pub(crate) struct AptpKernelWorkspace {
    /// Block backup for restore-on-failure, max_front × inner_block_size.
    backup: Mat<f64>,
    /// Copy of L11 block for TRSM aliasing avoidance, inner_block_size × inner_block_size.
    l11_buf: Mat<f64>,
    /// L·D product workspace for update_trailing/cross_terms, max_front × inner_block_size.
    ld_buf: Mat<f64>,
    /// Copy buffer for L21/L_panel aliasing avoidance, max_front × inner_block_size.
    copy_buf: Mat<f64>,
}
188
189impl AptpKernelWorkspace {
190    /// Create a new kernel workspace sized for the given maximum front and
191    /// inner block dimensions.
192    pub(crate) fn new(max_front: usize, inner_block_size: usize) -> Self {
193        Self {
194            backup: Mat::zeros(max_front, inner_block_size),
195            l11_buf: Mat::zeros(inner_block_size, inner_block_size),
196            ld_buf: Mat::zeros(max_front, inner_block_size),
197            copy_buf: Mat::zeros(max_front, inner_block_size),
198        }
199    }
200}
201
// ---------------------------------------------------------------------------
// Result types
// ---------------------------------------------------------------------------

/// Result of in-place APTP factorization.
///
/// The L factor is stored in the lower triangle of the mutated input matrix.
/// Only the first `num_eliminated` columns contain valid L entries.
#[derive(Debug)]
pub struct AptpFactorResult {
    /// Block diagonal D factor with mixed 1x1/2x2 blocks.
    pub d: MixedDiagonal,
    /// Column permutation: `perm[i]` = original column index at position i.
    pub perm: Vec<usize>,
    /// Number of successfully eliminated columns (q <= num_fully_summed).
    pub num_eliminated: usize,
    /// Original column indices that were not eliminated.
    pub delayed_cols: Vec<usize>,
    /// Summary statistics for diagnostics.
    pub stats: AptpStatistics,
    /// Per-column diagnostic log.
    pub pivot_log: Vec<AptpPivotRecord>,
}
225
/// Result of convenience APTP factorization (with extracted L).
///
/// Produced by [`aptp_factor`]; unlike [`AptpFactorResult`], the L factor is
/// materialized as an owned matrix rather than left in the input storage.
#[derive(Debug)]
pub struct AptpFactorization {
    /// Unit lower triangular factor (extracted from in-place result).
    pub l: Mat<f64>,
    /// Block diagonal D factor with mixed 1x1/2x2 blocks.
    pub d: MixedDiagonal,
    /// Column permutation as faer Perm type.
    pub perm: Perm<usize>,
    /// Original column indices not eliminated.
    pub delayed_cols: Vec<usize>,
    /// Summary statistics.
    pub stats: AptpStatistics,
    /// Per-column diagnostic log.
    pub pivot_log: Vec<AptpPivotRecord>,
}
242
/// Summary statistics from factorization.
///
/// Invariant: `num_1x1 + 2 * num_2x2 + num_delayed == total_fully_summed_columns`
#[derive(Debug, Clone, Default)]
pub struct AptpStatistics {
    /// Count of 1x1 pivots accepted.
    pub num_1x1: usize,
    /// Count of 2x2 pivot pairs (each pair counts as 1).
    pub num_2x2: usize,
    /// Count of delayed columns.
    pub num_delayed: usize,
    /// Maximum absolute value across all L entries (stability metric).
    pub max_l_entry: f64,
}
257
/// Per-column pivot diagnostic record.
#[derive(Debug, Clone)]
pub struct AptpPivotRecord {
    /// Original column index.
    pub col: usize,
    /// Classification (OneByOne, TwoByTwo, Delayed).
    pub pivot_type: PivotType,
    /// Worst stability metric for this column's L entries.
    pub max_l_entry: f64,
    /// True if 2x2 fallback was attempted (regardless of outcome).
    pub was_fallback: bool,
}
270
271// ---------------------------------------------------------------------------
272// Public API
273// ---------------------------------------------------------------------------
274
275/// Factor a dense symmetric matrix using A Posteriori Threshold Pivoting.
276///
277/// Performs partial LDL^T factorization of the first `num_fully_summed`
278/// columns of the input matrix `a`. The L factor is stored in-place in the
279/// lower triangle of `a`. The contribution block (trailing submatrix after
280/// the fully-summed columns) is updated with the Schur complement.
281///
282/// # Arguments
283/// - `a`: Dense symmetric matrix (m x m), mutated in place. Only the lower
284///   triangle is read; the upper triangle may contain arbitrary values.
285///   On return, the lower triangle of the first `num_eliminated` columns
286///   contains the L factor entries (unit diagonal implicit).
287/// - `num_fully_summed`: Number of columns eligible for elimination (p <= m).
288///   For full factorization, pass `a.nrows()`.
289/// - `options`: APTP configuration (threshold, fallback strategy).
290///
291/// # Returns
292/// `AptpFactorResult` containing D factor, column permutation, delayed
293/// column indices, and diagnostics.
294///
295/// # Errors
296/// Returns `SparseError::InvalidInput` if dimensions are inconsistent.
297///
298/// # Algorithm
299/// Implements the APTP strategy from Duff, Hogg & Lopez (2020).
300/// Single-level (column-by-column) variant.
301pub fn aptp_factor_in_place(
302    mut a: MatMut<'_, f64>,
303    num_fully_summed: usize,
304    options: &AptpOptions,
305) -> Result<AptpFactorResult, SparseError> {
306    let m = a.nrows();
307    if a.ncols() != m {
308        return Err(SparseError::InvalidInput {
309            reason: format!("matrix must be square, got {}x{}", m, a.ncols()),
310        });
311    }
312    if num_fully_summed > m {
313        return Err(SparseError::InvalidInput {
314            reason: format!(
315                "num_fully_summed ({}) > matrix dimension ({})",
316                num_fully_summed, m
317            ),
318        });
319    }
320    if options.threshold <= 0.0 || options.threshold > 1.0 {
321        return Err(SparseError::InvalidInput {
322            reason: format!("threshold must be in (0, 1], got {}", options.threshold),
323        });
324    }
325    if options.small < 0.0 {
326        return Err(SparseError::InvalidInput {
327            reason: format!("small must be >= 0.0, got {}", options.small),
328        });
329    }
330    if options.outer_block_size == 0 {
331        return Err(SparseError::InvalidInput {
332            reason: "outer_block_size must be > 0".to_string(),
333        });
334    }
335    if options.inner_block_size == 0 {
336        return Err(SparseError::InvalidInput {
337            reason: "inner_block_size must be > 0".to_string(),
338        });
339    }
340    if options.inner_block_size > options.outer_block_size {
341        return Err(SparseError::InvalidInput {
342            reason: format!(
343                "inner_block_size ({}) must be <= outer_block_size ({})",
344                options.inner_block_size, options.outer_block_size
345            ),
346        });
347    }
348
349    // Primary factorization dispatch.
350    //
351    // Small fronts (< inner_block_size) use TPP, which tries 2x2 pivots first
352    // on every column. This accepts more 2x2 pivots than complete pivoting
353    // (which only tries 2x2 when the off-diagonal is the global max). For
354    // indefinite matrices, 2x2 pivots pair +/- eigenvalues and produce
355    // better-conditioned factors.
356    // Allocate kernel workspace once for the BLAS-3 paths (reused across
357    // all block iterations within factor_inner / two_level_factor).
358    let mut kernel_ws = AptpKernelWorkspace::new(m, options.inner_block_size);
359
360    let mut result = if num_fully_summed < options.inner_block_size {
361        // Small front: use TPP as primary method
362        tpp_factor_rect(a.rb_mut(), num_fully_summed, options)?
363    } else if num_fully_summed > options.outer_block_size {
364        two_level_factor(a.rb_mut(), num_fully_summed, options, &mut kernel_ws)?
365    } else {
366        factor_inner(
367            a.rb_mut(),
368            num_fully_summed,
369            num_fully_summed,
370            options,
371            &mut kernel_ws,
372        )?
373    };
374
375    // Fallback: TPP on remaining columns
376    if result.num_eliminated < num_fully_summed
377        && options.failed_pivot_method == FailedPivotMethod::Tpp
378    {
379        let q = result.num_eliminated;
380
381        // Grow D to accommodate TPP's writes at positions q..num_fully_summed
382        result.d.grow(num_fully_summed);
383
384        let additional = tpp_factor(
385            a.rb_mut(),
386            q,
387            num_fully_summed,
388            &mut result.perm,
389            &mut result.d,
390            &mut result.stats,
391            &mut result.pivot_log,
392            options,
393        );
394
395        result.num_eliminated = q + additional;
396        result.d.truncate(result.num_eliminated);
397        result.delayed_cols = (result.num_eliminated..num_fully_summed)
398            .map(|i| result.perm[i])
399            .collect();
400        result.stats.num_delayed = num_fully_summed - result.num_eliminated;
401    }
402
403    Ok(result)
404}
405
406/// Factor a dense symmetric matrix using APTP, returning extracted L factor.
407///
408/// Convenience wrapper around [`aptp_factor_in_place`] that:
409/// 1. Copies the input matrix
410/// 2. Calls `aptp_factor_in_place` with `num_fully_summed = n`
411/// 3. Extracts L from the mutated copy
412/// 4. Returns [`AptpFactorization`] with all components as owned types
413///
414/// Suitable for standalone testing and small matrices. For large frontal
415/// matrices in multifrontal factorization, use
416/// [`aptp_factor_in_place`] directly to avoid the copy.
417pub fn aptp_factor(
418    a: MatRef<'_, f64>,
419    options: &AptpOptions,
420) -> Result<AptpFactorization, SparseError> {
421    let n = a.nrows();
422    if a.ncols() != n {
423        return Err(SparseError::InvalidInput {
424            reason: format!("matrix must be square, got {}x{}", n, a.ncols()),
425        });
426    }
427
428    let mut a_copy = a.to_owned();
429    let result = aptp_factor_in_place(a_copy.as_mut(), n, options)?;
430    let l = extract_l(a_copy.as_ref(), &result.d, result.num_eliminated);
431    let perm = perm_from_forward(result.perm.clone())?;
432
433    Ok(AptpFactorization {
434        l,
435        d: result.d,
436        perm,
437        delayed_cols: result.delayed_cols,
438        stats: result.stats,
439        pivot_log: result.pivot_log,
440    })
441}
442
443// ---------------------------------------------------------------------------
444// Internal functions
445// ---------------------------------------------------------------------------
446
447/// Swap symmetric rows/columns i and j in the lower triangle of a dense matrix.
448///
449/// This performs a simultaneous row-and-column permutation on a symmetric
450/// matrix stored in the lower triangle, so that the data at position i
451/// moves to position j and vice versa.
452fn swap_symmetric(mut a: MatMut<'_, f64>, i: usize, j: usize) {
453    let m = a.nrows();
454    swap_symmetric_block(a.rb_mut(), i, j, 0, m);
455}
456
/// Block-scoped symmetric swap: permute rows/columns i and j within
/// a[col_start..row_limit, col_start..row_limit].
///
/// Same as `swap_symmetric` but the "rows k < i" loop starts at `col_start` instead of 0,
/// and the "rows k > j" loop uses `row_limit` instead of `a.nrows()`. This limits the
/// swap to the diagonal block being factored, leaving both previously-factored columns
/// (0..col_start) and panel rows (row_limit..m) untouched.
///
/// Only the lower triangle is touched; the three loops below handle the three
/// regions where a lower-triangle entry of row/column i or j lives relative
/// to the other index (above i, between i and j, below j).
// SPRAL Equivalent: `swap_cols(p, t, BLOCK_SIZE, a, lda, ...)` in `block_ldlt.hxx:68` (BSD-3).
fn swap_symmetric_block(
    mut a: MatMut<'_, f64>,
    i: usize,
    j: usize,
    col_start: usize,
    row_limit: usize,
) {
    if i == j {
        return;
    }
    // Normalize so i < j; the swap is symmetric in its arguments.
    let (i, j) = if i < j { (i, j) } else { (j, i) };

    // Swap diagonals
    let tmp = a[(i, i)];
    a[(i, i)] = a[(j, j)];
    a[(j, j)] = tmp;

    // Rows k < i: swap lower-triangle entries a[(i,k)] and a[(j,k)]
    // LIMITED to col_start..i (don't touch previously-factored columns)
    for k in col_start..i {
        let tmp = a[(i, k)];
        a[(i, k)] = a[(j, k)];
        a[(j, k)] = tmp;
    }

    // Rows i < k < j: swap a[(k,i)] and a[(j,k)]
    // (the (k,i) entry trades places with its transposed counterpart (j,k))
    for k in (i + 1)..j {
        let tmp = a[(k, i)];
        a[(k, i)] = a[(j, k)];
        a[(j, k)] = tmp;
    }

    // Rows k > j: swap a[(k,i)] and a[(k,j)] — LIMITED to row_limit
    for k in (j + 1)..row_limit {
        let tmp = a[(k, i)];
        a[(k, i)] = a[(k, j)];
        a[(k, j)] = tmp;
    }

    // Cross element a[(j,i)] stays unchanged
}
506
/// Rank-1 Schur complement update after eliminating a 1x1 pivot at column `col`.
///
/// Applies `A' = A - l d l^T` to the trailing lower triangle, where `l` is
/// the L column stored below the pivot in column `col`.
#[cfg(test)]
#[allow(dead_code)] // Used by complete_pivoting_factor (also #[cfg(test)])
fn update_schur_1x1(mut a: MatMut<'_, f64>, col: usize, d_value: f64) {
    let m = a.nrows();
    let first = col + 1;
    if first >= m {
        return;
    }

    // Snapshot the L column so the in-place update never reads mutated entries.
    let l_col: Vec<f64> = (first..m).map(|r| a[(r, col)]).collect();

    for (j, &lj) in l_col.iter().enumerate() {
        let ldlj = lj * d_value;
        // Lower triangle only: rows i >= j of the trailing block.
        for (i, &li) in l_col.iter().enumerate().skip(j) {
            a[(first + i, first + j)] -= li * ldlj;
        }
    }
}
527
/// Rank-2 Schur complement update after eliminating a 2x2 pivot.
///
/// Applies `A' = A - L D L^T` to the trailing lower triangle, where L is the
/// two columns stored below the pivot pair and D is the symmetric 2x2 block
/// `[[a, b], [b, c]]`.
#[cfg(test)]
#[allow(dead_code)] // Used by complete_pivoting_factor (also #[cfg(test)])
fn update_schur_2x2(mut a: MatMut<'_, f64>, col: usize, partner: usize, block: &Block2x2) {
    let m = a.nrows();
    let start = col.max(partner) + 1;
    if start >= m {
        return;
    }
    let n_trail = m - start;

    // Snapshot both L columns before mutating the trailing block.
    let l1: Vec<f64> = (0..n_trail).map(|i| a[(start + i, col)]).collect();
    let l2: Vec<f64> = (0..n_trail).map(|i| a[(start + i, partner)]).collect();

    let (d_a, d_b, d_c) = (block.a, block.b, block.c);

    for j in 0..n_trail {
        // Row j of W = L * D, using the symmetric 2x2 D block.
        let w1 = l1[j] * d_a + l2[j] * d_b;
        let w2 = l1[j] * d_b + l2[j] * d_c;
        // Lower triangle only: rows i >= j of the trailing block.
        for i in j..n_trail {
            a[(start + i, start + j)] -= l1[i] * w1 + l2[i] * w2;
        }
    }
}
556
557/// Extract the L factor from the in-place factorized matrix.
558///
559/// Uses the D factor to identify 2x2 pivot blocks, where `a[(k+1, k)]`
560/// contains the D off-diagonal (not an L entry). For 2x2 pivots at
561/// positions (k, k+1), L entries start at row k+2.
562fn extract_l(a: MatRef<'_, f64>, d: &MixedDiagonal, num_eliminated: usize) -> Mat<f64> {
563    let n = a.nrows();
564    let mut l = Mat::zeros(n, n);
565
566    for i in 0..n {
567        l[(i, i)] = 1.0;
568    }
569
570    let mut col = 0;
571    while col < num_eliminated {
572        match d.pivot_type(col) {
573            PivotType::OneByOne => {
574                for i in (col + 1)..n {
575                    l[(i, col)] = a[(i, col)];
576                }
577                col += 1;
578            }
579            PivotType::TwoByTwo { partner } if partner > col => {
580                // a[(col+1, col)] is the D off-diagonal, NOT an L entry.
581                // L entries for the 2x2 block start at row col+2.
582                for i in (col + 2)..n {
583                    l[(i, col)] = a[(i, col)];
584                    l[(i, col + 1)] = a[(i, col + 1)];
585                }
586                col += 2;
587            }
588            _ => {
589                col += 1;
590            }
591        }
592    }
593
594    l
595}
596
// ---------------------------------------------------------------------------
// Two-level APTP: Complete pivoting (Algorithm 4.1)
// ---------------------------------------------------------------------------

/// Factor a small dense symmetric block using complete pivoting.
///
/// Implements Algorithm 4.1 from Duff, Hogg & Lopez (2020): searches
/// the entire remaining submatrix for the entry with maximum magnitude,
/// then uses it as a 1×1 pivot (if on diagonal) or as the off-diagonal
/// of a 2×2 pivot. Provably stable with growth factor bound ≤ 4
/// (equivalent to threshold u=0.25).
///
/// Used at the innermost level of two-level APTP for ib×ib diagonal
/// blocks. Never delays columns (always finds a valid pivot unless
/// the block is numerically singular).
///
/// The matrix is factored in-place (lower triangle). On return:
/// - Columns 0..num_eliminated contain L entries (unit diagonal implicit)
/// - D entries stored in returned MixedDiagonal
/// - The Schur complement is updated in-place in the trailing submatrix
// SPRAL Equivalent: `block_ldlt()` in `spral/src/ssids/cpu/kernels/ldlt_app.cxx` (BSD-3).
#[cfg(test)]
fn complete_pivoting_factor(mut a: MatMut<'_, f64>, small: f64) -> AptpFactorResult {
    let n = a.nrows();
    debug_assert_eq!(
        n,
        a.ncols(),
        "complete_pivoting_factor requires square matrix"
    );

    let mut col_order: Vec<usize> = (0..n).collect();
    let mut d = MixedDiagonal::new(n);
    let mut stats = AptpStatistics::default();
    let mut pivot_log = Vec::with_capacity(n);
    let mut k = 0; // next column to eliminate

    while k < n {
        let remaining = n - k;

        // 1. Find (t, m_idx) = argmax |a[i,j]| over all uneliminated entries (lower triangle)
        let mut max_val = 0.0_f64;
        let mut max_row = k;
        let mut max_col = k;
        for j in k..n {
            for i in j..n {
                let v = a[(i, j)].abs();
                if v > max_val {
                    max_val = v;
                    max_row = i;
                    max_col = j;
                }
            }
        }

        // 2. Singularity check
        if max_val < small {
            // Mark all remaining as zero pivots (delayed)
            for &orig_col in &col_order[k..n] {
                stats.num_delayed += 1;
                pivot_log.push(AptpPivotRecord {
                    col: orig_col,
                    pivot_type: PivotType::Delayed,
                    max_l_entry: 0.0,
                    was_fallback: false,
                });
            }
            break;
        }

        // 3. Decide pivot type
        if max_row == max_col {
            // Maximum is on diagonal → 1×1 pivot
            // Swap max_row to position k
            if max_row != k {
                swap_symmetric(a.rb_mut(), k, max_row);
                col_order.swap(k, max_row);
            }

            let d_kk = a[(k, k)];
            let inv_d = 1.0 / d_kk;

            // Compute L column (scale by 1/d), tracking the worst entry for stats
            let mut max_l = 0.0_f64;
            for i in (k + 1)..n {
                let l_ik = a[(i, k)] * inv_d;
                a[(i, k)] = l_ik;
                let abs_l = l_ik.abs();
                if abs_l > max_l {
                    max_l = abs_l;
                }
            }

            // Schur complement update
            update_schur_1x1(a.rb_mut(), k, d_kk);

            d.set_1x1(k, d_kk);
            stats.num_1x1 += 1;
            if max_l > stats.max_l_entry {
                stats.max_l_entry = max_l;
            }
            pivot_log.push(AptpPivotRecord {
                col: col_order[k],
                pivot_type: PivotType::OneByOne,
                max_l_entry: max_l,
                was_fallback: false,
            });
            k += 1;
        } else {
            // Maximum is off-diagonal at (max_row, max_col)
            // t = max_row, m = max_col (in paper's notation)
            let t = max_row;
            let m = max_col;

            // Compute Δ = a[m,m] * a[t,t] - a[t,m]^2
            // Need current values (before any swap)
            let a_mm = a[(m, m)];
            let a_tt = a[(t, t)];
            let a_tm = a[(t, m)]; // lower triangle: t > m
            let delta = a_mm * a_tt - a_tm * a_tm;

            if remaining < 2 {
                // Only one column left, must use 1×1 (shouldn't normally reach here
                // since max_row == max_col for single element, but guard anyway)
                let d_kk = a[(k, k)];
                if d_kk.abs() < small {
                    stats.num_delayed += 1;
                    pivot_log.push(AptpPivotRecord {
                        col: col_order[k],
                        pivot_type: PivotType::Delayed,
                        max_l_entry: 0.0,
                        was_fallback: false,
                    });
                    break;
                }
                let inv_d = 1.0 / d_kk;
                for i in (k + 1)..n {
                    a[(i, k)] *= inv_d;
                }
                d.set_1x1(k, d_kk);
                stats.num_1x1 += 1;
                k += 1;
                continue;
            }

            // Bunch-Kaufman determinant condition: |Δ| ≥ α |a_tm|²
            if delta.abs() >= BUNCH_KAUFMAN_ALPHA * a_tm * a_tm {
                // 2×2 pivot using (t, m)
                // Swap m → k and t → k+1
                if m != k {
                    swap_symmetric(a.rb_mut(), k, m);
                    col_order.swap(k, m);
                }
                // After swap: the row that was 't' may have moved
                // (t is only affected by the k↔m swap if it coincided with k)
                let new_t = if t == k { m } else { t };
                if new_t != k + 1 {
                    swap_symmetric(a.rb_mut(), k + 1, new_t);
                    col_order.swap(k + 1, new_t);
                }

                // Re-read the 2×2 pivot block at its post-swap location
                let a11 = a[(k, k)];
                let a22 = a[(k + 1, k + 1)];
                let a21 = a[(k + 1, k)];
                let det = a11 * a22 - a21 * a21;
                let inv_det = 1.0 / det;

                let block = Block2x2 {
                    first_col: k,
                    a: a11,
                    b: a21,
                    c: a22,
                };

                // Compute L columns for 2×2 pivot: L = A[:, k..k+2] * inv(D)
                let mut max_l = 0.0_f64;
                let rows_start = k + 2;
                for i in rows_start..n {
                    let ai1 = a[(i, k)];
                    let ai2 = a[(i, k + 1)];
                    let l_i1 = (ai1 * a22 - ai2 * a21) * inv_det;
                    let l_i2 = (ai2 * a11 - ai1 * a21) * inv_det;
                    a[(i, k)] = l_i1;
                    a[(i, k + 1)] = l_i2;
                    if l_i1.abs() > max_l {
                        max_l = l_i1.abs();
                    }
                    if l_i2.abs() > max_l {
                        max_l = l_i2.abs();
                    }
                }

                // Schur complement update for 2×2
                update_schur_2x2(a.rb_mut(), k, k + 1, &block);

                d.set_2x2(block);
                stats.num_2x2 += 1;
                if max_l > stats.max_l_entry {
                    stats.max_l_entry = max_l;
                }
                // Log both columns of the pair, each naming the other as partner
                pivot_log.push(AptpPivotRecord {
                    col: col_order[k],
                    pivot_type: PivotType::TwoByTwo {
                        partner: col_order[k + 1],
                    },
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                pivot_log.push(AptpPivotRecord {
                    col: col_order[k + 1],
                    pivot_type: PivotType::TwoByTwo {
                        partner: col_order[k],
                    },
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                k += 2;
            } else {
                // Failed 2×2 determinant condition → use 1×1 on max(|a_mm|, |a_tt|)
                let pivot_pos = if a_mm.abs() >= a_tt.abs() { m } else { t };
                if pivot_pos != k {
                    swap_symmetric(a.rb_mut(), k, pivot_pos);
                    col_order.swap(k, pivot_pos);
                }

                let d_kk = a[(k, k)];
                let inv_d = 1.0 / d_kk;

                let mut max_l = 0.0_f64;
                for i in (k + 1)..n {
                    let l_ik = a[(i, k)] * inv_d;
                    a[(i, k)] = l_ik;
                    let abs_l = l_ik.abs();
                    if abs_l > max_l {
                        max_l = abs_l;
                    }
                }

                update_schur_1x1(a.rb_mut(), k, d_kk);

                d.set_1x1(k, d_kk);
                stats.num_1x1 += 1;
                if max_l > stats.max_l_entry {
                    stats.max_l_entry = max_l;
                }
                pivot_log.push(AptpPivotRecord {
                    col: col_order[k],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: max_l,
                    was_fallback: true, // fell back from 2×2
                });
                k += 1;
            }
        }
    }

    let num_eliminated = k;

    d.truncate(num_eliminated);

    let delayed_cols: Vec<usize> = (num_eliminated..n).map(|i| col_order[i]).collect();

    AptpFactorResult {
        d,
        perm: col_order,
        num_eliminated,
        delayed_cols,
        stats,
        pivot_log,
    }
}
865
866// ---------------------------------------------------------------------------
867// Two-level APTP: BLAS-3 building blocks
868// ---------------------------------------------------------------------------
869
870/// Factor an ib×ib diagonal block using complete pivoting (Algorithm 4.1).
871///
872/// Performs complete pivoting on the diagonal block `a[col_start..col_start+block_size,
873/// col_start..col_start+block_size]`, but symmetric swaps affect ALL m rows of the
874/// matrix (keeping row ordering consistent for the panel below).
875///
876/// L entries are computed ONLY for rows within the block (col_start..col_start+block_size).
877/// Schur complement updates are ONLY within the block. The panel rows below are NOT
878/// modified except by symmetric swaps.
879///
880/// Complete pivoting guarantees |L_ij| ≤ 4 (growth factor for u=0.25).
881///
882/// # Returns
883/// (block_d, local_perm, nelim, stats, pivot_log) where:
884/// - `block_d`: MixedDiagonal of dimension block_size with D entries
885/// - `local_perm`: permutation applied within positions col_start..col_start+block_size
886///   (values are offsets from col_start)
887/// - `nelim`: number of successfully eliminated columns (may be < block_size if singular)
888/// - `stats`: pivot statistics for this block
889/// - `pivot_log`: per-pivot diagnostic records
890// SPRAL Equivalent: `block_ldlt()` in `spral/src/ssids/cpu/kernels/block_ldlt.hxx` (BSD-3).
fn factor_block_diagonal(
    mut a: MatMut<'_, f64>,
    col_start: usize,
    block_size: usize,
    small: f64,
    row_limit: usize,
) -> (
    MixedDiagonal,
    Vec<usize>,
    usize,
    AptpStatistics,
    Vec<AptpPivotRecord>,
) {
    // Absolute column index one past the end of the diagonal block.
    let block_end = col_start + block_size;

    // local_perm[k] = pre-permutation offset (from col_start) of the column
    // currently sitting at position col_start + k.
    let mut local_perm: Vec<usize> = (0..block_size).collect();
    let mut d = MixedDiagonal::new(block_size);
    let mut stats = AptpStatistics::default();
    let mut pivot_log = Vec::with_capacity(block_size);
    let mut k = 0; // offset within block (next column to eliminate)

    while k < block_size {
        let cur = col_start + k; // absolute position
        let search_end = block_end;
        let remaining = block_size - k;

        // 1. Find max |a[i,j]| in remaining diagonal sub-block [cur..search_end, cur..search_end].
        //    Only the lower triangle (i >= j) is scanned — the matrix is stored
        //    symmetric lower. This is the complete-pivoting search.
        let mut max_val = 0.0_f64;
        let mut max_row = cur;
        let mut max_col = cur;
        for j in cur..search_end {
            for i in j..search_end {
                let v = a[(i, j)].abs();
                if v > max_val {
                    max_val = v;
                    max_row = i;
                    max_col = j;
                }
            }
        }

        // 2. Singularity check: no entry in the remaining sub-block exceeds
        //    `small`, so no stable pivot exists here — delay every remaining
        //    column (they will be retried higher up the assembly tree).
        if max_val < small {
            // All remaining entries in block are near-zero
            stats.num_delayed += remaining;
            for &perm_val in &local_perm[k..block_size] {
                pivot_log.push(AptpPivotRecord {
                    col: perm_val,
                    pivot_type: PivotType::Delayed,
                    max_l_entry: 0.0,
                    was_fallback: false,
                });
            }
            break;
        }

        // 3. Decide pivot type
        if max_row == max_col {
            // Diagonal maximum → 1×1 pivot
            if max_row != cur {
                // Bring the pivot to position `cur`. The symmetric swap touches
                // rows up to `row_limit` so the panel below stays consistent.
                swap_symmetric_block(a.rb_mut(), cur, max_row, col_start, row_limit);
                local_perm.swap(k, max_row - col_start);
            }

            let d_kk = a[(cur, cur)];
            let inv_d = 1.0 / d_kk;

            // Compute L column ONLY within block
            let mut max_l = 0.0_f64;
            for i in (cur + 1)..block_end {
                let l_ik = a[(i, cur)] * inv_d;
                a[(i, cur)] = l_ik;
                let abs_l = l_ik.abs();
                if abs_l > max_l {
                    max_l = abs_l;
                }
            }

            // Rank-1 Schur complement update ONLY within block (lower triangle).
            for j in (cur + 1)..block_end {
                let l_j = a[(j, cur)];
                let ldl_j = l_j * d_kk;
                for i in j..block_end {
                    a[(i, j)] -= a[(i, cur)] * ldl_j;
                }
            }

            d.set_1x1(k, d_kk);
            stats.num_1x1 += 1;
            if max_l > stats.max_l_entry {
                stats.max_l_entry = max_l;
            }
            pivot_log.push(AptpPivotRecord {
                col: local_perm[k],
                pivot_type: PivotType::OneByOne,
                max_l_entry: max_l,
                was_fallback: false,
            });
            k += 1;
        } else {
            // Off-diagonal maximum → try 2×2 pivot on rows/cols (m_idx, t).
            // Search visited i >= j only and i != j here, so t > m_idx >= cur.
            let t = max_row; // absolute positions
            let m_idx = max_col;

            let a_mm = a[(m_idx, m_idx)];
            let a_tt = a[(t, t)];
            let a_tm = a[(t, m_idx)]; // lower triangle: t > m_idx
            let delta = a_mm * a_tt - a_tm * a_tm;

            if remaining < 2 {
                // Only one column left, use 1×1 on max diagonal
                // NOTE(review): with remaining == 1 the pivot search visits only
                // the single diagonal entry, so max_row == max_col and this
                // branch appears unreachable — kept as a defensive fallback.
                // It also skips the pivot_log push and max_l_entry update that
                // the other accept paths perform; confirm that is intended.
                let pivot_pos = if a_mm.abs() >= a_tt.abs() { m_idx } else { t };
                if pivot_pos != cur {
                    swap_symmetric_block(a.rb_mut(), cur, pivot_pos, col_start, row_limit);
                    local_perm.swap(k, pivot_pos - col_start);
                }
                let d_kk = a[(cur, cur)];
                if d_kk.abs() < small {
                    // Near-zero diagonal: delay this final column.
                    stats.num_delayed += 1;
                    pivot_log.push(AptpPivotRecord {
                        col: local_perm[k],
                        pivot_type: PivotType::Delayed,
                        max_l_entry: 0.0,
                        was_fallback: false,
                    });
                    break;
                }
                let inv_d = 1.0 / d_kk;
                for i in (cur + 1)..block_end {
                    a[(i, cur)] *= inv_d;
                }
                d.set_1x1(k, d_kk);
                stats.num_1x1 += 1;
                k += 1;
                continue;
            }

            // Bunch–Kaufman-style determinant test: accept the candidate 2×2
            // pivot [[a_mm, a_tm], [a_tm, a_tt]] only when its determinant is
            // large relative to the off-diagonal entry squared.
            if delta.abs() >= BUNCH_KAUFMAN_ALPHA * a_tm * a_tm {
                // 2×2 pivot: swap m_idx → cur, t → cur+1
                if m_idx != cur {
                    swap_symmetric_block(a.rb_mut(), cur, m_idx, col_start, row_limit);
                    local_perm.swap(k, m_idx - col_start);
                }
                // If t occupied `cur` before the first swap it now lives at
                // m_idx (defensive: t > m_idx >= cur makes t == cur unlikely).
                let new_t = if t == cur { m_idx } else { t };
                if new_t != cur + 1 {
                    swap_symmetric_block(a.rb_mut(), cur + 1, new_t, col_start, row_limit);
                    local_perm.swap(k + 1, new_t - col_start);
                }

                // Re-read pivot entries AFTER the swaps placed them at
                // (cur, cur+1).
                let a11 = a[(cur, cur)];
                let a22 = a[(cur + 1, cur + 1)];
                let a21 = a[(cur + 1, cur)];
                let det = a11 * a22 - a21 * a21;
                let inv_det = 1.0 / det;

                let block = Block2x2 {
                    first_col: k,
                    a: a11,
                    b: a21,
                    c: a22,
                };

                // Compute L columns ONLY within block:
                // [l_i1, l_i2] = [a_i1, a_i2] * inv([[a11, a21], [a21, a22]]).
                let mut max_l = 0.0_f64;
                let rows_start = cur + 2;
                for i in rows_start..block_end {
                    let ai1 = a[(i, cur)];
                    let ai2 = a[(i, cur + 1)];
                    let l_i1 = (ai1 * a22 - ai2 * a21) * inv_det;
                    let l_i2 = (ai2 * a11 - ai1 * a21) * inv_det;
                    a[(i, cur)] = l_i1;
                    a[(i, cur + 1)] = l_i2;
                    if l_i1.abs() > max_l {
                        max_l = l_i1.abs();
                    }
                    if l_i2.abs() > max_l {
                        max_l = l_i2.abs();
                    }
                }

                // Rank-2 Schur complement update ONLY within block:
                // A -= L * D * L^T with w = L * D formed row by row.
                for j in rows_start..block_end {
                    let l1j = a[(j, cur)];
                    let l2j = a[(j, cur + 1)];
                    let w_j1 = l1j * a11 + l2j * a21;
                    let w_j2 = l1j * a21 + l2j * a22;
                    for i in j..block_end {
                        let l1i = a[(i, cur)];
                        let l2i = a[(i, cur + 1)];
                        a[(i, j)] -= l1i * w_j1 + l2i * w_j2;
                    }
                }

                d.set_2x2(block);
                stats.num_2x2 += 1;
                if max_l > stats.max_l_entry {
                    stats.max_l_entry = max_l;
                }
                // Both halves of the 2×2 pair get a log record, each naming
                // the other as partner.
                pivot_log.push(AptpPivotRecord {
                    col: local_perm[k],
                    pivot_type: PivotType::TwoByTwo {
                        partner: local_perm[k + 1],
                    },
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                pivot_log.push(AptpPivotRecord {
                    col: local_perm[k + 1],
                    pivot_type: PivotType::TwoByTwo {
                        partner: local_perm[k],
                    },
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                k += 2;
            } else {
                // Failed Δ → 1×1 on max diagonal
                let pivot_pos = if a_mm.abs() >= a_tt.abs() { m_idx } else { t };
                if pivot_pos != cur {
                    swap_symmetric_block(a.rb_mut(), cur, pivot_pos, col_start, row_limit);
                    local_perm.swap(k, pivot_pos - col_start);
                }

                let d_kk = a[(cur, cur)];
                let inv_d = 1.0 / d_kk;

                // Compute L column ONLY within block.
                let mut max_l = 0.0_f64;
                for i in (cur + 1)..block_end {
                    let l_ik = a[(i, cur)] * inv_d;
                    a[(i, cur)] = l_ik;
                    let abs_l = l_ik.abs();
                    if abs_l > max_l {
                        max_l = abs_l;
                    }
                }

                // Rank-1 Schur complement update ONLY within block.
                for j in (cur + 1)..block_end {
                    let l_j = a[(j, cur)];
                    let ldl_j = l_j * d_kk;
                    for i in j..block_end {
                        a[(i, j)] -= a[(i, cur)] * ldl_j;
                    }
                }

                d.set_1x1(k, d_kk);
                stats.num_1x1 += 1;
                if max_l > stats.max_l_entry {
                    stats.max_l_entry = max_l;
                }
                pivot_log.push(AptpPivotRecord {
                    col: local_perm[k],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: max_l,
                    was_fallback: true,
                });
                k += 1;
            }
        }
    }

    // Columns 0..k were eliminated; the rest (if any) were delayed.
    let nelim = k;
    (d, local_perm, nelim, stats, pivot_log)
}
1154
1155/// Adjust effective_nelim to avoid splitting a 2×2 pivot across block boundaries.
1156///
1157/// If the last accepted pivot at position `effective_nelim - 1` is the first half
1158/// of a 2×2 pair whose partner is beyond `effective_nelim`, decrement by 1.
1159// SPRAL Equivalent: `Column::adjust()` in `spral/src/ssids/cpu/kernels/ldlt_app.cxx:112-127` (BSD-3).
1160fn adjust_for_2x2_boundary(effective_nelim: usize, d: &MixedDiagonal) -> usize {
1161    if effective_nelim == 0 {
1162        return 0;
1163    }
1164    let last = effective_nelim - 1;
1165    match d.pivot_type(last) {
1166        PivotType::TwoByTwo { partner } if partner > last => {
1167            // Last accepted is the first column of a 2×2 whose partner is beyond nelim
1168            effective_nelim - 1
1169        }
1170        _ => effective_nelim,
1171    }
1172}
1173
/// Per-block backup for the two-level APTP algorithm.
///
/// Stores a copy of matrix entries for one outer block column,
/// enabling restore when the a posteriori check reduces nelim.
///
/// Used by the BLAS-3 `factor_inner` to save the block column before
/// factoring, so it can be restored if the Apply step's threshold check
/// reduces nelim.
// SPRAL Equivalent: `CopyBackup<T>` in `spral/src/ssids/cpu/kernels/ldlt_app.cxx` (BSD-3).
struct BlockBackup<'a> {
    // Snapshot view into a caller-provided buffer; (row 0, col 0) of `data`
    // corresponds to matrix position (col_start, col_start).
    data: MatMut<'a, f64>,
}
1186
1187impl<'a> BlockBackup<'a> {
1188    /// Create a backup of the block column starting at `col_start` with `block_cols` columns.
1189    /// Backs up `a[col_start.., col_start..col_start+block_cols]` into the provided buffer.
1190    fn create(
1191        a: MatRef<'_, f64>,
1192        col_start: usize,
1193        block_cols: usize,
1194        m: usize,
1195        buf: &'a mut Mat<f64>,
1196    ) -> Self {
1197        let rows = m - col_start;
1198        let mut data = buf.as_mut().submatrix_mut(0, 0, rows, block_cols);
1199        for j in 0..block_cols {
1200            for i in 0..rows {
1201                data[(i, j)] = a[(col_start + i, col_start + j)];
1202            }
1203        }
1204        BlockBackup { data }
1205    }
1206
1207    /// Restore the failed portion of the block (columns nelim..block_cols) from backup.
1208    /// Columns 0..nelim are left untouched (successfully factored).
1209    fn restore_failed(
1210        &self,
1211        mut a: MatMut<'_, f64>,
1212        col_start: usize,
1213        nelim: usize,
1214        block_cols: usize,
1215        m: usize,
1216    ) {
1217        let rows = m - col_start;
1218        for j in nelim..block_cols {
1219            for i in 0..rows {
1220                a[(col_start + i, col_start + j)] = self.data[(i, j)];
1221            }
1222        }
1223    }
1224
1225    /// Restore failed columns of the diagonal block from pre-factor backup,
1226    /// applying the block permutation to read from the correct backup positions.
1227    ///
1228    /// The backup was taken BEFORE factor_block_diagonal permuted columns.
1229    /// factor_block_diagonal applied symmetric swaps described by `block_perm`,
1230    /// so to restore failed column j (in post-perm ordering), we must read
1231    /// from backup position perm[j] (pre-perm ordering).
1232    ///
1233    /// Restores two regions:
1234    /// 1. Diagonal block: a[k+e..k+bs, k+e..k+bs] — symmetric with perm
1235    /// 2. Panel below: a[k+bs..m, k+e..k+bs] — column perm only
1236    // SPRAL Equivalent: `CopyBackup::restore_part_with_sym_perm` (`ldlt_app.cxx:562-574`) (BSD-3).
1237    fn restore_diagonal_with_perm(
1238        &self,
1239        mut a: MatMut<'_, f64>,
1240        col_start: usize,
1241        nelim: usize,
1242        block_cols: usize,
1243        m: usize,
1244        block_perm: &[usize],
1245    ) {
1246        // Region 1: Diagonal block — restore a[k+e..k+bs, k+e..k+bs]
1247        // with symmetric permutation from backup.
1248        // backup[r, c] stored with r >= c (lower triangle).
1249        // In backup coordinates: row i, col j.
1250        // a[(col_start+i, col_start+j)] = backup[(max(r,c), min(r,c))]
1251        for j in nelim..block_cols {
1252            let c = block_perm[j]; // pre-perm column
1253            for i in nelim..block_cols {
1254                let r = block_perm[i]; // pre-perm row
1255                // Read from lower triangle of backup (row >= col)
1256                let val = if r >= c {
1257                    self.data[(r, c)]
1258                } else {
1259                    self.data[(c, r)]
1260                };
1261                a[(col_start + i, col_start + j)] = val;
1262            }
1263        }
1264
1265        // Region 2: Panel below diagonal block — restore a[k+bs..m, k+e..k+bs]
1266        // Only column permutation applies (panel rows were permuted by
1267        // apply_cperm step, but we need original values at permuted column).
1268        // a[(col_start+i, col_start+j)] = backup[(i, c)]
1269        for j in nelim..block_cols {
1270            let c = block_perm[j]; // pre-perm column
1271            for i in block_cols..(m - col_start) {
1272                a[(col_start + i, col_start + j)] = self.data[(i, c)];
1273            }
1274        }
1275    }
1276}
1277
1278/// Apply factored L11/D11 to the panel below the diagonal block (TRSM),
1279/// then perform a posteriori threshold check on all L21 entries.
1280///
1281/// Given a factored diagonal block at `a[col_start..col_start+block_nelim, ...]`
1282/// with L11 (unit lower triangular) and D11, computes:
1283///   L21 = A21 * (L11 * D11)^{-T}
1284/// and checks that all |L21[i,j]| < 1/threshold.
1285///
1286/// Returns the effective nelim (<= block_nelim): the number of columns
1287/// whose L entries all satisfy the threshold bound.
1288///
1289/// # Algorithm
1290/// 1. Solve: X = A21 * L11^{-T} via triangular solve (TRSM)
1291/// 2. Scale: L21[i,j] = X[i,j] / D[j,j] for 1×1 pivots,
1292///    or L21[i,k:k+1] via 2×2 inversion for 2×2 pivots
1293/// 3. Scan L21 column-by-column; find first column where any entry exceeds 1/threshold
1294#[allow(clippy::too_many_arguments)]
fn apply_and_check(
    mut a: MatMut<'_, f64>,
    col_start: usize,
    block_nelim: usize,
    block_cols: usize,
    m: usize,
    d: &MixedDiagonal,
    threshold: f64,
    par: Par,
    l11_buf: &mut Mat<f64>,
) -> usize {
    // Nothing was eliminated in the diagonal block → nothing to apply or check.
    if block_nelim == 0 {
        return 0;
    }

    let panel_rows = m - col_start - block_cols;
    // No panel below the block → there are no L21 entries to bound, so every
    // eliminated pivot trivially passes the threshold check.
    if panel_rows == 0 {
        return block_nelim;
    }

    // Step 1: TRSM — solve panel * L11^T = A21 for panel (= L21)
    // L11 is unit lower triangular in a[col_start..+block_nelim, col_start..+block_nelim]
    // panel is a[panel_start..m, col_start..col_start+block_nelim]
    //
    // Transposing: L11 * panel^T = A21^T, i.e. unit lower triangular solve on panel^T.
    // Copy L11 to a temporary to avoid aliasing (L11 and panel overlap in `a`).

    let panel_start = col_start + block_cols;

    // Copy L11 into workspace buffer to avoid aliasing
    {
        let src = a
            .rb()
            .submatrix(col_start, col_start, block_nelim, block_nelim);
        let mut dst = l11_buf
            .as_mut()
            .submatrix_mut(0, 0, block_nelim, block_nelim);
        dst.copy_from(src);
    }
    let l11_ref = l11_buf.as_ref().submatrix(0, 0, block_nelim, block_nelim);
    let panel = a
        .rb_mut()
        .submatrix_mut(panel_start, col_start, panel_rows, block_nelim);
    triangular_solve::solve_unit_lower_triangular_in_place(l11_ref, panel.transpose_mut(), par);

    // Step 2: Scale by D^{-1}
    // For 1×1 pivot at column j: L21[:, j] /= D[j]
    // For 2×2 pivot at columns (j, j+1): solve the 2×2 system
    let mut col = 0;
    while col < block_nelim {
        match d.pivot_type(col) {
            PivotType::OneByOne => {
                let d_val = d.diagonal_1x1(col);
                let inv_d = 1.0 / d_val;
                for i in 0..panel_rows {
                    a[(panel_start + i, col_start + col)] *= inv_d;
                }
                col += 1;
            }
            PivotType::TwoByTwo { partner } if partner > col => {
                // Apply the explicit 2×2 inverse (adjugate / det) to both
                // columns of the pair at once.
                let block = d.diagonal_2x2(col);
                let det = block.a * block.c - block.b * block.b;
                let inv_det = 1.0 / det;
                for i in 0..panel_rows {
                    let x1 = a[(panel_start + i, col_start + col)];
                    let x2 = a[(panel_start + i, col_start + col + 1)];
                    a[(panel_start + i, col_start + col)] = (x1 * block.c - x2 * block.b) * inv_det;
                    a[(panel_start + i, col_start + col + 1)] =
                        (x2 * block.a - x1 * block.b) * inv_det;
                }
                col += 2;
            }
            _ => {
                // Second half of a 2×2 pair (already scaled with its partner);
                // a Delayed type is not expected within 0..block_nelim — TODO confirm.
                col += 1;
            }
        }
    }

    // Step 3: Threshold scan — find first failing column.
    // Stability requires |L21| strictly below 1/u, so `>=` marks failure.
    let stability_bound = 1.0 / threshold;
    let mut effective_nelim = block_nelim;

    let mut scan_col = 0;
    while scan_col < block_nelim {
        // 2×2 pairs pass or fail as a unit, so scan both columns together.
        let pivot_width = match d.pivot_type(scan_col) {
            PivotType::TwoByTwo { partner } if partner > scan_col => 2,
            _ => 1,
        };

        let mut col_ok = true;
        for c in scan_col..scan_col + pivot_width {
            if c >= block_nelim {
                break;
            }
            for i in 0..panel_rows {
                if a[(panel_start + i, col_start + c)].abs() >= stability_bound {
                    col_ok = false;
                    break;
                }
            }
            if !col_ok {
                break;
            }
        }

        if !col_ok {
            // Everything from the first failing pivot onward is rejected.
            effective_nelim = scan_col;
            break;
        }
        scan_col += pivot_width;
    }

    effective_nelim
}
1409
1410/// Rank-nelim Schur complement update on the trailing submatrix via GEMM.
1411///
1412/// Computes: A[trailing, trailing] -= L21 * D11 * L21^T
1413/// where L21 is the panel at a[panel_start..m, col_start..col_start+nelim]
1414/// and D11 is the block diagonal from the Factor phase.
1415///
1416/// Uses explicit W = L21 * D11 workspace, then A -= W * L21^T (GEMM).
1417#[allow(clippy::too_many_arguments)]
fn update_trailing(
    mut a: MatMut<'_, f64>,
    col_start: usize,
    nelim: usize,
    block_cols: usize,
    m: usize,
    num_fully_summed: usize,
    d: &MixedDiagonal,
    par: Par,
    ld_buf: &mut Mat<f64>,
    copy_buf: &mut Mat<f64>,
) {
    // No columns eliminated → nothing to update.
    if nelim == 0 {
        return;
    }

    let trailing_start = col_start + block_cols;
    let trailing_size = m - trailing_start;
    // Block column reaches the bottom of the matrix → no trailing region.
    if trailing_size == 0 {
        return;
    }

    // p = num_fully_summed boundary: rows [trailing_start..p] are FS, [p..m] are NFS.
    let p = num_fully_summed;

    // Compute W = L21 * D11 using workspace buffer (trailing_size × nelim subview).
    // We compute W for ALL trailing rows (FS + NFS) since the FS×FS region (region 1)
    // and the NFS×FS cross-term (region 2) both need it.
    let mut w = ld_buf.as_mut().submatrix_mut(0, 0, trailing_size, nelim);
    let mut col = 0;
    while col < nelim {
        match d.pivot_type(col) {
            PivotType::OneByOne => {
                // W[:, col] = L21[:, col] * d
                let d_val = d.diagonal_1x1(col);
                for i in 0..trailing_size {
                    w[(i, col)] = a[(trailing_start + i, col_start + col)] * d_val;
                }
                col += 1;
            }
            PivotType::TwoByTwo { partner } if partner > col => {
                // W[:, col..col+2] = L21[:, col..col+2] * [[a, b], [b, c]]
                let block = d.diagonal_2x2(col);
                for i in 0..trailing_size {
                    let l1 = a[(trailing_start + i, col_start + col)];
                    let l2 = a[(trailing_start + i, col_start + col + 1)];
                    w[(i, col)] = l1 * block.a + l2 * block.b;
                    w[(i, col + 1)] = l1 * block.b + l2 * block.c;
                }
                col += 2;
            }
            _ => {
                // Second half of a 2×2 pair (already written with its partner);
                // Delayed is not expected inside 0..nelim — TODO confirm.
                col += 1;
            }
        }
    }

    // Copy L21 into workspace buffer to avoid borrow conflict (L21 and A22 overlap in `a`)
    {
        let src = a
            .rb()
            .submatrix(trailing_start, col_start, trailing_size, nelim);
        let mut dst = copy_buf.as_mut().submatrix_mut(0, 0, trailing_size, nelim);
        dst.copy_from(src);
    }

    // Region 1 (FS×FS): Lower-triangular GEMM on A[ts..p, ts..p]
    // A_fs -= W_fs * L21_fs^T, writing the lower triangle only.
    let fs_size = p.saturating_sub(trailing_start);
    if fs_size > 0 {
        let w_fs = ld_buf.as_ref().submatrix(0, 0, fs_size, nelim);
        let l21_fs = copy_buf.as_ref().submatrix(0, 0, fs_size, nelim);
        let mut a_fs = a
            .rb_mut()
            .submatrix_mut(trailing_start, trailing_start, fs_size, fs_size);

        tri_matmul::matmul_with_conj(
            a_fs.rb_mut(),
            BlockStructure::TriangularLower,
            Accum::Add,
            w_fs,
            BlockStructure::Rectangular,
            Conj::No,
            l21_fs.transpose(),
            BlockStructure::Rectangular,
            Conj::No,
            -1.0,
            par,
        );
    }

    // Region 2 (NFS×FS cross-term): Rectangular GEMM on A[p..m, ts..p]
    // A_cross -= W_nfs * L21_fs^T; W rows fs_size.. correspond to matrix rows p..m.
    let nfs_size = m - p;
    if nfs_size > 0 && fs_size > 0 {
        let w_nfs = ld_buf.as_ref().submatrix(fs_size, 0, nfs_size, nelim);
        let l21_fs = copy_buf.as_ref().submatrix(0, 0, fs_size, nelim);
        let mut a_cross = a.submatrix_mut(p, trailing_start, nfs_size, fs_size);

        faer::linalg::matmul::matmul_with_conj(
            a_cross.rb_mut(),
            Accum::Add,
            w_nfs,
            Conj::No,
            l21_fs.transpose(),
            Conj::No,
            -1.0,
            par,
        );
    }

    // Region 3 (NFS×NFS): SKIPPED — deferred to compute_contribution_gemm
}
1527
1528/// Compute W = L * D for a set of rows, where L and D come from the factored block.
1529///
1530/// For 1×1 pivots: W[i, col] = L[i, col] * D[col]
1531/// For 2×2 pivots: W[i, col] = L[i,col]*D[col,col] + L[i,col+1]*D[col+1,col]
1532///                 W[i, col+1] = L[i,col]*D[col,col+1] + L[i,col+1]*D[col+1,col+1]
1533///
1534// SPRAL Equivalent: `calcLD<OP_N>` in `spral/src/ssids/cpu/kernels/calc_ld.hxx:41+` (BSD-3).
1535/// Writes into `dst[0..nrows, 0..nelim]`. The destination is overwritten, not zeroed first.
1536fn compute_ld_into(l: MatRef<'_, f64>, d: &MixedDiagonal, nelim: usize, mut dst: MatMut<'_, f64>) {
1537    let nrows = l.nrows();
1538    let mut col = 0;
1539    while col < nelim {
1540        match d.pivot_type(col) {
1541            PivotType::OneByOne => {
1542                let d_val = d.diagonal_1x1(col);
1543                for i in 0..nrows {
1544                    dst[(i, col)] = l[(i, col)] * d_val;
1545                }
1546                col += 1;
1547            }
1548            PivotType::TwoByTwo { partner } if partner > col => {
1549                let block = d.diagonal_2x2(col);
1550                for i in 0..nrows {
1551                    let l1 = l[(i, col)];
1552                    let l2 = l[(i, col + 1)];
1553                    dst[(i, col)] = l1 * block.a + l2 * block.b;
1554                    dst[(i, col + 1)] = l1 * block.b + l2 * block.c;
1555                }
1556                col += 2;
1557            }
1558            _ => {
1559                col += 1;
1560            }
1561        }
1562    }
1563}
1564
/// Compute the NFS×NFS Schur complement via a single deferred GEMM.
///
/// Called after the BLAS-3 blocking loop (which skips the NFS×NFS region).
/// Copies the assembled NFS×NFS values from `frontal_data[p..m, p..m]` into
/// `contrib_buffer`, then applies the rank-`ne` symmetric update in-place:
///
/// ```text
/// contrib_buffer[0..nfs, 0..nfs] = assembled[NFS×NFS] - L21_NFS * D * L21_NFS^T
/// ```
///
/// where `L21_NFS = frontal_data[p..m, 0..ne]` and `D` is the accumulated
/// diagonal from the blocking loop. The output is the lower triangle of the
/// Schur complement.
///
/// # Guard conditions
///
/// - `nfs == 0`: no contribution (root or fully eliminated), returns immediately
/// - `ne == 0`: no rank update (copy only — all columns were delayed)
///
/// # References
///
/// - Duff, Hogg & Lopez (2020), Section 3: deferred Schur complement
/// - Liu (1992), Section 4: Schur complement in multifrontal method
// SPRAL Equivalent: `host_gemm` writing to `node.contrib` in `factor.hxx:92-103` (BSD-3).
#[allow(clippy::too_many_arguments)]
pub(crate) fn compute_contribution_gemm(
    frontal_data: &Mat<f64>,
    num_fully_summed: usize,
    num_eliminated: usize,
    m: usize,
    d: &MixedDiagonal,
    contrib_buffer: &mut Mat<f64>,
    ld_workspace: &mut Mat<f64>,
    par: Par,
) {
    let p = num_fully_summed;
    let ne = num_eliminated;
    let nfs = m - p;

    if nfs == 0 {
        return; // No contribution block (root or fully eliminated)
    }

    // Step 1: Copy assembled NFS×NFS from frontal_data[p..m, p..m] into
    //         contrib_buffer[0..nfs, 0..nfs] (lower triangle only).
    //
    // This copy is unavoidable: assembly scatters into frontal_data, but the
    // GEMM output goes to contrib_buffer. A future optimization could scatter
    // NFS entries directly into the contribution buffer during assembly,
    // eliminating this copy at the cost of a larger architectural change.
    for j in 0..nfs {
        let col_len = nfs - j;
        // Source: rows p+j..m of column p+j; destination: rows j..nfs of column j.
        let src = &frontal_data.col_as_slice(p + j)[p + j..m];
        contrib_buffer.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
    }

    if ne == 0 {
        return; // No rank update — all columns were delayed, copy only
    }

    // Step 2: Compute W = L21_NFS * D (nfs × ne)
    // L21_NFS = frontal_data[p..m, 0..ne]
    let l21_nfs = frontal_data.as_ref().submatrix(p, 0, nfs, ne);

    // Use caller-provided workspace to avoid per-supernode allocation.
    // Resize if needed (rare: delayed cascades may exceed initial estimate).
    if ld_workspace.nrows() < nfs || ld_workspace.ncols() < ne {
        *ld_workspace = Mat::zeros(nfs.max(ld_workspace.nrows()), ne.max(ld_workspace.ncols()));
    }
    let mut w = ld_workspace.as_mut().submatrix_mut(0, 0, nfs, ne);
    // Clear the subview: the workspace may hold stale values from a previous
    // supernode (Mat::zeros only covers the fresh-allocation path).
    w.fill(0.0);
    compute_ld_into(l21_nfs, d, ne, w.rb_mut());

    // Step 3: Symmetric rank-ne update: contrib -= W * L21_NFS^T (lower triangle)
    tri_matmul::matmul_with_conj(
        contrib_buffer.as_mut().submatrix_mut(0, 0, nfs, nfs),
        BlockStructure::TriangularLower,
        Accum::Add,
        w.as_ref(),
        BlockStructure::Rectangular,
        Conj::No,
        l21_nfs.transpose(),
        BlockStructure::Rectangular,
        Conj::No,
        -1.0,
        par,
    );
}
1653
1654/// Apply Schur complement updates from passed columns to failed and trailing regions.
1655///
1656/// After factoring `nelim` out of `block_cols` columns, three update regions exist:
1657///
1658/// 1. **Failed×failed** (diagonal): A[k+e..k+bs, k+e..k+bs] -= W_blk * L_blk^T
1659/// 2. **Trailing×failed** (cross-term): A[ts..m, k+e..k+bs] -= W_panel * L_blk^T
1660/// 3. **Trailing×trailing**: handled separately by `update_trailing`
1661///
1662/// where:
1663/// - L_blk = a[k+e..k+bs, k..k+e] (within-block L for failed rows)
1664/// - L_panel = a[ts..m, k..k+e] (panel L below diagonal block)
1665/// - W = L * D (LD product)
1666/// - ts = k + bs (trailing start)
1667// SPRAL Equivalent: `Block::update` with rfrom/cfrom skip (`ldlt_app.cxx:1082-1153`) (BSD-3).
1668#[allow(clippy::too_many_arguments)]
1669fn update_cross_terms(
1670    mut a: MatMut<'_, f64>,
1671    col_start: usize,
1672    nelim: usize,
1673    block_cols: usize,
1674    m: usize,
1675    d: &MixedDiagonal,
1676    ld_buf: &mut Mat<f64>,
1677    copy_buf: &mut Mat<f64>,
1678) {
1679    if nelim == 0 || nelim >= block_cols {
1680        return; // No failed columns → no cross-term updates
1681    }
1682
1683    let n_failed = block_cols - nelim;
1684    let trailing_start = col_start + block_cols;
1685    let trailing_size = m - trailing_start;
1686
1687    // L_blk: the L entries for failed rows within the diagonal block
1688    // Copy to workspace to avoid aliasing: a[k+e..k+bs, k..k+e]
1689    {
1690        let src = a
1691            .rb()
1692            .submatrix(col_start + nelim, col_start, n_failed, nelim);
1693        let mut dst = copy_buf.as_mut().submatrix_mut(0, 0, n_failed, nelim);
1694        dst.copy_from(src);
1695    }
1696
1697    // W_blk = L_blk * D (into ld_buf)
1698    {
1699        let l_blk = copy_buf.as_ref().submatrix(0, 0, n_failed, nelim);
1700        compute_ld_into(
1701            l_blk,
1702            d,
1703            nelim,
1704            ld_buf.as_mut().submatrix_mut(0, 0, n_failed, nelim),
1705        );
1706    }
1707
1708    // Region 1: Failed×failed diagonal update
1709    // A[k+e..k+bs, k+e..k+bs] -= W_blk * L_blk^T (lower triangle only)
1710    {
1711        let l_blk = copy_buf.as_ref().submatrix(0, 0, n_failed, nelim);
1712        let w_blk = ld_buf.as_ref().submatrix(0, 0, n_failed, nelim);
1713        for j in 0..n_failed {
1714            for i in j..n_failed {
1715                let mut sum = 0.0;
1716                for c in 0..nelim {
1717                    sum += w_blk[(i, c)] * l_blk[(j, c)];
1718                }
1719                a[(col_start + nelim + i, col_start + nelim + j)] -= sum;
1720            }
1721        }
1722    }
1723
1724    // Region 2: Trailing×failed cross-term update
1725    // A[ts..m, k+e..k+bs] -= W_panel * L_blk^T
1726    if trailing_size > 0 {
1727        // Copy L_panel into workspace (offset past L_blk already in copy_buf)
1728        {
1729            let src = a
1730                .rb()
1731                .submatrix(trailing_start, col_start, trailing_size, nelim);
1732            let mut dst = copy_buf
1733                .as_mut()
1734                .submatrix_mut(n_failed, 0, trailing_size, nelim);
1735            dst.copy_from(src);
1736        }
1737
1738        // Now both l_blk and l_panel are in copy_buf at non-overlapping offsets.
1739        // Re-borrow the full copy_buf immutably to access both regions.
1740        let l_blk = copy_buf.as_ref().submatrix(0, 0, n_failed, nelim);
1741        let l_panel = copy_buf
1742            .as_ref()
1743            .submatrix(n_failed, 0, trailing_size, nelim);
1744
1745        // W_panel = L_panel * D (into ld_buf offset past W_blk)
1746        compute_ld_into(
1747            l_panel,
1748            d,
1749            nelim,
1750            ld_buf
1751                .as_mut()
1752                .submatrix_mut(n_failed, 0, trailing_size, nelim),
1753        );
1754        let w_panel = ld_buf.as_ref().submatrix(n_failed, 0, trailing_size, nelim);
1755
1756        for j in 0..n_failed {
1757            for i in 0..trailing_size {
1758                let mut sum = 0.0;
1759                for c in 0..nelim {
1760                    sum += w_panel[(i, c)] * l_blk[(j, c)];
1761                }
1762                a[(trailing_start + i, col_start + nelim + j)] -= sum;
1763            }
1764        }
1765    }
1766}
1767
/// Factor an nb-sized block using BLAS-3 Factor/Apply/Update loop.
///
/// # Lower-Triangle Convention
///
/// This function operates exclusively on the **lower triangle** of the dense
/// frontal matrix. All reads (pivot search, L extraction) and writes (Schur
/// updates via `BlockStructure::TriangularLower`, `swap_symmetric`) touch only
/// entries where `row >= col`. The upper triangle may contain stale values
/// after column swaps and Schur updates — this is intentional and safe because
/// no code path reads upper-triangle entries. This convention is consistent
/// across `factor_block_diagonal`, `apply_and_check`, `update_trailing`,
/// `update_cross_terms`, and `swap_symmetric_block`.
///
/// This is the middle level of the two-level hierarchy. Processes `num_fully_summed`
/// columns of the block `a[0..m, 0..m]` using ib-sized sub-blocks with the
/// three-phase BLAS-3 pattern (factor / apply+check / update):
///
/// 1. **Backup**: Save `a[k..m, k..k+block_size]` before factoring
/// 2. **Factor**: `factor_block_diagonal` on the ib×ib diagonal block (complete pivoting)
/// 3. **Permute panel**: apply block_perm to panel columns
/// 4. **Zero D off-diagonals**: for TRSM
/// 5. **Row perm propagation**: apply block_perm to columns 0..k (always, not just on success)
/// 6. **Update col_order**: by block_perm (always, not just on success)
/// 7. **Apply+Check**: TRSM on panel + threshold check → effective_nelim
/// 8. **Adjust**: avoid splitting 2×2 across boundary
/// 9. **Partial restore** (on failure): restore failed columns with permuted backup
/// 10. **Schur updates**: trailing×trailing + cross-term updates for failed columns
/// 11. **Delay** failed columns: swap to end_pos
/// 12. **Advance**: k += nelim (may be < block_size)
///
/// Key difference from the old implementation: on threshold failure we DO NOT
/// fully restore and retry. Instead we keep the passed columns, partially
/// restore only the failed columns (using permuted backup), apply Schur updates
/// from passed to failed+trailing, and advance.
///
/// # Arguments
/// - `a`: Dense frontal matrix block (m × m), modified in place
/// - `num_fully_summed`: Number of columns eligible for elimination
/// - `nfs_boundary`: Fully-summed boundary passed through to `update_trailing`
/// - `options`: APTP configuration (inner_block_size determines ib)
/// - `kernel_ws`: Pre-allocated backup/LD/copy workspaces reused across blocks
///
/// # Returns
/// An [`AptpFactorResult`] carrying D, the column permutation, the number of
/// eliminated columns, the delayed-column list, statistics, and the pivot log.
///
/// # References
/// - SPRAL: `run_elim_pivoted_notasks` in `ldlt_app.cxx:1585-1713`
/// - Duff, Hogg & Lopez (2020), Algorithm 3.1
fn factor_inner(
    mut a: MatMut<'_, f64>,
    num_fully_summed: usize,
    nfs_boundary: usize,
    options: &AptpOptions,
    kernel_ws: &mut AptpKernelWorkspace,
) -> Result<AptpFactorResult, SparseError> {
    let m = a.nrows();
    let ib = options.inner_block_size;
    let small = options.small;
    let threshold = options.threshold;
    let p = num_fully_summed;

    let mut col_order: Vec<usize> = (0..m).collect();
    let mut d = MixedDiagonal::new(p);
    let mut stats = AptpStatistics::default();
    let mut pivot_log = Vec::with_capacity(p);
    let mut k = 0;
    // Delayed columns are swapped into [end_pos..p); end_pos shrinks by one
    // per delayed column and the while loop stops once k reaches it.
    let mut end_pos = p;

    // Pre-allocated buffers reused across block iterations to avoid
    // per-block allocations in the hot loop.
    let mut panel_perm_buf = vec![0.0f64; ib];
    let mut row_perm_buf = vec![0.0f64; ib];
    let mut col_order_buf = vec![0usize; ib];

    // BLAS-3 Factor/Apply/Update loop with ib-sized inner blocks.
    //
    // BLAS-3 Factor/Apply/Update architecture:
    //   1. Backup (pre-factor)
    //   2. Factor diagonal block (complete pivoting, block-scoped swaps)
    //   3. Permute panel columns by block_perm
    //   4. Zero D off-diagonals for TRSM
    //   5. Row perm propagation to columns 0..k
    //   6. Update col_order by block_perm
    //   7. Apply+Check (TRSM + threshold scan)
    //   8. Adjust for 2×2 boundary
    //   9. On failure: partial restore of failed columns (not full restore+retry)
    //  10. Schur updates (trailing + cross-terms for failed columns)
    //  11. Delay failed columns (swap to end_pos)
    //  12. Advance k += effective_nelim
    //
    // Steps 5-6 happen BEFORE apply_and_check to ensure a consistent
    // permuted state regardless of threshold outcome.
    while k < end_pos {
        let block_size = (end_pos - k).min(ib);
        let block_end = k + block_size;

        // 1. BACKUP: save a[k..m, k..k+block_size] before factoring
        let backup = BlockBackup::create(a.as_ref(), k, block_size, m, &mut kernel_ws.backup);

        // 2. FACTOR: complete pivoting on the block_size×block_size diagonal block
        //    Block-scoped swaps: only rows/columns within [0..block_end] are permuted.
        //    Panel rows [block_end..m] are NOT touched.
        let (block_d, block_perm, block_nelim, block_stats, block_log) =
            factor_block_diagonal(a.rb_mut(), k, block_size, small, block_end);

        if block_nelim == 0 {
            // Entire block is singular — restore and delay all columns
            // NOTE(review): col_order has not been updated by block_perm in
            // this path (step 6 is skipped), so delayed_orig is read through
            // block_perm directly — verify this matches
            // factor_block_diagonal's block-scoped swap convention.
            backup.restore_failed(a.rb_mut(), k, 0, block_size, m);
            for offset in 0..block_size {
                let delayed_orig = col_order[k + block_perm[offset]];
                end_pos -= 1;
                if k + offset < end_pos {
                    swap_symmetric(a.rb_mut(), k + offset, end_pos);
                    col_order.swap(k + offset, end_pos);
                }
                stats.num_delayed += 1;
                pivot_log.push(AptpPivotRecord {
                    col: delayed_orig,
                    pivot_type: PivotType::Delayed,
                    max_l_entry: 0.0,
                    was_fallback: false,
                });
            }
            // Retry at the same k: end_pos shrank and fresh columns were
            // swapped into [k..end_pos).
            continue;
        }

        // 3. PERMUTE PANEL: reorder panel column entries a[r, k..k+bs]
        //    according to block_perm so that TRSM sees the correct input.
        let panel_start = block_end;
        for r in panel_start..m {
            // Gather through block_perm, then scatter back sequentially;
            // the buffer avoids in-place permutation hazards.
            for i in 0..block_size {
                panel_perm_buf[i] = a[(r, k + block_perm[i])];
            }
            for i in 0..block_size {
                a[(r, k + i)] = panel_perm_buf[i];
            }
        }

        // 4. Zero out D off-diagonals so apply_and_check's TRSM reads them as
        //    L11 entries (should be 0 for 2×2 pivots where L starts at row k+2).
        {
            let mut bc = 0;
            while bc < block_nelim {
                match block_d.pivot_type(bc) {
                    PivotType::TwoByTwo { partner } if partner > bc => {
                        a[(k + bc + 1, k + bc)] = 0.0;
                        bc += 2;
                    }
                    _ => {
                        bc += 1;
                    }
                }
            }
        }

        // 5. ROW PERM PROPAGATION: apply block_perm to columns 0..k.
        //    Done BEFORE apply_and_check to ensure the matrix is in a
        //    consistent permuted state regardless of threshold outcome.
        if k > 0 {
            for c in 0..k {
                for i in 0..block_size {
                    row_perm_buf[i] = a[(k + block_perm[i], c)];
                }
                for i in 0..block_size {
                    a[(k + i, c)] = row_perm_buf[i];
                }
            }
        }

        // 6. UPDATE COL_ORDER by block_perm (always, not just on success).
        col_order_buf[..block_size].copy_from_slice(&col_order[k..k + block_size]);
        for i in 0..block_size {
            col_order[k + i] = col_order_buf[block_perm[i]];
        }

        // 7. APPLY: TRSM on panel below + threshold check
        let mut effective_nelim = apply_and_check(
            a.rb_mut(),
            k,
            block_nelim,
            block_size,
            m,
            &block_d,
            threshold,
            options.par,
            &mut kernel_ws.l11_buf,
        );

        // 8. ADJUST: don't split 2×2 pivot across block boundary
        effective_nelim = adjust_for_2x2_boundary(effective_nelim, &block_d);

        // 9. PARTIAL RESTORE (on failure): restore only failed columns from
        //    pre-factor backup with permutation applied.
        //    We keep passed columns (0..effective_nelim) — their L11, D, L21 are committed.
        if effective_nelim < block_nelim {
            backup.restore_diagonal_with_perm(
                a.rb_mut(),
                k,
                effective_nelim,
                block_size,
                m,
                &block_perm,
            );
        }

        // Use effective_nelim as the number of passed columns for all subsequent steps.
        let nelim = effective_nelim;

        // 10. SCHUR UPDATES using only passed columns' L and D.
        //     Truncate block_d to passed columns for update computations.
        //     Three regions:
        //     a. Trailing×trailing: A[ts..m, ts..m] -= L_panel * D * L_panel^T
        //     b. Failed×failed: A[k+e..k+bs, k+e..k+bs] -= L_blk * D * L_blk^T
        //     c. Trailing×failed: A[ts..m, k+e..k+bs] -= L_panel * D * L_blk^T
        if nelim > 0 {
            update_trailing(
                a.rb_mut(),
                k,
                nelim,
                block_size,
                m,
                nfs_boundary,
                &block_d,
                options.par,
                &mut kernel_ws.ld_buf,
                &mut kernel_ws.copy_buf,
            );
            update_cross_terms(
                a.rb_mut(),
                k,
                nelim,
                block_size,
                m,
                &block_d,
                &mut kernel_ws.ld_buf,
                &mut kernel_ws.copy_buf,
            );
        }

        // 11. ACCUMULATE D entries for passed columns
        d.copy_from_offset(&block_d, k, nelim);

        // Accumulate stats from passed columns only (count from block_d directly)
        let mut passed_1x1 = 0;
        let mut passed_2x2 = 0;
        let mut sc = 0;
        while sc < nelim {
            match block_d.pivot_type(sc) {
                PivotType::OneByOne => {
                    passed_1x1 += 1;
                    sc += 1;
                }
                // Count each 2×2 once, at its leading column (partner > sc).
                PivotType::TwoByTwo { partner } if partner > sc => {
                    passed_2x2 += 1;
                    sc += 2;
                }
                _ => {
                    sc += 1;
                }
            }
        }
        stats.num_1x1 += passed_1x1;
        stats.num_2x2 += passed_2x2;
        if block_stats.max_l_entry > stats.max_l_entry {
            stats.max_l_entry = block_stats.max_l_entry;
        }
        // Check panel L entries for max_l_entry (passed columns only)
        for c in 0..nelim {
            for i in panel_start..m {
                let v = a[(i, k + c)].abs();
                if v > stats.max_l_entry {
                    stats.max_l_entry = v;
                }
            }
        }

        // Accumulate pivot log for passed columns, mapping block-local column
        // indices (and 2×2 partners) to original column indices via col_order.
        for record in &block_log {
            if !matches!(record.pivot_type, PivotType::Delayed) && record.col < nelim {
                let global_col = col_order[k + record.col];
                let global_pivot_type = match record.pivot_type {
                    PivotType::TwoByTwo { partner } => PivotType::TwoByTwo {
                        partner: col_order[k + partner],
                    },
                    other => other,
                };
                pivot_log.push(AptpPivotRecord {
                    col: global_col,
                    pivot_type: global_pivot_type,
                    max_l_entry: record.max_l_entry,
                    was_fallback: record.was_fallback,
                });
            }
        }

        // 12. DELAY failed columns (nelim..block_size): swap to end_pos
        if nelim < block_size {
            let n_delayed = block_size - nelim;
            for i in (0..n_delayed).rev() {
                let delayed_pos = k + nelim + i;
                end_pos -= 1;
                if delayed_pos < end_pos {
                    swap_symmetric(a.rb_mut(), delayed_pos, end_pos);
                    col_order.swap(delayed_pos, end_pos);
                }
                stats.num_delayed += 1;
                // After the swap (or when delayed_pos == end_pos and no swap
                // was needed), col_order[end_pos] holds the delayed column's
                // original index.
                pivot_log.push(AptpPivotRecord {
                    col: col_order[end_pos],
                    pivot_type: PivotType::Delayed,
                    max_l_entry: 0.0,
                    was_fallback: false,
                });
            }
        }

        k += nelim;
    }

    // k advanced only by accepted pivots, so it equals the elimination count.
    let num_eliminated = k;

    d.truncate(num_eliminated);

    let delayed_cols: Vec<usize> = (num_eliminated..p).map(|i| col_order[i]).collect();

    Ok(AptpFactorResult {
        d,
        perm: col_order,
        num_eliminated,
        delayed_cols,
        stats,
        pivot_log,
    })
}
2096
2097/// Two-level outer block loop for large frontal matrices.
2098///
2099/// Processes nb-sized blocks. For each outer block: `factor_inner` handles
2100/// the BLAS-3 Factor/Apply/Update loop internally on ib-sized sub-blocks,
2101/// including threshold checking on the panel via `apply_and_check` and
2102/// block-level backup/restore on failure.
2103///
2104/// Called by `aptp_factor_in_place` when `num_fully_summed > outer_block_size`.
2105///
2106/// # References
2107/// - Duff, Hogg & Lopez (2020), Algorithm 3.1: two-level outer loop
2108fn two_level_factor(
2109    mut a: MatMut<'_, f64>,
2110    num_fully_summed: usize,
2111    options: &AptpOptions,
2112    kernel_ws: &mut AptpKernelWorkspace,
2113) -> Result<AptpFactorResult, SparseError> {
2114    let m = a.nrows();
2115    let nb = options.outer_block_size;
2116    let p = num_fully_summed;
2117
2118    let mut col_order: Vec<usize> = (0..m).collect();
2119    let mut global_d = MixedDiagonal::new(p);
2120    let mut stats = AptpStatistics::default();
2121    let mut pivot_log = Vec::with_capacity(p);
2122
2123    let mut global_nelim = 0;
2124    let mut remaining_fully_summed = p;
2125
2126    while remaining_fully_summed > 0 {
2127        let col_start = global_nelim;
2128        let block_cols = remaining_fully_summed.min(nb);
2129
2130        // FACTOR: inner APTP on the full view (including panel rows).
2131        // factor_inner handles:
2132        //   - Complete pivoting search within ib×ib sub-blocks
2133        //   - Symmetric swaps applied to ALL rows (including panel)
2134        //   - L entry computation + threshold check for ALL rows (via try_1x1/try_2x2)
2135        //   - Schur complement update for ALL trailing rows
2136        //   - Internal backup/restore for failed pivots (inside try_1x1/try_2x2)
2137        //
2138        // No external backup/restore is needed: factor_inner handles all
2139        // threshold failures internally, and the Schur complement has been
2140        // applied to all trailing entries including any delayed columns.
2141        let block_m = m - col_start;
2142        let block_result = {
2143            let block_view = a
2144                .rb_mut()
2145                .submatrix_mut(col_start, col_start, block_m, block_m);
2146            // nfs_boundary relative to this subview: p - col_start
2147            // This ensures inner blocks skip NFS×NFS updates consistently
2148            // with the deferred GEMM that runs after aptp_factor_in_place.
2149            factor_inner(block_view, block_cols, p - col_start, options, kernel_ws)?
2150        };
2151        let block_nelim = block_result.num_eliminated;
2152
2153        // PROPAGATE ROW PERMUTATION to already-factored columns.
2154        //
2155        // factor_inner operates on a submatrix view [col_start..m, col_start..m].
2156        // Its swap_symmetric calls only rearrange rows WITHIN that submatrix.
2157        // The L entries from previously-factored blocks (columns 0..col_start)
2158        // are NOT rearranged by these swaps. We must apply the same row
2159        // permutation to those columns so that extract_l reads consistent
2160        // L entries across all blocks.
2161        if col_start > 0 {
2162            let block_perm = &block_result.perm;
2163            let mut temp = vec![0.0f64; block_cols];
2164            for c in 0..col_start {
2165                // Gather: temp[i] = row that ended up at local position i
2166                for i in 0..block_cols {
2167                    temp[i] = a[(col_start + block_perm[i], c)];
2168                }
2169                // Scatter: write to sequential positions
2170                for i in 0..block_cols {
2171                    a[(col_start + i, c)] = temp[i];
2172                }
2173            }
2174        }
2175
2176        // Update col_order BEFORE delay swap: factor_inner may have permuted
2177        // columns within the block. We must capture the pre-swap col_order so
2178        // that orig_order[block_perm[i]] reads the correct original column index.
2179        // If we swapped first, delayed positions would contain post-swap values,
2180        // corrupting the mapping for eliminated columns.
2181        {
2182            let block_perm = &block_result.perm;
2183            let orig_order: Vec<usize> = col_order[col_start..col_start + block_cols].to_vec();
2184            for i in 0..block_cols {
2185                // Entries with block_perm[i] >= block_cols are contribution block rows
2186                // (not fully-summed columns), so they don't have a col_order mapping.
2187                if block_perm[i] < block_cols {
2188                    col_order[col_start + i] = orig_order[block_perm[i]];
2189                }
2190            }
2191        }
2192
2193        // ADJUST delayed columns: swap them to end of unprocessed region
2194        // so the next outer block processes fresh columns first.
2195        if block_nelim < block_cols {
2196            let n_failed = block_cols - block_nelim;
2197            for i in 0..n_failed {
2198                let failed_pos = col_start + block_nelim + i;
2199                let end = col_start + remaining_fully_summed - 1 - i;
2200                if failed_pos < end {
2201                    swap_symmetric(a.rb_mut(), failed_pos, end);
2202                    col_order.swap(failed_pos, end);
2203                }
2204            }
2205            stats.num_delayed += n_failed;
2206        }
2207
2208        // 4. Accumulate into global result
2209        global_d.copy_from_offset(&block_result.d, global_nelim, block_nelim);
2210
2211        // Merge stats
2212        stats.num_1x1 += block_result.stats.num_1x1;
2213        stats.num_2x2 += block_result.stats.num_2x2;
2214        if block_result.stats.max_l_entry > stats.max_l_entry {
2215            stats.max_l_entry = block_result.stats.max_l_entry;
2216        }
2217
2218        // Merge pivot log
2219        for record in &block_result.pivot_log {
2220            if !matches!(record.pivot_type, PivotType::Delayed) {
2221                pivot_log.push(record.clone());
2222            }
2223        }
2224        // Add delayed logs
2225        let n_failed = block_cols - block_nelim;
2226        for i in 0..n_failed {
2227            let delayed_pos = col_start + remaining_fully_summed - 1 - i;
2228            pivot_log.push(AptpPivotRecord {
2229                col: col_order[delayed_pos],
2230                pivot_type: PivotType::Delayed,
2231                max_l_entry: 0.0,
2232                was_fallback: false,
2233            });
2234        }
2235
2236        global_nelim += block_nelim;
2237        remaining_fully_summed -= block_cols;
2238    }
2239
2240    global_d.truncate(global_nelim);
2241
2242    let delayed_cols: Vec<usize> = (global_nelim..p).map(|i| col_order[i]).collect();
2243
2244    Ok(AptpFactorResult {
2245        d: global_d,
2246        perm: col_order,
2247        num_eliminated: global_nelim,
2248        delayed_cols,
2249        stats,
2250        pivot_log,
2251    })
2252}
2253
2254// ---------------------------------------------------------------------------
2255// TPP (Threshold Partial Pivoting) fallback
2256// ---------------------------------------------------------------------------
2257
2258/// Test if `(t, p)` with `t < p` form a good 2x2 pivot.
2259///
2260/// Three necessary conditions for a stable 2x2 block pivot:
2261/// 1. Non-zero pivot block: max(|a11|, |a21|, |a22|) >= small
2262/// 2. Non-singular determinant with cancellation guard
2263/// 3. Threshold: u * max(|D^{-1}_{11}|*maxt + |D^{-1}_{12}|*maxp,
2264///    |D^{-1}_{12}|*maxt + |D^{-1}_{22}|*maxp) < 1
2265///
2266/// Returns `Some((a11, a21, a22))` (D values, NOT D^{-1}) on success.
2267// SPRAL Equivalent: `test_2x2()` in `spral/src/ssids/cpu/kernels/ldlt_tpp.cxx` (BSD-3).
2268fn tpp_test_2x2(
2269    a: MatRef<'_, f64>,
2270    t: usize,
2271    p: usize,
2272    maxt: f64,
2273    maxp: f64,
2274    u: f64,
2275    small: f64,
2276) -> Option<(f64, f64, f64)> {
2277    debug_assert!(t < p, "tpp_test_2x2 requires t < p");
2278
2279    let a11 = a[(t, t)];
2280    let a21 = a[(p, t)]; // lower triangle: p > t
2281    let a22 = a[(p, p)];
2282
2283    // 1. Non-zero pivot block
2284    let maxpiv = a11.abs().max(a21.abs()).max(a22.abs());
2285    if maxpiv < small {
2286        return None;
2287    }
2288
2289    // 2. Cancellation guard on determinant
2290    let detscale = 1.0 / maxpiv;
2291    let detpiv0 = (a11 * detscale) * a22;
2292    let detpiv1 = (a21 * detscale) * a21;
2293    let detpiv = detpiv0 - detpiv1;
2294    if detpiv.abs() < small.max((detpiv0 / 2.0).abs()).max((detpiv1 / 2.0).abs()) {
2295        return None;
2296    }
2297
2298    // 3. Threshold test using D^{-1}
2299    let d_inv_11 = (a22 * detscale) / detpiv;
2300    let d_inv_12 = (-a21 * detscale) / detpiv;
2301    let d_inv_22 = (a11 * detscale) / detpiv;
2302
2303    if maxt.max(maxp) < small {
2304        // Rest of column is small — accept
2305        return Some((a11, a21, a22));
2306    }
2307
2308    let x1 = d_inv_11.abs() * maxt + d_inv_12.abs() * maxp;
2309    let x2 = d_inv_12.abs() * maxt + d_inv_22.abs() * maxp;
2310    if u * x1.max(x2) < 1.0 {
2311        Some((a11, a21, a22))
2312    } else {
2313        None
2314    }
2315}
2316
2317/// TPP fallback: serial column-by-column factorization on APTP's remaining columns.
2318///
2319/// Operates on the partially-factored matrix `a` where columns `0..start_col`
2320/// have been eliminated by APTP. Searches all remaining fully-summed columns
2321/// (`start_col..num_fully_summed`) for acceptable pivots using threshold partial
2322/// pivoting.
2323///
2324/// Uses full-matrix `swap_symmetric` which correctly propagates row swaps to
2325/// already-factored L columns (essential for correct L21 computation when rows
2326/// are reordered).
2327///
2328/// Returns the number of additional columns eliminated.
2329///
2330/// # References
2331///
2332/// - SPRAL `ldlt_tpp_factor()` in `spral/src/ssids/cpu/kernels/ldlt_tpp.cxx` (BSD-3)
2333/// - Duff, Hogg & Lopez (2020), Section 3: TPP as fallback after APTP
2334#[allow(clippy::too_many_arguments)]
fn tpp_factor(
    mut a: MatMut<'_, f64>,
    start_col: usize,
    num_fully_summed: usize,
    col_order: &mut [usize],
    global_d: &mut MixedDiagonal,
    stats: &mut AptpStatistics,
    pivot_log: &mut Vec<AptpPivotRecord>,
    options: &AptpOptions,
) -> usize {
    // `a` is a square m×m frontal matrix; only columns
    // start_col..num_fully_summed are eligible for elimination. Returns the
    // number of columns this call eliminated (nelim - start_col); columns left
    // uneliminated when no acceptable pivot remains stay in place for the
    // caller to treat as delayed. Every pivot recorded here is flagged
    // `was_fallback: true`, since TPP only runs after APTP has rejected.
    let m = a.nrows();
    let n = num_fully_summed;
    let u = options.threshold;
    let small = options.small;

    let mut nelim = start_col;

    // Main TPP loop: each iteration eliminates one (1x1) or two (2x2)
    // columns, or breaks when no stable pivot can be found.
    while nelim < n {
        // Check if current column is effectively zero
        if tpp_is_col_small(a.as_ref(), nelim, nelim, m, small) {
            // Zero pivot: record and advance
            global_d.set_1x1(nelim, 0.0);
            stats.num_1x1 += 1;
            pivot_log.push(AptpPivotRecord {
                col: col_order[nelim],
                pivot_type: PivotType::OneByOne,
                max_l_entry: 0.0,
                was_fallback: true,
            });
            // Zero the column entries
            for r in (nelim + 1)..m {
                a[(r, nelim)] = 0.0;
            }
            nelim += 1;
            continue;
        }

        // Search columns p = nelim+1..n-1 for acceptable pivot
        let mut found = false;
        for p in (nelim + 1)..n {
            // Check if column p is effectively zero
            if tpp_is_col_small(a.as_ref(), p, nelim, m, small) {
                // Swap zero column to front and record zero pivot
                if p != nelim {
                    swap_symmetric(a.rb_mut(), p, nelim);
                    col_order.swap(p, nelim);
                }
                global_d.set_1x1(nelim, 0.0);
                stats.num_1x1 += 1;
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: 0.0,
                    was_fallback: true,
                });
                for r in (nelim + 1)..m {
                    a[(r, nelim)] = 0.0;
                }
                nelim += 1;
                found = true;
                break;
            }

            // Find column index t of largest |a(p, c)| for c in nelim..p
            let t = tpp_find_row_abs_max(a.as_ref(), p, nelim, p);

            // Try (t, p) as 2x2 pivot (requires t < p)
            // maxt/maxp exclude each other's index and the diagonals, per the
            // Bunch-Kaufman style off-pivot growth test.
            let maxt = tpp_find_rc_abs_max_exclude(a.as_ref(), t, nelim, m, p);
            let maxp = tpp_find_rc_abs_max_exclude(a.as_ref(), p, nelim, m, t);
            if tpp_test_2x2(a.as_ref(), t, p, maxt, maxp, u, small).is_some() {
                // Accept 2x2 pivot: swap t→nelim, p→nelim+1
                if t != nelim {
                    swap_symmetric(a.rb_mut(), t, nelim);
                    col_order.swap(t, nelim);
                }
                // After first swap, p may have moved
                let new_p = if p == nelim { t } else { p };
                if new_p != nelim + 1 {
                    swap_symmetric(a.rb_mut(), new_p, nelim + 1);
                    col_order.swap(new_p, nelim + 1);
                }

                // Re-read D values after swap
                // (must happen before tpp_apply_2x2, which overwrites the
                // pivot columns with L and zeroes the stored off-diagonal)
                let d11 = a[(nelim, nelim)];
                let d21 = a[(nelim + 1, nelim)];
                let d22 = a[(nelim + 1, nelim + 1)];

                // Apply elimination and Schur update
                let max_l = tpp_apply_2x2(a.rb_mut(), nelim, m, n);

                // Record pivot
                global_d.set_2x2(Block2x2 {
                    first_col: nelim,
                    a: d11,
                    b: d21,
                    c: d22,
                });
                stats.num_2x2 += 1;
                stats.max_l_entry = stats.max_l_entry.max(max_l);
                // One log record per column of the 2x2 pair, cross-linked via
                // `partner`.
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::TwoByTwo {
                        partner: col_order[nelim + 1],
                    },
                    max_l_entry: max_l,
                    was_fallback: true,
                });
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim + 1],
                    pivot_type: PivotType::TwoByTwo {
                        partner: col_order[nelim],
                    },
                    max_l_entry: max_l,
                    was_fallback: true,
                });
                nelim += 2;
                found = true;
                break;
            }

            // Try p as 1x1 pivot
            // maxp should include |a(p, t)| for the off-diagonal contribution
            let maxp_with_t = maxp.max(tpp_sym_entry(a.as_ref(), p, t).abs());
            if a[(p, p)].abs() >= u * maxp_with_t {
                // Accept 1x1 pivot: swap p→nelim
                if p != nelim {
                    swap_symmetric(a.rb_mut(), p, nelim);
                    col_order.swap(p, nelim);
                }

                let max_l = tpp_apply_1x1(a.rb_mut(), nelim, m, n);

                global_d.set_1x1(nelim, a[(nelim, nelim)]);
                stats.num_1x1 += 1;
                stats.max_l_entry = stats.max_l_entry.max(max_l);
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: max_l,
                    was_fallback: true,
                });
                nelim += 1;
                found = true;
                break;
            }
        }

        if !found {
            // Last resort: try column nelim as 1x1 (we started searching at nelim+1)
            let maxp = tpp_find_rc_abs_max_exclude(a.as_ref(), nelim, nelim, m, usize::MAX);
            if a[(nelim, nelim)].abs() >= u * maxp {
                let max_l = tpp_apply_1x1(a.rb_mut(), nelim, m, n);

                global_d.set_1x1(nelim, a[(nelim, nelim)]);
                stats.num_1x1 += 1;
                stats.max_l_entry = stats.max_l_entry.max(max_l);
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: max_l,
                    was_fallback: true,
                });
                nelim += 1;
            } else {
                // No more pivots can be found; columns nelim..n remain
                // uneliminated for the caller to delay
                break;
            }
        }
    }

    nelim - start_col
}
2507
2508// ---------------------------------------------------------------------------
2509// Rectangular TPP factorization for small-leaf fast path
2510// ---------------------------------------------------------------------------
2511
2512/// Swap rows/columns i and j in a rectangular m×n lower-triangle-like matrix.
2513///
2514/// The matrix `a` has m rows and n columns (m ≥ n). It stores:
2515/// - Column data in columns 0..n (the portion being factored)
2516/// - Row data extends to all m rows
2517///
2518/// Unlike `swap_symmetric` which operates on an m×m square matrix, this limits
2519/// column operations to 0..n. Row swaps span the full column range 0..n.
2520// SPRAL Equivalent: `swap_cols(col1, col2, m, n, ...)` in `ldlt_tpp.cxx:44-72` (BSD-3).
2521fn swap_rect(mut a: MatMut<'_, f64>, i: usize, j: usize) {
2522    if i == j {
2523        return;
2524    }
2525    let (i, j) = if i < j { (i, j) } else { (j, i) };
2526    let n = a.ncols();
2527
2528    // Swap diagonals (only if both are within column range)
2529    if i < n && j < n {
2530        let tmp = a[(i, i)];
2531        a[(i, i)] = a[(j, j)];
2532        a[(j, j)] = tmp;
2533    }
2534
2535    // Rows k < i: swap a[(i,k)] and a[(j,k)] — both are in columns 0..i < n
2536    for k in 0..i.min(n) {
2537        let tmp = a[(i, k)];
2538        a[(i, k)] = a[(j, k)];
2539        a[(j, k)] = tmp;
2540    }
2541
2542    // Rows i < k < j: swap a[(k,i)] and a[(j,k)]
2543    // a[(k,i)] is in column i < n; a[(j,k)] is in column k
2544    for k in (i + 1)..j {
2545        if k < n {
2546            let tmp = a[(k, i)];
2547            a[(k, i)] = a[(j, k)];
2548            a[(j, k)] = tmp;
2549        } else if i < n {
2550            // k >= n: only a[(k, i)] is accessible (column i < n)
2551            // a[(j, k)] is inaccessible (column k >= n)
2552            // Nothing to swap on the column side
2553            break;
2554        }
2555    }
2556
2557    // Rows k > j: swap a[(k,i)] and a[(k,j)]
2558    // Only accessible if both i < n and j < n
2559    if i < n && j < n {
2560        for k in (j + 1)..a.nrows() {
2561            let tmp = a[(k, i)];
2562            a[(k, i)] = a[(k, j)];
2563            a[(k, j)] = tmp;
2564        }
2565    }
2566
2567    // Cross element a[(j,i)] stays unchanged (if j < n, it's the off-diagonal)
2568}
2569
2570/// Read symmetric entry `a(row, col)` from lower-triangle storage.
2571///
2572/// Works with both square (m×m) and rectangular (m×n, m ≥ n) matrices.
2573/// For row >= col with col < n: reads `a[(row, col)]` directly.
2574/// For row < col with col < n: reads `a[(col, row)]` (symmetric reflection).
2575/// Panics if both row and col are >= n (no data stored there).
2576#[inline]
2577fn tpp_sym_entry(a: MatRef<'_, f64>, row: usize, col: usize) -> f64 {
2578    let n = a.ncols();
2579    if col < n && row >= col {
2580        a[(row, col)]
2581    } else if row < n && col >= row {
2582        a[(col, row)]
2583    } else {
2584        panic!("tpp_sym_entry: row={} col={} both >= ncols={}", row, col, n);
2585    }
2586}
2587
2588/// Apply 1x1 pivot elimination at position `nelim`.
2589///
2590/// Computes L entries (divides column by D) for ALL rows (FS + NFS), then
2591/// performs rank-1 Schur complement update only on FS columns (0..num_fully_summed).
2592/// Works with both square (m×m) and rectangular (m×n, m ≥ n) matrices.
2593///
2594/// Returns the maximum absolute L entry.
2595fn tpp_apply_1x1(mut a: MatMut<'_, f64>, nelim: usize, m: usize, num_fully_summed: usize) -> f64 {
2596    let d = a[(nelim, nelim)];
2597    let inv_d = 1.0 / d;
2598    let p = num_fully_summed;
2599
2600    // Compute L entries for ALL rows (FS + NFS)
2601    let mut max_l = 0.0_f64;
2602    for i in (nelim + 1)..m {
2603        let l_ik = a[(i, nelim)] * inv_d;
2604        a[(i, nelim)] = l_ik;
2605        max_l = max_l.max(l_ik.abs());
2606    }
2607
2608    // Rank-1 Schur complement update: only FS columns (< p),
2609    // but update ALL rows in each column (FS + NFS)
2610    let schur_col_end = p.min(a.ncols());
2611    for j in (nelim + 1)..schur_col_end {
2612        let ldlj = a[(j, nelim)] * d;
2613        for i in j..m {
2614            a[(i, j)] -= a[(i, nelim)] * ldlj;
2615        }
2616    }
2617
2618    max_l
2619}
2620
2621/// Apply 2x2 pivot elimination at positions `(nelim, nelim+1)`.
2622///
2623/// Computes L entries using D^{-1}, then performs rank-2 Schur complement
2624/// update only on FS columns (0..num_fully_summed). Works with both square
2625/// (m×m) and rectangular (m×n, m ≥ n) matrices.
2626///
2627/// Returns the maximum absolute L entry.
2628fn tpp_apply_2x2(mut a: MatMut<'_, f64>, nelim: usize, m: usize, num_fully_summed: usize) -> f64 {
2629    let a11 = a[(nelim, nelim)];
2630    let a21 = a[(nelim + 1, nelim)];
2631    let a22 = a[(nelim + 1, nelim + 1)];
2632    let det = a11 * a22 - a21 * a21;
2633    let inv_det = 1.0 / det;
2634    let p = num_fully_summed;
2635
2636    // Compute L entries for ALL rows (FS + NFS)
2637    let mut max_l = 0.0_f64;
2638    let start = nelim + 2;
2639    for i in start..m {
2640        let ai1 = a[(i, nelim)];
2641        let ai2 = a[(i, nelim + 1)];
2642        let l_i1 = (ai1 * a22 - ai2 * a21) * inv_det;
2643        let l_i2 = (ai2 * a11 - ai1 * a21) * inv_det;
2644        a[(i, nelim)] = l_i1;
2645        a[(i, nelim + 1)] = l_i2;
2646        max_l = max_l.max(l_i1.abs()).max(l_i2.abs());
2647    }
2648
2649    // Rank-2 Schur complement update: only FS columns (< p),
2650    // but update ALL rows in each column
2651    let schur_col_end = p.min(a.ncols());
2652    for j in start..schur_col_end {
2653        let wj1 = a[(j, nelim)] * a11 + a[(j, nelim + 1)] * a21;
2654        let wj2 = a[(j, nelim)] * a21 + a[(j, nelim + 1)] * a22;
2655        for i in j..m {
2656            a[(i, j)] -= a[(i, nelim)] * wj1 + a[(i, nelim + 1)] * wj2;
2657        }
2658    }
2659
2660    // Zero D off-diagonal for extract_front_factors convention
2661    a[(nelim + 1, nelim)] = 0.0;
2662
2663    max_l
2664}
2665
2666/// Check if all entries in row/column `idx` within the uneliminated region are
2667/// smaller than `small` in absolute value. Works with both square (m×m) and
2668/// rectangular (m×n, m ≥ n) matrices.
2669// SPRAL Equivalent: `check_col_small()` in `spral/src/ssids/cpu/kernels/ldlt_tpp.cxx` (BSD-3).
2670fn tpp_is_col_small(a: MatRef<'_, f64>, idx: usize, from: usize, to: usize, small: f64) -> bool {
2671    let n = a.ncols();
2672    // Row entries: a[(idx, c)] for c < idx (lower triangle: idx > c), limited to ncols
2673    for c in from..idx.min(n) {
2674        if a[(idx, c)].abs() >= small {
2675            return false;
2676        }
2677    }
2678    // Column entries: a[(r, idx)] for r >= idx, if idx < n
2679    if idx < n {
2680        for r in idx..to {
2681            if a[(r, idx)].abs() >= small {
2682                return false;
2683            }
2684        }
2685    }
2686    true
2687}
2688
2689/// Find the column index with largest absolute entry in row `p`, scanning
2690/// columns `from..to`. Works with both square and rectangular storage.
2691// SPRAL Equivalent: `find_row_abs_max()` in `spral/src/ssids/cpu/kernels/ldlt_tpp.cxx` (BSD-3).
2692fn tpp_find_row_abs_max(a: MatRef<'_, f64>, p: usize, from: usize, to: usize) -> usize {
2693    if from >= to {
2694        return from;
2695    }
2696    let mut best_idx = from;
2697    let mut best_val = tpp_sym_entry(a, p, from).abs();
2698    for c in (from + 1)..to {
2699        let v = tpp_sym_entry(a, p, c).abs();
2700        if v > best_val {
2701            best_idx = c;
2702            best_val = v;
2703        }
2704    }
2705    best_idx
2706}
2707
2708/// Find max absolute value in row/column `col` among uneliminated positions,
2709/// excluding one index and the diagonal. Works with both square and rectangular storage.
2710// SPRAL Equivalent: `find_rc_abs_max_exclude()` in `spral/src/ssids/cpu/kernels/ldlt_tpp.cxx` (BSD-3).
2711fn tpp_find_rc_abs_max_exclude(
2712    a: MatRef<'_, f64>,
2713    col: usize,
2714    nelim: usize,
2715    m: usize,
2716    exclude: usize,
2717) -> f64 {
2718    let n = a.ncols();
2719    let mut best = 0.0_f64;
2720    // Row part: a[(col, c)] for c in nelim..col (limited to ncols)
2721    for c in nelim..col.min(n) {
2722        if c == exclude {
2723            continue;
2724        }
2725        best = best.max(a[(col, c)].abs());
2726    }
2727    // Column part: a[(r, col)] for r in col+1..m (if col < n)
2728    if col < n {
2729        for r in (col + 1)..m {
2730            if r == exclude {
2731                continue;
2732            }
2733            best = best.max(a[(r, col)].abs());
2734        }
2735    }
2736    best
2737}
2738
/// Factor a rectangular m×n matrix using Threshold Partial Pivoting (TPP).
///
/// This is the small-leaf fast-path kernel. Unlike `tpp_factor` which operates
/// on a square m×m matrix, this takes a rectangular m×n matrix (m ≥ n) where
/// n = num_fully_summed. The matrix stores the lower triangle of the frontal
/// matrix restricted to the first n columns.
///
/// L entries are stored in-place in columns 0..num_eliminated.
/// The NFS×FS cross-terms (rows n..m, columns 0..num_eliminated) contain valid
/// L21 data needed for the contribution GEMM.
///
/// Pivots recorded here carry `was_fallback: false`, since on this path TPP
/// is the primary strategy rather than a fallback after APTP.
///
/// # Arguments
///
/// - `a`: Rectangular m×n matrix (m rows, n fully-summed columns), mutated in place.
/// - `num_fully_summed`: Number of columns eligible for elimination (= n = `a.ncols()`).
/// - `options`: APTP configuration.
///
/// # Returns
///
/// `AptpFactorResult` with D, permutation, elimination count, and diagnostics.
/// (Always `Ok` in the visible code path; the `Result` return matches the
/// sibling factor kernels' signatures.)
// SPRAL Equivalent: `ldlt_tpp_factor(m, n, perm, lcol, ldl, d, ld, ...)`
// in `spral/src/ssids/cpu/kernels/ldlt_tpp.cxx` (BSD-3).
pub(super) fn tpp_factor_rect(
    mut a: MatMut<'_, f64>,
    num_fully_summed: usize,
    options: &AptpOptions,
) -> Result<AptpFactorResult, SparseError> {
    let m = a.nrows();
    let n = num_fully_summed;

    debug_assert!(
        a.ncols() >= n,
        "tpp_factor_rect: ncols {} < num_fully_summed {}",
        a.ncols(),
        n
    );
    debug_assert!(m >= n, "tpp_factor_rect: nrows {} < ncols {}", m, n);

    let u = options.threshold;
    let small = options.small;

    // Identity permutation over all m rows; pivot swaps below keep it in sync
    // with the row/column exchanges applied to `a`.
    let mut col_order: Vec<usize> = (0..m).collect();
    let mut d = MixedDiagonal::new(n);
    let mut stats = AptpStatistics::default();
    let mut pivot_log = Vec::with_capacity(n);

    let mut nelim = 0;

    // Main TPP loop: each iteration eliminates one (1x1) or two (2x2)
    // columns, or breaks when no stable pivot can be found.
    while nelim < n {
        // Check if current column is effectively zero
        if tpp_is_col_small(a.as_ref(), nelim, nelim, m, small) {
            d.set_1x1(nelim, 0.0);
            stats.num_1x1 += 1;
            pivot_log.push(AptpPivotRecord {
                col: col_order[nelim],
                pivot_type: PivotType::OneByOne,
                max_l_entry: 0.0,
                was_fallback: false,
            });
            if nelim < a.ncols() {
                for r in (nelim + 1)..m {
                    a[(r, nelim)] = 0.0;
                }
            }
            nelim += 1;
            continue;
        }

        // Search columns p = nelim+1..n for acceptable pivot
        let mut found = false;
        for p in (nelim + 1)..n {
            // Check if column p is effectively zero
            if tpp_is_col_small(a.as_ref(), p, nelim, m, small) {
                if p != nelim {
                    swap_rect(a.rb_mut(), p, nelim);
                    col_order.swap(p, nelim);
                }
                d.set_1x1(nelim, 0.0);
                stats.num_1x1 += 1;
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: 0.0,
                    was_fallback: false,
                });
                for r in (nelim + 1)..m {
                    a[(r, nelim)] = 0.0;
                }
                nelim += 1;
                found = true;
                break;
            }

            // Find column index t of largest |a(p, c)| for c in nelim..p
            let t = tpp_find_row_abs_max(a.as_ref(), p, nelim, p);

            // Try (t, p) as 2x2 pivot (requires t < p)
            let maxt = tpp_find_rc_abs_max_exclude(a.as_ref(), t, nelim, m, p);
            let maxp = tpp_find_rc_abs_max_exclude(a.as_ref(), p, nelim, m, t);
            // Use tpp_test_2x2 which reads from (t,t), (p,t), (p,p) in lower triangle
            // These are all accessible in rectangular storage since t < p < n <= ncols
            if tpp_test_2x2(a.as_ref(), t, p, maxt, maxp, u, small).is_some() {
                // Accept 2x2 pivot: swap t→nelim, p→nelim+1
                if t != nelim {
                    swap_rect(a.rb_mut(), t, nelim);
                    col_order.swap(t, nelim);
                }
                let new_p = if p == nelim { t } else { p };
                if new_p != nelim + 1 {
                    swap_rect(a.rb_mut(), new_p, nelim + 1);
                    col_order.swap(new_p, nelim + 1);
                }

                // Read D before tpp_apply_2x2 overwrites the pivot columns
                // with L and zeroes the stored off-diagonal.
                let d11 = a[(nelim, nelim)];
                let d21 = a[(nelim + 1, nelim)];
                let d22 = a[(nelim + 1, nelim + 1)];

                let max_l = tpp_apply_2x2(a.rb_mut(), nelim, m, n);

                d.set_2x2(Block2x2 {
                    first_col: nelim,
                    a: d11,
                    b: d21,
                    c: d22,
                });
                stats.num_2x2 += 1;
                stats.max_l_entry = stats.max_l_entry.max(max_l);
                // One log record per column of the 2x2 pair, cross-linked via
                // `partner`.
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::TwoByTwo {
                        partner: col_order[nelim + 1],
                    },
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim + 1],
                    pivot_type: PivotType::TwoByTwo {
                        partner: col_order[nelim],
                    },
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                nelim += 2;
                found = true;
                break;
            }

            // Try p as 1x1 pivot
            // (maxp excluded index t, so fold |a(p, t)| back in)
            let maxp_with_t = maxp.max(tpp_sym_entry(a.as_ref(), p, t).abs());
            if a[(p, p)].abs() >= u * maxp_with_t {
                if p != nelim {
                    swap_rect(a.rb_mut(), p, nelim);
                    col_order.swap(p, nelim);
                }

                let max_l = tpp_apply_1x1(a.rb_mut(), nelim, m, n);

                d.set_1x1(nelim, a[(nelim, nelim)]);
                stats.num_1x1 += 1;
                stats.max_l_entry = stats.max_l_entry.max(max_l);
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                nelim += 1;
                found = true;
                break;
            }
        }

        if !found {
            // Last resort: try column nelim as 1x1
            let maxp = tpp_find_rc_abs_max_exclude(a.as_ref(), nelim, nelim, m, usize::MAX);
            if a[(nelim, nelim)].abs() >= u * maxp {
                let max_l = tpp_apply_1x1(a.rb_mut(), nelim, m, n);

                d.set_1x1(nelim, a[(nelim, nelim)]);
                stats.num_1x1 += 1;
                stats.max_l_entry = stats.max_l_entry.max(max_l);
                pivot_log.push(AptpPivotRecord {
                    col: col_order[nelim],
                    pivot_type: PivotType::OneByOne,
                    max_l_entry: max_l,
                    was_fallback: false,
                });
                nelim += 1;
            } else {
                // No acceptable pivot anywhere: remaining columns are delayed
                break;
            }
        }
    }

    // Shrink D to the eliminated prefix; the rest becomes delayed columns.
    d.truncate(nelim);
    stats.num_delayed = n - nelim;

    let delayed_cols: Vec<usize> = (nelim..n).map(|i| col_order[i]).collect();

    Ok(AptpFactorResult {
        d,
        perm: col_order,
        num_eliminated: nelim,
        delayed_cols,
        stats,
        pivot_log,
    })
}
2948
2949/// Compute NFS×NFS Schur complement from rectangular L storage into contrib_buffer.
2950///
2951/// Like `compute_contribution_gemm` but reads L21_NFS from a rectangular m×k
2952/// matrix instead of a square m×m frontal matrix.
2953///
2954/// # Arguments
2955///
2956/// - `l_storage`: Rectangular m×k matrix containing factored L data.
2957/// - `num_fully_summed`: k (number of fully-summed columns).
2958/// - `num_eliminated`: ne (columns successfully eliminated).
2959/// - `m`: Total front size (rows in l_storage).
2960/// - `d`: Block diagonal from factorization.
2961/// - `contrib_buffer`: Output NFS×NFS Schur complement (lower triangle).
2962/// - `ld_workspace`: Reusable W = L·D buffer.
2963/// - `par`: Parallelism control.
2964#[allow(clippy::too_many_arguments)]
2965pub(super) fn compute_contribution_gemm_rect(
2966    l_storage: &Mat<f64>,
2967    num_fully_summed: usize,
2968    num_eliminated: usize,
2969    m: usize,
2970    d: &MixedDiagonal,
2971    contrib_buffer: &mut Mat<f64>,
2972    ld_workspace: &mut Mat<f64>,
2973    par: Par,
2974) {
2975    let p = num_fully_summed;
2976    let ne = num_eliminated;
2977    let nfs = m - p;
2978
2979    if nfs == 0 || ne == 0 {
2980        return;
2981    }
2982
2983    // L21_NFS = l_storage[p..m, 0..ne]
2984    let l21_nfs = l_storage.as_ref().submatrix(p, 0, nfs, ne);
2985
2986    // Use caller-provided workspace
2987    if ld_workspace.nrows() < nfs || ld_workspace.ncols() < ne {
2988        *ld_workspace = Mat::zeros(nfs.max(ld_workspace.nrows()), ne.max(ld_workspace.ncols()));
2989    }
2990    let mut w = ld_workspace.as_mut().submatrix_mut(0, 0, nfs, ne);
2991    w.fill(0.0);
2992    compute_ld_into(l21_nfs, d, ne, w.rb_mut());
2993
2994    // Symmetric rank-ne update: contrib -= W * L21_NFS^T (lower triangle)
2995    tri_matmul::matmul_with_conj(
2996        contrib_buffer.as_mut().submatrix_mut(0, 0, nfs, nfs),
2997        BlockStructure::TriangularLower,
2998        Accum::Add,
2999        w.as_ref(),
3000        BlockStructure::Rectangular,
3001        Conj::No,
3002        l21_nfs.transpose(),
3003        BlockStructure::Rectangular,
3004        Conj::No,
3005        -1.0,
3006        par,
3007    );
3008}
3009
/// Extract front factors from rectangular m×k L storage.
///
/// Adapts `extract_front_factors` for the case where the factored data lives
/// in a rectangular m×k matrix instead of a square m×m frontal matrix.
/// The L11, D11, L21 extraction logic is identical; only the source layout differs.
///
/// # Arguments
///
/// - `l_storage`: Rectangular m×k matrix holding the factored data in place.
/// - `m`: Total front size (rows of `l_storage`).
/// - `k`: Number of fully-summed columns (columns of `l_storage`).
/// - `frontal_row_indices`: Global row indices for the front's m rows.
/// - `result`: Factorization output supplying D, the permutation, and the
///   eliminated-column count.
pub(super) fn extract_front_factors_rect(
    l_storage: &Mat<f64>,
    m: usize,
    k: usize,
    frontal_row_indices: &[usize],
    result: &AptpFactorResult,
) -> super::numeric::FrontFactors {
    let ne = result.num_eliminated;

    // Extract L11 (ne × ne): unit lower-triangular, built column by column
    // following the D pivot structure.
    let l11 = if ne > 0 {
        let mut l = Mat::zeros(ne, ne);
        let mut col = 0;
        while col < ne {
            l[(col, col)] = 1.0;
            match result.d.pivot_type(col) {
                PivotType::OneByOne => {
                    // Copy the strictly-below-diagonal L entries of this column.
                    let n_entries = ne - (col + 1);
                    if n_entries > 0 {
                        let src = &l_storage.col_as_slice(col)[col + 1..ne];
                        l.col_as_slice_mut(col)[col + 1..ne].copy_from_slice(src);
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    // Leading column of a 2x2 pair: copy both columns' L
                    // entries below the 2x2 block and advance past the pair.
                    l[(col + 1, col + 1)] = 1.0;
                    let n_entries = ne - (col + 2);
                    if n_entries > 0 {
                        let src0 = &l_storage.col_as_slice(col)[col + 2..ne];
                        l.col_as_slice_mut(col)[col + 2..ne].copy_from_slice(src0);
                        let src1 = &l_storage.col_as_slice(col + 1)[col + 2..ne];
                        l.col_as_slice_mut(col + 1)[col + 2..ne].copy_from_slice(src1);
                    }
                    col += 2;
                }
                PivotType::TwoByTwo { .. } => {
                    // Trailing column of a 2x2 pair. Not expected to be
                    // reached (the leading column advances `col` by 2), but
                    // kept as a safe fallthrough.
                    col += 1;
                }
                PivotType::Delayed => {
                    unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
                }
            }
        }
        l
    } else {
        Mat::zeros(0, 0)
    };

    // Build truncated D11 restricted to the eliminated columns 0..ne.
    let mut d11 = MixedDiagonal::new(ne);
    let mut col = 0;
    while col < ne {
        match result.d.pivot_type(col) {
            PivotType::OneByOne => {
                d11.set_1x1(col, result.d.diagonal_1x1(col));
                col += 1;
            }
            PivotType::TwoByTwo { partner: _ } => {
                let block = result.d.diagonal_2x2(col);
                d11.set_2x2(Block2x2 {
                    first_col: col,
                    a: block.a,
                    b: block.b,
                    c: block.c,
                });
                col += 2;
            }
            PivotType::Delayed => {
                unreachable!("unexpected Delayed pivot at col {} in 0..ne", col);
            }
        }
    }

    // Extract L21 (r × ne) where r = m - ne: rows ne..m of each eliminated
    // column, copied verbatim from the rectangular storage.
    let r = m - ne;
    let l21 = if ne > 0 && r > 0 {
        let mut l = Mat::zeros(r, ne);
        for j in 0..ne {
            let src = &l_storage.col_as_slice(j)[ne..m];
            l.col_as_slice_mut(j)[..r].copy_from_slice(src);
        }
        l
    } else {
        Mat::zeros(r, ne)
    };

    // Local permutation and indices: map permuted local positions through
    // frontal_row_indices to get global indices for the eliminated columns.
    let local_perm = result.perm[..k].to_vec();
    let col_indices: Vec<usize> = local_perm[..ne]
        .iter()
        .map(|&lp| frontal_row_indices[lp])
        .collect();

    // Row indices: permuted uneliminated FS rows first, then the NFS rows
    // (which were never permuted).
    let mut row_indices = Vec::with_capacity(m - ne);
    for &lp in &result.perm[ne..k] {
        row_indices.push(frontal_row_indices[lp]);
    }
    row_indices.extend_from_slice(&frontal_row_indices[k..m]);

    super::numeric::FrontFactors {
        l11,
        d11,
        l21,
        local_perm,
        num_eliminated: ne,
        col_indices,
        row_indices,
    }
}
3124
3125/// Extract contribution block from rectangular m×k L storage.
3126///
3127/// Adapts `extract_contribution` for rectangular layout. The NFS×NFS Schur
3128/// complement is already in `contrib_buffer` (from `compute_contribution_gemm_rect`).
3129/// Delayed column data (if any) is read from `l_storage[ne..k, ne..k]` and
3130/// `l_storage[k..m, ne..k]`.
3131pub(super) fn extract_contribution_rect(
3132    l_storage: &Mat<f64>,
3133    m: usize,
3134    k: usize,
3135    frontal_row_indices: &[usize],
3136    result: &AptpFactorResult,
3137    mut contrib_buffer: Mat<f64>,
3138) -> super::numeric::ContributionBlock {
3139    let ne = result.num_eliminated;
3140    let size = m - ne;
3141    let num_delayed = k - ne;
3142    let nfs = m - k;
3143
3144    if num_delayed > 0 {
3145        let mut data = Mat::zeros(size, size);
3146
3147        // Copy delayed × delayed from l_storage[ne..k, ne..k]
3148        for j in 0..num_delayed {
3149            let col_len = num_delayed - j;
3150            let src = &l_storage.col_as_slice(ne + j)[ne + j..ne + j + col_len];
3151            data.col_as_slice_mut(j)[j..j + col_len].copy_from_slice(src);
3152        }
3153
3154        // Copy NFS × delayed cross-terms from l_storage[k..m, ne..k]
3155        for j in 0..num_delayed {
3156            let src = &l_storage.col_as_slice(ne + j)[k..m];
3157            data.col_as_slice_mut(j)[num_delayed..size].copy_from_slice(src);
3158        }
3159
3160        // Copy NFS × NFS from contrib_buffer
3161        for j in 0..nfs {
3162            let col_len = nfs - j;
3163            let src = &contrib_buffer.col_as_slice(j)[j..j + col_len];
3164            data.col_as_slice_mut(num_delayed + j)[num_delayed + j..size].copy_from_slice(src);
3165        }
3166
3167        contrib_buffer = data;
3168    }
3169
3170    let mut row_indices = Vec::with_capacity(size);
3171    for &lp in &result.perm[ne..k] {
3172        row_indices.push(frontal_row_indices[lp]);
3173    }
3174    row_indices.extend_from_slice(&frontal_row_indices[k..m]);
3175
3176    super::numeric::ContributionBlock {
3177        data: contrib_buffer,
3178        row_indices,
3179        num_delayed,
3180    }
3181}
3182
3183#[cfg(test)]
3184mod tests {
3185    use super::*;
3186    use faer::Mat;
3187
3188    // ---- Test infrastructure ----
3189
3190    /// Reconstruct P^T L D L^T P from factorization components.
3191    fn reconstruct_dense_ldlt(l: &Mat<f64>, d: &MixedDiagonal, perm: &[usize]) -> Mat<f64> {
3192        let n = l.nrows();
3193
3194        // Build D as dense
3195        let mut d_mat = Mat::zeros(n, n);
3196        let nd = d.dimension();
3197        let mut col = 0;
3198        while col < nd {
3199            match d.pivot_type(col) {
3200                PivotType::OneByOne => {
3201                    d_mat[(col, col)] = d.diagonal_1x1(col);
3202                    col += 1;
3203                }
3204                PivotType::TwoByTwo { partner } if partner > col => {
3205                    let block = d.diagonal_2x2(col);
3206                    d_mat[(col, col)] = block.a;
3207                    d_mat[(col, col + 1)] = block.b;
3208                    d_mat[(col + 1, col)] = block.b;
3209                    d_mat[(col + 1, col + 1)] = block.c;
3210                    col += 2;
3211                }
3212                _ => {
3213                    col += 1;
3214                }
3215            }
3216        }
3217
3218        // L * D
3219        let mut w = Mat::zeros(n, n);
3220        for i in 0..n {
3221            for j in 0..n {
3222                let mut sum = 0.0;
3223                for k in 0..n {
3224                    sum += l[(i, k)] * d_mat[(k, j)];
3225                }
3226                w[(i, j)] = sum;
3227            }
3228        }
3229
3230        // W * L^T
3231        let mut ldlt = Mat::zeros(n, n);
3232        for i in 0..n {
3233            for j in 0..n {
3234                let mut sum = 0.0;
3235                for k in 0..n {
3236                    sum += w[(i, k)] * l[(j, k)];
3237                }
3238                ldlt[(i, j)] = sum;
3239            }
3240        }
3241
3242        // Apply permutation: reconstructed[perm[i], perm[j]] = ldlt[i, j]
3243        let mut result = Mat::zeros(n, n);
3244        for i in 0..n {
3245            for j in 0..n {
3246                result[(perm[i], perm[j])] = ldlt[(i, j)];
3247            }
3248        }
3249
3250        result
3251    }
3252
3253    /// Compute ||A - P^T L D L^T P|| / ||A||
3254    fn dense_reconstruction_error(
3255        original: &Mat<f64>,
3256        l: &Mat<f64>,
3257        d: &MixedDiagonal,
3258        perm: &[usize],
3259    ) -> f64 {
3260        let reconstructed = reconstruct_dense_ldlt(l, d, perm);
3261        let n = original.nrows();
3262
3263        let mut diff_norm_sq = 0.0_f64;
3264        let mut orig_norm_sq = 0.0_f64;
3265
3266        for i in 0..n {
3267            for j in 0..n {
3268                let diff = original[(i, j)] - reconstructed[(i, j)];
3269                diff_norm_sq += diff * diff;
3270                orig_norm_sq += original[(i, j)] * original[(i, j)];
3271            }
3272        }
3273
3274        if orig_norm_sq == 0.0 {
3275            return diff_norm_sq.sqrt();
3276        }
3277        (diff_norm_sq / orig_norm_sq).sqrt()
3278    }
3279
3280    fn symmetric_matrix(n: usize, f: impl Fn(usize, usize) -> f64) -> Mat<f64> {
3281        Mat::from_fn(n, n, |i, j| if i >= j { f(i, j) } else { f(j, i) })
3282    }
3283
3284    // ---- Infrastructure test ----
3285
3286    #[test]
3287    fn test_reconstruction_trivial_identity() {
3288        let l = Mat::identity(2, 2);
3289        let mut d = MixedDiagonal::new(2);
3290        d.set_1x1(0, 1.0);
3291        d.set_1x1(1, 1.0);
3292        let perm = vec![0, 1];
3293
3294        let result = reconstruct_dense_ldlt(&l, &d, &perm);
3295        for i in 0..2 {
3296            for j in 0..2 {
3297                let expected = if i == j { 1.0 } else { 0.0 };
3298                assert!(
3299                    (result[(i, j)] - expected).abs() < 1e-14,
3300                    "({},{}) = {}, expected {}",
3301                    i,
3302                    j,
3303                    result[(i, j)],
3304                    expected
3305                );
3306            }
3307        }
3308    }
3309
3310    // ---- Complete Pivoting Tests (Algorithm 4.1) ----
3311
3312    /// Helper: run complete_pivoting_factor and extract L for reconstruction testing.
3313    fn complete_pivoting_factor_and_extract(
3314        a: &Mat<f64>,
3315    ) -> (Mat<f64>, MixedDiagonal, Vec<usize>, AptpStatistics) {
3316        let mut a_copy = a.clone();
3317        let result = complete_pivoting_factor(a_copy.as_mut(), 1e-20);
3318        let l = extract_l(a_copy.as_ref(), &result.d, result.num_eliminated);
3319        (l, result.d, result.perm, result.stats)
3320    }
3321
3322    #[test]
3323    fn test_cp_identity() {
3324        // 3×3 identity → D=[1,1,1], no permutation
3325        let a = Mat::identity(3, 3);
3326        let mut a_copy = a.clone();
3327        let result = complete_pivoting_factor(a_copy.as_mut(), 1e-20);
3328
3329        assert_eq!(result.num_eliminated, 3);
3330        assert_eq!(result.stats.num_1x1, 3);
3331        assert_eq!(result.stats.num_2x2, 0);
3332        assert_eq!(result.stats.num_delayed, 0);
3333
3334        // D should be [1, 1, 1]
3335        for i in 0..3 {
3336            assert!((result.d.diagonal_1x1(i) - 1.0).abs() < 1e-14);
3337        }
3338
3339        // No off-diagonal L entries
3340        assert!(result.stats.max_l_entry < 1e-14);
3341    }
3342
3343    #[test]
3344    fn test_cp_diagonal_pivot_ordering() {
3345        // 3×3 diagonal with known pivot ordering (largest diagonal first)
3346        let a = symmetric_matrix(3, |i, j| if i == j { [2.0, 5.0, 3.0][i] } else { 0.0 });
3347        let mut a_copy = a.clone();
3348        let result = complete_pivoting_factor(a_copy.as_mut(), 1e-20);
3349
3350        assert_eq!(result.num_eliminated, 3);
3351        assert_eq!(result.stats.num_1x1, 3);
3352
3353        // First pivot should be 5.0 (col 1), then 3.0 (col 2), then 2.0 (col 0)
3354        assert!(
3355            (result.d.diagonal_1x1(0) - 5.0).abs() < 1e-14,
3356            "first pivot should be 5.0, got {}",
3357            result.d.diagonal_1x1(0)
3358        );
3359        assert!(
3360            (result.d.diagonal_1x1(1) - 3.0).abs() < 1e-14,
3361            "second pivot should be 3.0, got {}",
3362            result.d.diagonal_1x1(1)
3363        );
3364        assert!(
3365            (result.d.diagonal_1x1(2) - 2.0).abs() < 1e-14,
3366            "third pivot should be 2.0, got {}",
3367            result.d.diagonal_1x1(2)
3368        );
3369    }
3370
3371    #[test]
3372    fn test_cp_2x2_pivot() {
3373        // 4×4 matrix requiring 2×2 pivot (off-diagonal maximum)
3374        // Designed so max entry is off-diagonal and Δ condition passes
3375        let a = symmetric_matrix(4, |i, j| {
3376            let vals = [
3377                [0.1, 10.0, 0.0, 0.0],
3378                [10.0, 0.1, 0.0, 0.0],
3379                [0.0, 0.0, 5.0, 1.0],
3380                [0.0, 0.0, 1.0, 3.0],
3381            ];
3382            vals[i][j]
3383        });
3384
3385        let (l, d, perm, stats) = complete_pivoting_factor_and_extract(&a);
3386
3387        assert!(
3388            stats.num_2x2 >= 1,
3389            "expected at least one 2×2 pivot, got {}",
3390            stats.num_2x2
3391        );
3392        // L entries bounded by 4 (complete pivoting guarantee)
3393        assert!(
3394            stats.max_l_entry <= COMPLETE_PIVOTING_GROWTH_BOUND + 1e-10,
3395            "L entries should be bounded by 4, got {}",
3396            stats.max_l_entry
3397        );
3398
3399        // Reconstruction
3400        let error = dense_reconstruction_error(&a, &l, &d, &perm);
3401        assert!(error < 1e-12, "reconstruction error {:.2e} >= 1e-12", error);
3402    }
3403
3404    #[test]
3405    fn test_cp_failed_2x2_fallback() {
3406        // 4×4 matrix where 2×2 Δ test fails → fallback to 1×1 on max diagonal
3407        // Need |Δ| < 0.5 * |a_tm|^2
3408        // a_mm * a_tt - a_tm^2 should be small relative to a_tm^2
3409        // Let a_mm = 1.0, a_tt = 1.0, a_tm = 2.0
3410        // Δ = 1*1 - 4 = -3, |Δ| = 3, 0.5*a_tm^2 = 2 → |Δ| > 0.5*a_tm^2
3411        // Need: a_mm ≈ a_tt and both ≈ a_tm
3412        // Let a_mm = 0.5, a_tt = 0.5, a_tm = 1.0
3413        // Δ = 0.25 - 1 = -0.75, |Δ| = 0.75, 0.5*a_tm^2 = 0.5 → 0.75 > 0.5, passes!
3414        // Let a_mm = 0.1, a_tt = 0.1, a_tm = 1.0
3415        // Δ = 0.01 - 1 = -0.99, |Δ| = 0.99, 0.5*a_tm^2 = 0.5 → 0.99 > 0.5, passes!
3416        // Let a_mm = 0.01, a_tt = 0.01, a_tm = 1.0
3417        // Δ = 0.0001 - 1 = -0.9999, |Δ| = 0.9999, 0.5*1 = 0.5 → passes!
3418        // Hmm, hard to fail. The condition is |det| >= 0.5*a_tm^2.
3419        // For failure: need det ≈ 0. This means a_mm*a_tt ≈ a_tm^2.
3420        // Let a_mm = 2.0, a_tt = 2.0, a_tm = 2.0
3421        // Δ = 4 - 4 = 0, |Δ| = 0 < 0.5*4 = 2 → FAILS! Good.
3422        let a = symmetric_matrix(4, |i, j| {
3423            let vals = [
3424                [2.0, 2.0, 0.1, 0.1],
3425                [2.0, 2.0, 0.1, 0.1],
3426                [0.1, 0.1, 5.0, 0.0],
3427                [0.1, 0.1, 0.0, 3.0],
3428            ];
3429            vals[i][j]
3430        });
3431
3432        let (l, d, perm, stats) = complete_pivoting_factor_and_extract(&a);
3433
3434        // With Δ ≈ 0, should fall back to 1×1 on max(|a_mm|, |a_tt|)
3435        // L entries bounded by √2 < 4 in this case (paper's bound)
3436        assert!(
3437            stats.max_l_entry <= COMPLETE_PIVOTING_GROWTH_BOUND + 1e-10,
3438            "L entries should be bounded by 4"
3439        );
3440
3441        // Reconstruction
3442        let error = dense_reconstruction_error(&a, &l, &d, &perm);
3443        assert!(error < 1e-12, "reconstruction error {:.2e} >= 1e-12", error);
3444    }
3445
3446    #[test]
3447    fn test_cp_singular_block() {
3448        // singular/near-singular block → zero pivot handling
3449        let mut a = Mat::zeros(3, 3);
3450        a[(0, 0)] = 1e-25;
3451        a[(1, 1)] = 1e-25;
3452        a[(2, 2)] = 1e-25;
3453
3454        let result = complete_pivoting_factor(a.as_mut(), 1e-20);
3455
3456        // All entries below small → all should be zero pivots
3457        assert_eq!(
3458            result.num_eliminated, 0,
3459            "near-singular block should have 0 eliminations"
3460        );
3461        assert_eq!(result.stats.num_delayed, 3);
3462    }
3463
3464    #[test]
3465    fn test_cp_reconstruction_random() {
3466        // reconstruction on random symmetric indefinite matrices
3467        // Use deterministic seed for reproducibility
3468        let sizes = [8, 16, 32];
3469        for &n in &sizes {
3470            let a = symmetric_matrix(n, |i, j| {
3471                // Deterministic pseudo-random indefinite matrix
3472                let seed = (i * 1000 + j * 7 + 13) as f64;
3473                let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
3474                if i == j {
3475                    val * 10.0 // diagonal dominance but indefinite
3476                } else {
3477                    val
3478                }
3479            });
3480
3481            let (l, d, perm, stats) = complete_pivoting_factor_and_extract(&a);
3482
3483            // Should eliminate all columns (no singular blocks)
3484            assert_eq!(
3485                stats.num_1x1 + 2 * stats.num_2x2 + stats.num_delayed,
3486                n,
3487                "statistics invariant for {}x{}",
3488                n,
3489                n
3490            );
3491
3492            if stats.num_delayed == 0 {
3493                let error = dense_reconstruction_error(&a, &l, &d, &perm);
3494                assert!(
3495                    error < 1e-12,
3496                    "complete pivoting {}x{}: reconstruction error {:.2e} >= 1e-12",
3497                    n,
3498                    n,
3499                    error
3500                );
3501            }
3502
3503            // L entries bounded by 4 (Algorithm 4.1 guarantee)
3504            assert!(
3505                stats.max_l_entry <= COMPLETE_PIVOTING_GROWTH_BOUND + 1e-10,
3506                "complete pivoting {}x{}: max_l_entry {:.2e} > 4",
3507                n,
3508                n,
3509                stats.max_l_entry
3510            );
3511        }
3512    }
3513
3514    #[test]
3515    fn test_1x1_trivial_diagonal() {
3516        let a = symmetric_matrix(2, |i, j| if i == j { [4.0, 9.0][i] } else { 0.0 });
3517
3518        let opts = AptpOptions {
3519            fallback: AptpFallback::Delay,
3520            ..AptpOptions::default()
3521        };
3522        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3523
3524        // All columns eliminated, no delays
3525        assert_eq!(result.stats.num_1x1 + 2 * result.stats.num_2x2, 2);
3526        assert_eq!(result.stats.num_delayed, 0);
3527        assert!(result.delayed_cols.is_empty());
3528
3529        let error =
3530            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
3531        assert!(error < 1e-12, "reconstruction error {:.2e} >= 1e-12", error);
3532    }
3533
3534    #[test]
3535    fn test_1x1_positive_definite_3x3() {
3536        let a = symmetric_matrix(3, |i, j| {
3537            let vals = [[4.0, 2.0, 1.0], [2.0, 5.0, 3.0], [1.0, 3.0, 6.0]];
3538            vals[i][j]
3539        });
3540
3541        let opts = AptpOptions::default();
3542        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3543
3544        // All columns eliminated, no delays
3545        assert_eq!(result.stats.num_1x1 + 2 * result.stats.num_2x2, 3);
3546        assert_eq!(result.stats.num_delayed, 0);
3547
3548        let error =
3549            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
3550        assert!(error < 1e-12, "reconstruction error {:.2e} >= 1e-12", error);
3551    }
3552
3553    #[test]
3554    fn test_all_delayed_zero_matrix() {
3555        let n = 4;
3556        let a = Mat::zeros(n, n);
3557
3558        let opts = AptpOptions {
3559            failed_pivot_method: FailedPivotMethod::Pass,
3560            ..AptpOptions::default()
3561        };
3562        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3563
3564        // TPP treats zero columns as zero pivots (1x1 with D=0), not delays.
3565        // With FailedPivotMethod::Pass, TPP is still used as primary for small
3566        // matrices, and it handles zero columns by recording them as zero pivots.
3567        // Total columns accounted for:
3568        let total = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
3569        assert_eq!(total, n, "total pivots + delays should equal n");
3570    }
3571
3572    #[test]
3573    fn test_1x1_singularity_detection() {
3574        let a = symmetric_matrix(3, |i, j| if i == j { [4.0, 1e-25, 9.0][i] } else { 0.0 });
3575
3576        let opts = AptpOptions {
3577            fallback: AptpFallback::Delay,
3578            failed_pivot_method: FailedPivotMethod::Pass,
3579            ..AptpOptions::default()
3580        };
3581        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3582
3583        // The near-zero entry should be detected (as zero pivot or delay).
3584        // TPP records zero columns as zero pivots (1x1 with D=0), while APTP
3585        // delays them. Either way, the other 2 columns should be eliminated.
3586        let eliminated = result.stats.num_1x1 + 2 * result.stats.num_2x2;
3587        assert!(
3588            eliminated >= 2,
3589            "should eliminate at least 2 columns, got {}",
3590            eliminated
3591        );
3592        let total = eliminated + result.stats.num_delayed;
3593        assert_eq!(total, 3, "total pivots + delays should equal n");
3594    }
3595
3596    #[test]
3597    fn test_stability_bound_enforced() {
3598        // With complete pivoting, this matrix is handled via 2×2 pivot
3599        // (max off-diagonal entry 1.0 triggers 2×2 with Δ condition).
3600        // Complete pivoting bounds L entries by 4 (u=0.25), so no delays.
3601        let a = symmetric_matrix(2, |i, j| {
3602            let vals = [[1e-4, 1.0], [1.0, 1.0]];
3603            vals[i][j]
3604        });
3605
3606        let opts = AptpOptions {
3607            fallback: AptpFallback::Delay,
3608            ..AptpOptions::default()
3609        };
3610        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3611
3612        // Complete pivoting eliminates all columns (no delays)
3613        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
3614        assert_eq!(sum, 2);
3615        // Verify correctness via reconstruction
3616        let error =
3617            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
3618        assert!(error < 1e-12, "reconstruction error {:.2e} >= 1e-12", error);
3619    }
3620
3621    #[test]
3622    fn test_1x1_matrix() {
3623        let a = Mat::from_fn(1, 1, |_, _| 5.0);
3624
3625        let opts = AptpOptions::default();
3626        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3627
3628        assert_eq!(result.stats.num_1x1, 1);
3629        assert_eq!(result.stats.num_delayed, 0);
3630
3631        let error =
3632            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
3633        assert!(error < 1e-14, "reconstruction error {:.2e}", error);
3634    }
3635
3636    #[test]
3637    fn test_statistics_sum_invariant() {
3638        let a = symmetric_matrix(5, |i, j| {
3639            let vals = [
3640                [10.0, 1.0, 0.0, 0.0, 0.0],
3641                [1.0, 20.0, 2.0, 0.0, 0.0],
3642                [0.0, 2.0, 30.0, 3.0, 0.0],
3643                [0.0, 0.0, 3.0, 40.0, 4.0],
3644                [0.0, 0.0, 0.0, 4.0, 50.0],
3645            ];
3646            vals[i][j]
3647        });
3648
3649        let opts = AptpOptions::default();
3650        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3651
3652        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
3653        assert_eq!(sum, 5, "statistics sum {} != n=5", sum);
3654    }
3655
3656    #[test]
3657    fn test_2x2_pivot_known_indefinite() {
3658        let a = symmetric_matrix(4, |i, j| {
3659            let vals = [
3660                [0.01, 10.0, 0.0, 0.0],
3661                [10.0, 0.01, 0.0, 0.0],
3662                [0.0, 0.0, 5.0, 1.0],
3663                [0.0, 0.0, 1.0, 3.0],
3664            ];
3665            vals[i][j]
3666        });
3667
3668        let opts = AptpOptions {
3669            fallback: AptpFallback::BunchKaufman,
3670            ..AptpOptions::default()
3671        };
3672        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3673
3674        assert!(
3675            result.stats.num_2x2 >= 1,
3676            "expected 2x2 pivot, got num_2x2={}",
3677            result.stats.num_2x2
3678        );
3679
3680        let error =
3681            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
3682        assert!(error < 1e-12, "reconstruction error {:.2e} >= 1e-12", error);
3683    }
3684
3685    #[test]
3686    fn test_2x2_stability_test() {
3687        let a_good = symmetric_matrix(2, |i, j| {
3688            let vals = [[1.0, 5.0], [5.0, 1.0]];
3689            vals[i][j]
3690        });
3691        let opts = AptpOptions {
3692            fallback: AptpFallback::BunchKaufman,
3693            ..AptpOptions::default()
3694        };
3695        let result = aptp_factor(a_good.as_ref(), &opts).unwrap();
3696        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
3697        assert_eq!(sum, 2);
3698    }
3699
3700    #[test]
3701    fn test_bk_vs_delay_fallback() {
3702        let a = symmetric_matrix(4, |i, j| {
3703            let vals = [
3704                [0.01, 10.0, 0.0, 0.0],
3705                [10.0, 0.01, 0.0, 0.0],
3706                [0.0, 0.0, 5.0, 1.0],
3707                [0.0, 0.0, 1.0, 3.0],
3708            ];
3709            vals[i][j]
3710        });
3711
3712        let bk_opts = AptpOptions {
3713            fallback: AptpFallback::BunchKaufman,
3714            ..AptpOptions::default()
3715        };
3716        let delay_opts = AptpOptions {
3717            fallback: AptpFallback::Delay,
3718            ..AptpOptions::default()
3719        };
3720
3721        let bk_result = aptp_factor(a.as_ref(), &bk_opts).unwrap();
3722        let delay_result = aptp_factor(a.as_ref(), &delay_opts).unwrap();
3723
3724        assert!(
3725            bk_result.stats.num_delayed <= delay_result.stats.num_delayed,
3726            "BK delayed {} > Delay delayed {}",
3727            bk_result.stats.num_delayed,
3728            delay_result.stats.num_delayed
3729        );
3730
3731        if bk_result.stats.num_delayed == 0 {
3732            let error = dense_reconstruction_error(
3733                &a,
3734                &bk_result.l,
3735                &bk_result.d,
3736                bk_result.perm.as_ref().arrays().0,
3737            );
3738            assert!(error < 1e-12, "BK reconstruction error {:.2e}", error);
3739        }
3740    }
3741
3742    #[test]
3743    fn test_strict_threshold() {
3744        let a = symmetric_matrix(4, |i, j| {
3745            let vals = [
3746                [4.0, 2.0, 1.0, 0.5],
3747                [2.0, 5.0, 2.0, 1.0],
3748                [1.0, 2.0, 6.0, 2.0],
3749                [0.5, 1.0, 2.0, 7.0],
3750            ];
3751            vals[i][j]
3752        });
3753
3754        let loose = AptpOptions {
3755            threshold: 0.01,
3756            fallback: AptpFallback::Delay,
3757            ..AptpOptions::default()
3758        };
3759        let strict = AptpOptions {
3760            threshold: 0.5,
3761            fallback: AptpFallback::Delay,
3762            ..AptpOptions::default()
3763        };
3764
3765        let loose_result = aptp_factor(a.as_ref(), &loose).unwrap();
3766        let strict_result = aptp_factor(a.as_ref(), &strict).unwrap();
3767
3768        assert!(
3769            strict_result.stats.num_delayed >= loose_result.stats.num_delayed,
3770            "strict delayed {} < loose delayed {}",
3771            strict_result.stats.num_delayed,
3772            loose_result.stats.num_delayed,
3773        );
3774    }
3775
3776    #[test]
3777    fn test_permutation_valid() {
3778        let a = symmetric_matrix(4, |i, j| {
3779            let vals = [
3780                [0.01, 10.0, 0.0, 0.0],
3781                [10.0, 0.01, 0.0, 0.0],
3782                [0.0, 0.0, 5.0, 1.0],
3783                [0.0, 0.0, 1.0, 3.0],
3784            ];
3785            vals[i][j]
3786        });
3787
3788        let opts = AptpOptions {
3789            fallback: AptpFallback::BunchKaufman,
3790            ..AptpOptions::default()
3791        };
3792        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3793
3794        let (fwd, inv) = result.perm.as_ref().arrays();
3795        let n = fwd.len();
3796        assert_eq!(n, 4);
3797        let mut seen = vec![false; n];
3798        for &v in fwd {
3799            assert!(v < n, "perm value {} >= n={}", v, n);
3800            assert!(!seen[v], "duplicate perm value {}", v);
3801            seen[v] = true;
3802        }
3803        for i in 0..n {
3804            assert_eq!(inv[fwd[i]], i);
3805            assert_eq!(fwd[inv[i]], i);
3806        }
3807    }
3808
3809    #[test]
3810    fn test_pd_statistics() {
3811        let a = symmetric_matrix(4, |i, j| {
3812            let vals = [
3813                [10.0, 1.0, 0.0, 0.0],
3814                [1.0, 20.0, 2.0, 0.0],
3815                [0.0, 2.0, 30.0, 3.0],
3816                [0.0, 0.0, 3.0, 40.0],
3817            ];
3818            vals[i][j]
3819        });
3820
3821        let opts = AptpOptions::default();
3822        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3823
3824        // All columns eliminated, no delays
3825        assert_eq!(result.stats.num_1x1 + 2 * result.stats.num_2x2, 4);
3826        assert_eq!(result.stats.num_delayed, 0);
3827        assert!(result.stats.max_l_entry < 1.0 / opts.threshold);
3828    }
3829
3830    #[test]
3831    fn test_max_l_entry_accuracy() {
3832        let a = symmetric_matrix(4, |i, j| {
3833            let vals = [
3834                [4.0, 2.0, 1.0, 0.5],
3835                [2.0, 5.0, 2.0, 1.0],
3836                [1.0, 2.0, 6.0, 2.0],
3837                [0.5, 1.0, 2.0, 7.0],
3838            ];
3839            vals[i][j]
3840        });
3841
3842        let opts = AptpOptions::default();
3843        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3844
3845        let n = result.l.nrows();
3846        let mut actual_max = 0.0_f64;
3847        for j in 0..n {
3848            for i in (j + 1)..n {
3849                let val = result.l[(i, j)].abs();
3850                if val > actual_max {
3851                    actual_max = val;
3852                }
3853            }
3854        }
3855
3856        assert!(
3857            (result.stats.max_l_entry - actual_max).abs() < 1e-14,
3858            "stats.max_l_entry={:.6e}, actual={:.6e}",
3859            result.stats.max_l_entry,
3860            actual_max
3861        );
3862    }
3863
3864    #[test]
3865    fn test_pivot_log_completeness() {
3866        let a = symmetric_matrix(4, |i, j| {
3867            let vals = [
3868                [4.0, 2.0, 1.0, 0.5],
3869                [2.0, 5.0, 2.0, 1.0],
3870                [1.0, 2.0, 6.0, 2.0],
3871                [0.5, 1.0, 2.0, 7.0],
3872            ];
3873            vals[i][j]
3874        });
3875
3876        let opts = AptpOptions::default();
3877        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3878
3879        assert_eq!(result.pivot_log.len(), 4);
3880    }
3881
3882    #[test]
3883    fn test_inertia_from_d() {
3884        let a = symmetric_matrix(5, |i, j| {
3885            if i == j {
3886                [3.0, -2.0, 1.0, -4.0, 5.0][i]
3887            } else {
3888                0.0
3889            }
3890        });
3891
3892        let opts = AptpOptions {
3893            fallback: AptpFallback::Delay,
3894            ..AptpOptions::default()
3895        };
3896        let result = aptp_factor(a.as_ref(), &opts).unwrap();
3897
3898        assert_eq!(result.stats.num_delayed, 0, "should have no delays");
3899
3900        let inertia = result.d.compute_inertia();
3901        assert_eq!(inertia.positive, 3, "expected 3 positive");
3902        assert_eq!(inertia.negative, 2, "expected 2 negative");
3903        assert_eq!(inertia.zero, 0, "expected 0 zero");
3904    }
3905
3906    // ---- Polish tests ----
3907
3908    #[test]
3909    fn test_partial_factorization() {
3910        let n = 8;
3911        let p = 4;
3912        let a = symmetric_matrix(n, |i, j| {
3913            if i == j {
3914                10.0 + i as f64
3915            } else {
3916                1.0 / (1.0 + (i as f64 - j as f64).abs())
3917            }
3918        });
3919
3920        let opts = AptpOptions::default();
3921        let mut a_copy = a.clone();
3922        let result = aptp_factor_in_place(a_copy.as_mut(), p, &opts).unwrap();
3923
3924        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
3925        assert_eq!(sum, p, "statistics should sum to num_fully_summed={}", p);
3926    }
3927
3928    #[test]
3929    fn test_edge_case_extreme_thresholds() {
3930        let a = symmetric_matrix(3, |i, j| {
3931            let vals = [[4.0, 2.0, 1.0], [2.0, 5.0, 2.0], [1.0, 2.0, 6.0]];
3932            vals[i][j]
3933        });
3934
3935        let loose = AptpOptions {
3936            threshold: 0.001,
3937            fallback: AptpFallback::Delay,
3938            ..AptpOptions::default()
3939        };
3940        let result_loose = aptp_factor(a.as_ref(), &loose).unwrap();
3941        assert_eq!(result_loose.stats.num_delayed, 0);
3942
3943        let strict = AptpOptions {
3944            threshold: 1.0,
3945            fallback: AptpFallback::Delay,
3946            ..AptpOptions::default()
3947        };
3948        let result_strict = aptp_factor(a.as_ref(), &strict).unwrap();
3949        let sum = result_strict.stats.num_1x1
3950            + 2 * result_strict.stats.num_2x2
3951            + result_strict.stats.num_delayed;
3952        assert_eq!(sum, 3);
3953    }
3954
3955    #[test]
3956    fn test_both_fallback_strategies_valid() {
3957        let matrices = [
3958            symmetric_matrix(3, |i, j| {
3959                let vals = [[1.0, 2.0, 0.0], [2.0, -1.0, 1.0], [0.0, 1.0, 3.0]];
3960                vals[i][j]
3961            }),
3962            symmetric_matrix(4, |i, j| {
3963                let vals = [
3964                    [0.01, 10.0, 0.0, 0.0],
3965                    [10.0, 0.01, 0.0, 0.0],
3966                    [0.0, 0.0, 5.0, 1.0],
3967                    [0.0, 0.0, 1.0, 3.0],
3968                ];
3969                vals[i][j]
3970            }),
3971        ];
3972
3973        for (idx, a) in matrices.iter().enumerate() {
3974            for fallback in [AptpFallback::BunchKaufman, AptpFallback::Delay] {
3975                let opts = AptpOptions {
3976                    fallback,
3977                    ..AptpOptions::default()
3978                };
3979                let result = aptp_factor(a.as_ref(), &opts).unwrap();
3980
3981                if result.stats.num_delayed == 0 {
3982                    let error = dense_reconstruction_error(
3983                        a,
3984                        &result.l,
3985                        &result.d,
3986                        result.perm.as_ref().arrays().0,
3987                    );
3988                    assert!(
3989                        error < 1e-12,
3990                        "matrix {} {:?}: error {:.2e}",
3991                        idx,
3992                        fallback,
3993                        error
3994                    );
3995                }
3996
3997                let sum =
3998                    result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
3999                let n = a.nrows();
4000                assert_eq!(sum, n, "statistics invariant broken for matrix {}", idx);
4001            }
4002        }
4003    }
4004
4005    // ---- Input validation tests ----
4006
4007    #[test]
4008    fn test_invalid_non_square() {
4009        let a = Mat::zeros(3, 4);
4010        let opts = AptpOptions::default();
4011        let result = aptp_factor(a.as_ref(), &opts);
4012        assert!(matches!(result, Err(SparseError::InvalidInput { .. })));
4013    }
4014
4015    #[test]
4016    fn test_invalid_threshold() {
4017        let a = Mat::zeros(2, 2);
4018        let opts = AptpOptions {
4019            threshold: 0.0,
4020            ..AptpOptions::default()
4021        };
4022        let result = aptp_factor(a.as_ref(), &opts);
4023        assert!(matches!(result, Err(SparseError::InvalidInput { .. })));
4024
4025        let opts2 = AptpOptions {
4026            threshold: 1.5,
4027            ..AptpOptions::default()
4028        };
4029        let result2 = aptp_factor(a.as_ref(), &opts2);
4030        assert!(matches!(result2, Err(SparseError::InvalidInput { .. })));
4031    }
4032
4033    #[test]
4034    fn test_invalid_num_fully_summed_too_large() {
4035        let mut a = Mat::zeros(3, 3);
4036        let opts = AptpOptions::default();
4037        let result = aptp_factor_in_place(a.as_mut(), 5, &opts);
4038        assert!(matches!(result, Err(SparseError::InvalidInput { .. })));
4039    }
4040
4041    // ---- Edge case and regression tests ----
4042
4043    #[test]
4044    fn test_zero_fully_summed() {
4045        let a = symmetric_matrix(3, |i, j| {
4046            let vals = [[4.0, 2.0, 1.0], [2.0, 5.0, 3.0], [1.0, 3.0, 6.0]];
4047            vals[i][j]
4048        });
4049
4050        let opts = AptpOptions::default();
4051        let mut a_copy = a.clone();
4052        let result = aptp_factor_in_place(a_copy.as_mut(), 0, &opts).unwrap();
4053
4054        assert_eq!(result.num_eliminated, 0);
4055        assert_eq!(result.stats.num_1x1, 0);
4056        assert_eq!(result.stats.num_2x2, 0);
4057        assert_eq!(result.stats.num_delayed, 0);
4058        assert!(result.pivot_log.is_empty());
4059    }
4060
4061    #[test]
4062    fn test_2x2_fallback_also_fails() {
4063        // With complete pivoting, the max entry is 12.0 on diagonal (1,1).
4064        // Complete pivoting starts with the largest entry and can handle
4065        // matrices that the old threshold-based approach would delay.
4066        let a = symmetric_matrix(3, |i, j| {
4067            let vals = [[0.001, 0.11, 0.0], [0.11, 12.0, 0.1], [0.0, 0.1, 5.0]];
4068            vals[i][j]
4069        });
4070
4071        let opts = AptpOptions {
4072            fallback: AptpFallback::BunchKaufman,
4073            ..AptpOptions::default()
4074        };
4075        let result = aptp_factor(a.as_ref(), &opts).unwrap();
4076
4077        // Complete pivoting eliminates all columns successfully
4078        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
4079        assert_eq!(sum, 3, "statistics sum {} != n=3", sum);
4080
4081        // Verify correctness via reconstruction
4082        if result.stats.num_delayed == 0 {
4083            let error = dense_reconstruction_error(
4084                &a,
4085                &result.l,
4086                &result.d,
4087                result.perm.as_ref().arrays().0,
4088            );
4089            assert!(error < 1e-12, "reconstruction error {:.2e}", error);
4090        }
4091
4092        // L entries bounded by 4 (complete pivoting guarantee)
4093        assert!(
4094            result.stats.max_l_entry <= COMPLETE_PIVOTING_GROWTH_BOUND + 1e-10,
4095            "max_l_entry {} > 4",
4096            result.stats.max_l_entry
4097        );
4098    }
4099
4100    #[test]
4101    fn test_invalid_small_negative() {
4102        let a = Mat::zeros(2, 2);
4103        let opts = AptpOptions {
4104            small: -1.0,
4105            ..AptpOptions::default()
4106        };
4107        let result = aptp_factor(a.as_ref(), &opts);
4108        assert!(matches!(result, Err(SparseError::InvalidInput { .. })));
4109    }
4110
4111    // ---- Random matrix stress tests ----
4112
4113    #[cfg(feature = "test-util")]
4114    mod stress_tests {
4115        use super::*;
4116        use crate::testing::generators::{RandomMatrixConfig, generate_random_symmetric};
4117        use rand::SeedableRng;
4118        use rand::rngs::StdRng;
4119
4120        /// Generate a dense symmetric PD matrix from the sparse generator.
4121        fn random_dense_pd(n: usize, rng: &mut impl rand::Rng) -> Mat<f64> {
4122            let config = RandomMatrixConfig {
4123                size: n,
4124                target_nnz: n * n / 2,
4125                positive_definite: true,
4126            };
4127            generate_random_symmetric(&config, rng).unwrap().to_dense()
4128        }
4129
4130        /// Generate a dense symmetric indefinite matrix from the sparse generator.
4131        fn random_dense_indefinite(n: usize, rng: &mut impl rand::Rng) -> Mat<f64> {
4132            let config = RandomMatrixConfig {
4133                size: n,
4134                target_nnz: n * n / 2,
4135                positive_definite: false,
4136            };
4137            generate_random_symmetric(&config, rng).unwrap().to_dense()
4138        }
4139
4140        #[test]
4141        fn test_random_pd_matrices() {
4142            let mut rng = StdRng::seed_from_u64(42);
4143            let opts = AptpOptions::default();
4144            let sizes = [5, 10, 20, 50];
4145            let mut total = 0;
4146
4147            for &n in &sizes {
4148                for _ in 0..25 {
4149                    let a = random_dense_pd(n, &mut rng);
4150                    let result = aptp_factor(a.as_ref(), &opts).unwrap();
4151
4152                    assert_eq!(
4153                        result.stats.num_delayed, 0,
4154                        "PD matrix {}x{} should have zero delays",
4155                        n, n
4156                    );
4157                    let total_elim = result.stats.num_1x1 + 2 * result.stats.num_2x2;
4158                    assert_eq!(
4159                        total_elim, n,
4160                        "PD matrix {}x{} should eliminate all columns (1x1={}, 2x2={})",
4161                        n, n, result.stats.num_1x1, result.stats.num_2x2
4162                    );
4163
4164                    let error = dense_reconstruction_error(
4165                        &a,
4166                        &result.l,
4167                        &result.d,
4168                        result.perm.as_ref().arrays().0,
4169                    );
4170                    assert!(
4171                        error < 1e-12,
4172                        "PD {}x{}: reconstruction error {:.2e}",
4173                        n,
4174                        n,
4175                        error
4176                    );
4177                    total += 1;
4178                }
4179            }
4180            assert!(total >= 100, "ran {} PD tests, need >= 100", total);
4181        }
4182
4183        #[test]
4184        fn test_random_indefinite_matrices() {
4185            let mut rng = StdRng::seed_from_u64(123);
4186            let opts = AptpOptions {
4187                fallback: AptpFallback::BunchKaufman,
4188                ..AptpOptions::default()
4189            };
4190            let sizes = [5, 10, 20, 50];
4191            let mut total = 0;
4192
4193            for &n in &sizes {
4194                for _ in 0..25 {
4195                    let a = random_dense_indefinite(n, &mut rng);
4196                    let result = aptp_factor(a.as_ref(), &opts).unwrap();
4197
4198                    // Statistics invariant
4199                    let sum =
4200                        result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
4201                    assert_eq!(sum, n, "stats invariant for {}x{}", n, n);
4202
4203                    // Stability bound
4204                    assert!(
4205                        result.stats.max_l_entry < 1.0 / opts.threshold,
4206                        "stability bound violated for {}x{}",
4207                        n,
4208                        n
4209                    );
4210
4211                    // Reconstruction (only if fully eliminated)
4212                    if result.stats.num_delayed == 0 {
4213                        let error = dense_reconstruction_error(
4214                            &a,
4215                            &result.l,
4216                            &result.d,
4217                            result.perm.as_ref().arrays().0,
4218                        );
4219                        assert!(
4220                            error < 1e-12,
4221                            "indefinite {}x{}: reconstruction error {:.2e}",
4222                            n,
4223                            n,
4224                            error
4225                        );
4226                    }
4227                    total += 1;
4228                }
4229            }
4230            assert!(total >= 100, "ran {} indefinite tests, need >= 100", total);
4231        }
4232    }
4233
4234    // ---- Integration tests with test data ----
4235
4236    #[cfg(feature = "test-util")]
4237    mod integration_tests {
4238        use super::*;
4239        use crate::testing::cases::{TestCaseFilter, load_test_cases};
4240
4241        #[test]
4242        #[ignore] // Requires test-data/ on disk
4243        fn test_hand_constructed_matrices() {
4244            let cases = load_test_cases(&TestCaseFilter::hand_constructed())
4245                .expect("failed to load hand-constructed matrices");
4246            assert_eq!(cases.len(), 15, "expected 15 hand-constructed matrices");
4247
4248            let opts = AptpOptions::default();
4249            let mut passed = 0;
4250
4251            for case in &cases {
4252                let dense = case.matrix.to_dense();
4253                let n = dense.nrows();
4254
4255                // Skip very large matrices (dense APTP is O(n^3))
4256                if n > 500 {
4257                    continue;
4258                }
4259
4260                let result = aptp_factor(dense.as_ref(), &opts).unwrap();
4261
4262                let sum =
4263                    result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
4264                assert_eq!(sum, n, "{}: stats invariant failed", case.name);
4265
4266                if result.stats.num_delayed == 0 {
4267                    let error = dense_reconstruction_error(
4268                        &dense,
4269                        &result.l,
4270                        &result.d,
4271                        result.perm.as_ref().arrays().0,
4272                    );
4273                    assert!(
4274                        error < 1e-12,
4275                        "{}: reconstruction error {:.2e}",
4276                        case.name,
4277                        error
4278                    );
4279                }
4280                passed += 1;
4281            }
4282            assert!(passed >= 10, "only {} hand-constructed passed", passed);
4283        }
4284
4285        #[test]
4286        #[ignore] // Requires SuiteSparse CI-subset on disk
4287        fn test_suitesparse_ci_dense() {
4288            let cases = load_test_cases(&TestCaseFilter::ci_subset())
4289                .expect("failed to load CI-subset matrices");
4290
4291            let opts = AptpOptions::default();
4292            let mut tested = 0;
4293
4294            for case in &cases {
4295                let n = case.matrix.nrows();
4296
4297                // Only test matrices small enough for dense factorization.
4298                // Dense O(n^2) memory + O(n^3) time: cap at 200 to avoid OOM.
4299                if n > 200 {
4300                    continue;
4301                }
4302
4303                let dense = case.matrix.to_dense();
4304
4305                let result = aptp_factor(dense.as_ref(), &opts).unwrap();
4306
4307                let sum =
4308                    result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
4309                assert_eq!(sum, n, "{}: stats invariant failed", case.name);
4310
4311                if result.stats.num_delayed == 0 {
4312                    let error = dense_reconstruction_error(
4313                        &dense,
4314                        &result.l,
4315                        &result.d,
4316                        result.perm.as_ref().arrays().0,
4317                    );
4318                    assert!(
4319                        error < 1e-12,
4320                        "{}: reconstruction error {:.2e}",
4321                        case.name,
4322                        error
4323                    );
4324                }
4325                tested += 1;
4326            }
4327            assert!(
4328                tested > 0,
4329                "no CI-subset matrices small enough for dense test"
4330            );
4331        }
4332    }
4333
4334    // ---- factor_inner tests ----
4335
4336    #[test]
4337    fn test_factor_inner_reconstruction_moderate() {
4338        // factor_inner on matrices of moderate size (128, 256)
4339        // verifying reconstruction < 1e-12.
4340        let sizes = [64, 128, 256];
4341        let opts = AptpOptions::default();
4342
4343        for &n in &sizes {
4344            let a = symmetric_matrix(n, |i, j| {
4345                let seed = (i * 1000 + j * 7 + 13) as f64;
4346                let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4347                if i == j { val * 10.0 } else { val }
4348            });
4349
4350            let result = aptp_factor(a.as_ref(), &opts).unwrap();
4351
4352            // Statistics invariant
4353            assert_eq!(
4354                result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed,
4355                n,
4356                "factor_inner {}x{}: statistics invariant",
4357                n,
4358                n
4359            );
4360
4361            if result.stats.num_delayed == 0 {
4362                let error = dense_reconstruction_error(
4363                    &a,
4364                    &result.l,
4365                    &result.d,
4366                    result.perm.as_ref().arrays().0,
4367                );
4368                assert!(
4369                    error < 1e-12,
4370                    "factor_inner {}x{}: reconstruction error {:.2e} >= 1e-12",
4371                    n,
4372                    n,
4373                    error
4374                );
4375            }
4376        }
4377    }
4378
4379    #[test]
4380    fn test_factor_inner_partial_factorization() {
4381        // factor_inner with num_fully_summed < m (contribution block present)
4382        let n = 64;
4383        let p = 48; // Only factor 48 of 64 columns
4384        let a = symmetric_matrix(n, |i, j| {
4385            let seed = (i * 1000 + j * 7 + 13) as f64;
4386            let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4387            if i == j { val * 10.0 } else { val }
4388        });
4389
4390        let opts = AptpOptions::default();
4391        let mut a_copy = a.to_owned();
4392        let result = aptp_factor_in_place(a_copy.as_mut(), p, &opts).unwrap();
4393
4394        // Should have eliminated <= p columns
4395        assert!(
4396            result.num_eliminated <= p,
4397            "eliminated {} > p={}",
4398            result.num_eliminated,
4399            p
4400        );
4401
4402        // Statistics should account for all p columns
4403        assert_eq!(
4404            result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed,
4405            p,
4406            "statistics invariant for partial factorization"
4407        );
4408    }
4409
4410    // ---- Two-level integration tests ----
4411
4412    #[test]
4413    fn test_two_level_dispatch_small_block_size() {
4414        // Test the two-level dispatch by setting outer_block_size small
4415        // so matrices of moderate size trigger two_level_factor.
4416        let sizes = [33, 64, 100];
4417        let opts = AptpOptions {
4418            outer_block_size: 32,
4419            inner_block_size: 8,
4420            ..AptpOptions::default()
4421        };
4422
4423        for &n in &sizes {
4424            let a = symmetric_matrix(n, |i, j| {
4425                let seed = (i * 1000 + j * 7 + 13) as f64;
4426                let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4427                if i == j { val * 10.0 } else { val }
4428            });
4429
4430            let result = aptp_factor(a.as_ref(), &opts).unwrap();
4431
4432            assert_eq!(
4433                result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed,
4434                n,
4435                "two-level {}x{}: statistics invariant",
4436                n,
4437                n
4438            );
4439
4440            if result.stats.num_delayed == 0 {
4441                let error = dense_reconstruction_error(
4442                    &a,
4443                    &result.l,
4444                    &result.d,
4445                    result.perm.as_ref().arrays().0,
4446                );
4447                assert!(
4448                    error < 1e-12,
4449                    "two-level {}x{}: reconstruction error {:.2e} >= 1e-12",
4450                    n,
4451                    n,
4452                    error
4453                );
4454            }
4455        }
4456    }
4457
4458    #[test]
4459    fn test_two_level_single_outer_block() {
4460        // frontal dimension == nb → single outer block, equivalent to factor_inner
4461        let n = 32;
4462        let opts = AptpOptions {
4463            outer_block_size: 32,
4464            inner_block_size: 8,
4465            ..AptpOptions::default()
4466        };
4467
4468        let a = symmetric_matrix(n, |i, j| {
4469            let seed = (i * 1000 + j * 7 + 13) as f64;
4470            let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4471            if i == j { val * 10.0 } else { val }
4472        });
4473
4474        let result = aptp_factor(a.as_ref(), &opts).unwrap();
4475        if result.stats.num_delayed == 0 {
4476            let error = dense_reconstruction_error(
4477                &a,
4478                &result.l,
4479                &result.d,
4480                result.perm.as_ref().arrays().0,
4481            );
4482            assert!(error < 1e-12, "single block: error {:.2e}", error);
4483        }
4484    }
4485
4486    #[test]
4487    fn test_two_level_boundary_nb_plus_1() {
4488        // frontal dimension == nb+1 → two blocks, second block has 1 column
4489        let n = 33;
4490        let opts = AptpOptions {
4491            outer_block_size: 32,
4492            inner_block_size: 8,
4493            ..AptpOptions::default()
4494        };
4495
4496        let a = symmetric_matrix(n, |i, j| {
4497            let seed = (i * 1000 + j * 7 + 13) as f64;
4498            let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4499            if i == j { val * 10.0 } else { val }
4500        });
4501
4502        let result = aptp_factor(a.as_ref(), &opts).unwrap();
4503        assert_eq!(
4504            result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed,
4505            n,
4506        );
4507        if result.stats.num_delayed == 0 {
4508            let error = dense_reconstruction_error(
4509                &a,
4510                &result.l,
4511                &result.d,
4512                result.perm.as_ref().arrays().0,
4513            );
4514            assert!(error < 1e-12, "nb+1 boundary: error {:.2e}", error);
4515        }
4516    }
4517
4518    #[test]
4519    fn test_two_level_partial_factorization() {
4520        // partial factorization (num_fully_summed < m) with dimension > nb
4521        // This triggers two_level_factor with a contribution block.
4522        let n = 80;
4523        let p = 50; // > outer_block_size=32, triggers two-level
4524        let opts = AptpOptions {
4525            outer_block_size: 32,
4526            inner_block_size: 8,
4527            ..AptpOptions::default()
4528        };
4529
4530        let a = symmetric_matrix(n, |i, j| {
4531            let seed = (i * 1000 + j * 7 + 13) as f64;
4532            let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4533            if i == j { val * 10.0 } else { val }
4534        });
4535
4536        let mut a_copy = a.to_owned();
4537        let result = aptp_factor_in_place(a_copy.as_mut(), p, &opts).unwrap();
4538
4539        assert!(result.num_eliminated <= p);
4540        assert_eq!(
4541            result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed,
4542            p,
4543        );
4544    }
4545
4546    #[test]
4547    fn test_two_level_vs_single_level_equivalence() {
4548        // Verify that two-level and single-level produce equivalent reconstruction
4549        // error on the same matrix (by setting outer_block_size to force each path).
4550        let n = 64;
4551        let a = symmetric_matrix(n, |i, j| {
4552            let seed = (i * 1000 + j * 7 + 13) as f64;
4553            let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4554            if i == j { val * 10.0 } else { val }
4555        });
4556
4557        // Single-level: outer_block_size >= n
4558        let opts_single = AptpOptions {
4559            outer_block_size: 256,
4560            inner_block_size: 32,
4561            ..AptpOptions::default()
4562        };
4563        let result_single = aptp_factor(a.as_ref(), &opts_single).unwrap();
4564
4565        // Two-level: outer_block_size < n
4566        let opts_two = AptpOptions {
4567            outer_block_size: 16,
4568            inner_block_size: 8,
4569            ..AptpOptions::default()
4570        };
4571        let result_two = aptp_factor(a.as_ref(), &opts_two).unwrap();
4572
4573        // Both should achieve reconstruction < 1e-12
4574        if result_single.stats.num_delayed == 0 {
4575            let err_s = dense_reconstruction_error(
4576                &a,
4577                &result_single.l,
4578                &result_single.d,
4579                result_single.perm.as_ref().arrays().0,
4580            );
4581            assert!(err_s < 1e-12, "single-level error {:.2e}", err_s);
4582        }
4583        if result_two.stats.num_delayed == 0 {
4584            let err_t = dense_reconstruction_error(
4585                &a,
4586                &result_two.l,
4587                &result_two.d,
4588                result_two.perm.as_ref().arrays().0,
4589            );
4590            assert!(err_t < 1e-12, "two-level error {:.2e}", err_t);
4591        }
4592    }
4593
4594    #[test]
4595    fn test_two_level_vs_unblocked_reconstruction() {
4596        // Regression test: col_order tracking corruption in two_level_factor when
4597        // block_perm was applied after delay swap, causing incorrect column mapping.
4598        //
4599        // When delay swaps happened BEFORE block_perm was applied to col_order,
4600        // the permutation mapping was corrupted, causing massive reconstruction
4601        // errors with small outer_block_size. This test verifies that both
4602        // two-level (ob=128, forcing 4 outer iterations) and unblocked (ob=huge)
4603        // paths produce equivalent, accurate reconstruction.
4604        let n = 512;
4605
4606        // Build a symmetric indefinite matrix with small diagonal entries to force
4607        // 2x2 pivots and some delays. Seed = golden ratio hash for reproducibility.
4608        let a = symmetric_matrix(n, |i, j| {
4609            let seed = (i * 997 + j * 1013 + 42) as f64;
4610            let val = (seed * 0.618033988749).fract() * 2.0 - 1.0;
4611            if i == j {
4612                // Small diagonal → more 2x2 pivots and delays
4613                val * 0.5
4614            } else {
4615                val
4616            }
4617        });
4618
4619        // Unblocked: outer_block_size >= n → single factor_inner call
4620        let opts_unblocked = AptpOptions {
4621            outer_block_size: 100_000,
4622            inner_block_size: 32,
4623            ..AptpOptions::default()
4624        };
4625        let result_unblocked = aptp_factor(a.as_ref(), &opts_unblocked).unwrap();
4626        let err_unblocked = dense_reconstruction_error(
4627            &a,
4628            &result_unblocked.l,
4629            &result_unblocked.d,
4630            result_unblocked.perm.as_ref().arrays().0,
4631        );
4632        assert!(
4633            err_unblocked < 1e-12,
4634            "unblocked reconstruction error {:.2e} exceeds 1e-12",
4635            err_unblocked,
4636        );
4637
4638        // Two-level: outer_block_size=128 → ~4 outer iterations
4639        let opts_two_level = AptpOptions {
4640            outer_block_size: 128,
4641            inner_block_size: 32,
4642            ..AptpOptions::default()
4643        };
4644        let result_two_level = aptp_factor(a.as_ref(), &opts_two_level).unwrap();
4645        let err_two_level = dense_reconstruction_error(
4646            &a,
4647            &result_two_level.l,
4648            &result_two_level.d,
4649            result_two_level.perm.as_ref().arrays().0,
4650        );
4651        assert!(
4652            err_two_level < 1e-12,
4653            "two-level reconstruction error {:.2e} exceeds 1e-12 \
4654             (unblocked was {:.2e})",
4655            err_two_level,
4656            err_unblocked,
4657        );
4658
4659        // Two-level error should be within 10x of unblocked
4660        let ratio = if err_unblocked > 0.0 {
4661            err_two_level / err_unblocked
4662        } else {
4663            1.0
4664        };
4665        assert!(
4666            ratio < 10.0,
4667            "two-level error ({:.2e}) is {:.1}x worse than unblocked ({:.2e})",
4668            err_two_level,
4669            ratio,
4670            err_unblocked,
4671        );
4672    }
4673
4674    // ---- BLAS-3 pipeline tests ----
4675
4676    #[test]
4677    fn test_factor_block_diagonal_basic() {
4678        // Factor 8×8 identity with block_size=4.
4679        // D=[1,1,1,1], identity permutation, no L entries within block.
4680        let mut a = Mat::identity(8, 8);
4681        let (d, perm, nelim, stats, _log) = factor_block_diagonal(a.as_mut(), 0, 4, 1e-20, 4);
4682
4683        assert_eq!(nelim, 4);
4684        assert_eq!(stats.num_1x1, 4);
4685        assert_eq!(stats.num_2x2, 0);
4686        assert_eq!(stats.num_delayed, 0);
4687        assert!(stats.max_l_entry < 1e-14);
4688
4689        // D should be [1, 1, 1, 1]
4690        for i in 0..4 {
4691            assert!((d.diagonal_1x1(i) - 1.0).abs() < 1e-14);
4692        }
4693
4694        // Identity permutation
4695        assert_eq!(perm, vec![0, 1, 2, 3]);
4696
4697        // Panel rows (4-7) should be untouched (still identity entries)
4698        for i in 4..8 {
4699            for j in 0..4 {
4700                assert!(
4701                    a[(i, j)].abs() < 1e-14,
4702                    "panel entry ({},{}) should be 0, got {}",
4703                    i,
4704                    j,
4705                    a[(i, j)]
4706                );
4707            }
4708        }
4709    }
4710
4711    #[test]
4712    fn test_factor_block_diagonal_block_scoped_swap() {
4713        // 8×8 matrix where max entry in block forces a swap.
4714        // With block-scoped swaps (row_limit=4), panel rows (4-7) should NOT be
4715        // affected by factor_block_diagonal. Panel permutation is handled separately.
4716        let mut a = symmetric_matrix(8, |i, j| {
4717            if i == j {
4718                [1.0, 5.0, 2.0, 3.0, 0.1, 0.1, 0.1, 0.1][i]
4719            } else if (i == 4 && j == 1) || (i == 1 && j == 4) {
4720                // Panel entry at (4,1)
4721                0.99
4722            } else {
4723                0.0
4724            }
4725        });
4726
4727        // Save panel row 4's original column values before factor
4728        let panel_row_before: Vec<f64> = (0..4).map(|j| a[(4, j)]).collect();
4729
4730        let (_d, perm, nelim, _stats, _log) = factor_block_diagonal(a.as_mut(), 0, 4, 1e-20, 4);
4731
4732        assert!(nelim > 0, "should eliminate at least 1 column");
4733
4734        // The max diagonal is 5.0 at column 1, so it should be swapped to position 0.
4735        assert_eq!(
4736            perm[0], 1,
4737            "first pivot should be original column 1 (max diag = 5.0)"
4738        );
4739
4740        // Panel rows should be UNCHANGED (block-scoped swap with row_limit=4)
4741        for j in 0..4 {
4742            assert!(
4743                (a[(4, j)] - panel_row_before[j]).abs() < 1e-14,
4744                "panel row (4,{}) should be unchanged: got {}, expected {}",
4745                j,
4746                a[(4, j)],
4747                panel_row_before[j]
4748            );
4749        }
4750    }
4751
4752    #[test]
4753    fn test_blas3_pipeline_reconstruction() {
4754        // Full Factor→Apply→Update on an 8×8 matrix via factor_inner.
4755        // Verify reconstruction ||PAP^T - LDL^T|| < 1e-12.
4756        let a = symmetric_matrix(8, |i, j| {
4757            if i == j {
4758                10.0 + i as f64
4759            } else {
4760                1.0 / (1.0 + (i as f64 - j as f64).abs())
4761            }
4762        });
4763
4764        let opts = AptpOptions {
4765            inner_block_size: 4, // Force 2 blocks for 8×8
4766            ..AptpOptions::default()
4767        };
4768        let result = aptp_factor(a.as_ref(), &opts).unwrap();
4769
4770        assert_eq!(result.stats.num_delayed, 0, "should have no delays");
4771
4772        let error =
4773            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
4774        assert!(
4775            error < 1e-12,
4776            "BLAS-3 pipeline reconstruction error {:.2e} >= 1e-12",
4777            error
4778        );
4779    }
4780
4781    #[test]
4782    fn test_blas3_threshold_failure_and_retry() {
4783        // Construct matrix where panel threshold is likely to fail for at least
4784        // one column, forcing backup/restore/delay/re-factor.
4785        // Use very strict threshold to trigger failures.
4786        let a = symmetric_matrix(8, |i, j| {
4787            let vals = [
4788                [1e-3, 1.0, 0.5, 0.5, 0.1, 0.1, 0.1, 0.1],
4789                [1.0, 1e-3, 0.5, 0.5, 0.1, 0.1, 0.1, 0.1],
4790                [0.5, 0.5, 10.0, 1.0, 0.1, 0.1, 0.1, 0.1],
4791                [0.5, 0.5, 1.0, 10.0, 0.1, 0.1, 0.1, 0.1],
4792                [0.1, 0.1, 0.1, 0.1, 5.0, 0.5, 0.0, 0.0],
4793                [0.1, 0.1, 0.1, 0.1, 0.5, 6.0, 0.0, 0.0],
4794                [0.1, 0.1, 0.1, 0.1, 0.0, 0.0, 7.0, 0.5],
4795                [0.1, 0.1, 0.1, 0.1, 0.0, 0.0, 0.5, 8.0],
4796            ];
4797            vals[i][j]
4798        });
4799
4800        let opts = AptpOptions {
4801            inner_block_size: 4,
4802            threshold: 0.01, // Standard threshold
4803            ..AptpOptions::default()
4804        };
4805        let result = aptp_factor(a.as_ref(), &opts).unwrap();
4806
4807        // Statistics invariant
4808        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
4809        assert_eq!(sum, 8, "statistics sum {} != 8", sum);
4810
4811        // Reconstruction (if fully eliminated)
4812        if result.stats.num_delayed == 0 {
4813            let error = dense_reconstruction_error(
4814                &a,
4815                &result.l,
4816                &result.d,
4817                result.perm.as_ref().arrays().0,
4818            );
4819            assert!(
4820                error < 1e-12,
4821                "threshold failure retry: reconstruction error {:.2e}",
4822                error
4823            );
4824        }
4825    }
4826
4827    #[test]
4828    fn test_adjust_for_2x2_boundary() {
4829        // Test that adjust_for_2x2_boundary decrements when last accepted pivot
4830        // is the first half of a 2×2 pair.
4831        let mut d = MixedDiagonal::new(4);
4832        d.set_1x1(0, 1.0);
4833        d.set_2x2(Block2x2 {
4834            first_col: 1,
4835            a: 1.0,
4836            b: 0.5,
4837            c: 2.0,
4838        });
4839        d.set_1x1(3, 3.0);
4840
4841        // If effective_nelim = 2, last is position 1 which is first of 2×2 pair (1,2).
4842        // Partner = 2 > 1, so should decrement to 1.
4843        assert_eq!(adjust_for_2x2_boundary(2, &d), 1);
4844
4845        // If effective_nelim = 3, last is position 2 which is second of 2×2 pair.
4846        // Partner = 1 < 2, so no adjustment.
4847        assert_eq!(adjust_for_2x2_boundary(3, &d), 3);
4848
4849        // If effective_nelim = 1, last is position 0 which is 1×1. No adjustment.
4850        assert_eq!(adjust_for_2x2_boundary(1, &d), 1);
4851
4852        // If effective_nelim = 4, last is position 3 which is 1×1. No adjustment.
4853        assert_eq!(adjust_for_2x2_boundary(4, &d), 4);
4854
4855        // Edge case: effective_nelim = 0
4856        assert_eq!(adjust_for_2x2_boundary(0, &d), 0);
4857    }
4858
4859    #[test]
4860    fn test_blas3_full_block_singular() {
4861        // Block where all entries < small. Verify all columns delayed.
4862        let mut a = Mat::zeros(8, 8);
4863        for i in 0..8 {
4864            a[(i, i)] = 1e-25;
4865        }
4866
4867        let opts = AptpOptions {
4868            inner_block_size: 4,
4869            failed_pivot_method: FailedPivotMethod::Pass,
4870            ..AptpOptions::default()
4871        };
4872        let mut a_copy = a.clone();
4873        let result = aptp_factor_in_place(a_copy.as_mut(), 8, &opts).unwrap();
4874
4875        assert_eq!(result.num_eliminated, 0);
4876        assert_eq!(result.stats.num_delayed, 8);
4877        assert_eq!(result.delayed_cols.len(), 8);
4878    }
4879
4880    // ---- Regression test: factor_inner with threshold failures (delays) ----
4881
4882    /// Generate a deterministic pseudo-random symmetric indefinite matrix.
4883    ///
4884    /// Uses a simple LCG-based hash for reproducibility without external deps.
4885    /// Entries are in [-1, 1] with diagonal scaled by `diag_scale`.
4886    fn deterministic_indefinite_matrix(n: usize, seed: u64, diag_scale: f64) -> Mat<f64> {
4887        // Pure function of (i,j) — compatible with symmetric_matrix's Fn requirement.
4888        let hash = |a: usize, b: usize| -> f64 {
4889            let mut s = seed
4890                .wrapping_add((a * 10007) as u64)
4891                .wrapping_add((b * 7) as u64);
4892            s = s
4893                .wrapping_mul(6364136223846793005)
4894                .wrapping_add(1442695040888963407);
4895            s = s
4896                .wrapping_mul(6364136223846793005)
4897                .wrapping_add(1442695040888963407);
4898            ((s >> 33) as f64) / (u32::MAX as f64 / 2.0) - 1.0
4899        };
4900
4901        symmetric_matrix(n, |i, j| {
4902            if i == j {
4903                hash(i, i) * diag_scale
4904            } else {
4905                // Use min/max so (i,j) and (j,i) produce the same value
4906                hash(i.min(j), i.max(j) + n)
4907            }
4908        })
4909    }
4910
4911    /// Apply the "cause_delays" pattern from SPRAL testing: multiply n/8 random
4912    /// rows (and corresponding columns, to maintain symmetry) by a large factor.
4913    /// This creates large off-diagonal entries that cause L entries to exceed 1/threshold.
4914    fn cause_delays(a: &mut Mat<f64>, seed: u64, scale: f64) {
4915        let n = a.nrows();
4916        let n_scaled = (n / 8).max(1);
4917
4918        // Deterministically select which rows to scale
4919        let mut state = seed;
4920        let mut next_idx = || -> usize {
4921            state = state
4922                .wrapping_mul(6364136223846793005)
4923                .wrapping_add(1442695040888963407);
4924            ((state >> 33) as usize) % n
4925        };
4926
4927        let mut scaled_rows = Vec::new();
4928        while scaled_rows.len() < n_scaled {
4929            let idx = next_idx();
4930            if !scaled_rows.contains(&idx) {
4931                scaled_rows.push(idx);
4932            }
4933        }
4934
4935        // Scale rows and columns symmetrically: A -> S * A * S
4936        // where S = diag(s_1, ..., s_n) with s_i = scale if i in scaled_rows, else 1
4937        for &r in &scaled_rows {
4938            for j in 0..n {
4939                a[(r, j)] *= scale;
4940                a[(j, r)] *= scale;
4941            }
4942            // Diagonal gets scaled twice (row and column), which is correct for S*A*S
4943        }
4944    }
4945
4946    #[test]
4947    fn test_factor_inner_with_delays() {
4948        // Regression test: factor_inner with threshold failures.
4949        //
4950        // Constructs matrices using the "cause_delays" pattern (SPRAL testing):
4951        // take a random symmetric indefinite matrix, then multiply n/8 random
4952        // rows and corresponding columns by 1000. This creates large off-diagonal
4953        // entries that cause L entries to exceed 1/threshold, triggering the
4954        // backup/restore/delay path in factor_inner.
4955        //
4956        // Tests multiple (n, ib) combinations to exercise different code paths
4957        // in the BLAS-3 Factor/Apply/Update loop.
4958
4959        struct TestConfig {
4960            n: usize,
4961            ib: usize,
4962            seed: u64,
4963            scale: f64,
4964        }
4965
4966        let configs = [
4967            // Small: 8x8 with ib=2 (4 blocks)
4968            TestConfig {
4969                n: 8,
4970                ib: 2,
4971                seed: 42,
4972                scale: 1000.0,
4973            },
4974            // Small: 8x8 with ib=4 (2 blocks)
4975            TestConfig {
4976                n: 8,
4977                ib: 4,
4978                seed: 42,
4979                scale: 1000.0,
4980            },
4981            // Medium: 16x16 with ib=4 (4 blocks)
4982            TestConfig {
4983                n: 16,
4984                ib: 4,
4985                seed: 42,
4986                scale: 1000.0,
4987            },
4988            // Medium: 16x16 with ib=2 (8 blocks)
4989            TestConfig {
4990                n: 16,
4991                ib: 2,
4992                seed: 42,
4993                scale: 1000.0,
4994            },
4995            // Larger: 32x32 with ib=4
4996            TestConfig {
4997                n: 32,
4998                ib: 4,
4999                seed: 42,
5000                scale: 1000.0,
5001            },
5002            // Larger: 32x32 with ib=8
5003            TestConfig {
5004                n: 32,
5005                ib: 8,
5006                seed: 42,
5007                scale: 1000.0,
5008            },
5009            // Different seed
5010            TestConfig {
5011                n: 16,
5012                ib: 4,
5013                seed: 137,
5014                scale: 1000.0,
5015            },
5016            // Extreme scale
5017            TestConfig {
5018                n: 16,
5019                ib: 4,
5020                seed: 42,
5021                scale: 1e6,
5022            },
5023            // 64x64 with ib=8
5024            TestConfig {
5025                n: 64,
5026                ib: 8,
5027                seed: 42,
5028                scale: 1000.0,
5029            },
5030            // 64x64 with ib=16
5031            TestConfig {
5032                n: 64,
5033                ib: 16,
5034                seed: 42,
5035                scale: 1000.0,
5036            },
5037        ];
5038
5039        let mut _any_delays_found = false;
5040        let mut any_failures = false;
5041
5042        for (idx, config) in configs.iter().enumerate() {
5043            let mut a = deterministic_indefinite_matrix(config.n, config.seed, 5.0);
5044            cause_delays(&mut a, config.seed + 1000, config.scale);
5045
5046            let opts = AptpOptions {
5047                inner_block_size: config.ib,
5048                outer_block_size: config.n.max(config.ib),
5049                threshold: 0.01,
5050                ..AptpOptions::default()
5051            };
5052
5053            let result = aptp_factor(a.as_ref(), &opts).unwrap();
5054
5055            let n = config.n;
5056            let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
5057            assert_eq!(
5058                sum, n,
5059                "config {}: statistics invariant broken: {} != {}",
5060                idx, sum, n
5061            );
5062
5063            if result.stats.num_delayed > 0 {
5064                _any_delays_found = true;
5065            }
5066
5067            // If fully eliminated, check reconstruction error
5068            if result.stats.num_delayed == 0 {
5069                let error = dense_reconstruction_error(
5070                    &a,
5071                    &result.l,
5072                    &result.d,
5073                    result.perm.as_ref().arrays().0,
5074                );
5075                if error >= 1e-12 {
5076                    any_failures = true;
5077                }
5078                assert!(
5079                    error < 1e-10,
5080                    "config {} (n={}, ib={}): reconstruction error {:.2e} >= 1e-10 \
5081                     (no delays but bad reconstruction indicates factor_inner bug)",
5082                    idx,
5083                    config.n,
5084                    config.ib,
5085                    error
5086                );
5087            }
5088        }
5089
5090        // At least one config should have triggered delays (confirming the
5091        // cause_delays pattern works with the threshold).
5092        // Note: we don't assert any_delays_found because with complete pivoting
5093        // the diagonal block never delays — only apply_and_check can reduce
5094        // effective_nelim. Whether this triggers depends on the matrix structure.
5095
5096        assert!(
5097            !any_failures,
5098            "Some configs had reconstruction error >= 1e-12 without delays. \
5099             This indicates a bug in factor_inner's backup/restore/update logic."
5100        );
5101    }
5102
5103    #[test]
5104    fn test_factor_inner_with_delays_targeted() {
5105        // More targeted test: construct a matrix specifically designed to trigger
5106        // threshold failure in factor_inner's apply_and_check step.
5107        //
5108        // Strategy: Create a matrix where complete pivoting on the ib×ib diagonal
5109        // block succeeds (all ib columns eliminated within the block), but the
5110        // panel entries below the block exceed 1/threshold after TRSM + D scaling.
5111        //
5112        // Matrix structure (8x8, ib=4):
5113        //   Block [0:4, 0:4]: moderate diagonal, small off-diagonal -- complete pivoting
5114        //     eliminates all 4 columns within the block with |L_block| <= 4
5115        //   Panel [4:8, 0:4]: large entries that exceed 1/threshold after TRSM
5116        //     Specifically, panel entries ~ 2.0 with pivots ~ 0.005, so L ~ 400 >> 100
5117        //   Block [4:8, 4:8]: moderate diagonal
5118        //
5119        // Complete pivoting searches only within the ib×ib diagonal block, so panel
5120        // entries don't affect pivot selection.
5121        //
5122        // Also tested with ib=2 to get partial threshold failure (first 2 columns
5123        // may pass if their diagonal is large enough).
5124        let n = 8;
5125
5126        for &ib in &[4, 2] {
5127            let a = symmetric_matrix(n, |i, j| {
5128                match (i, j) {
5129                    // Diagonal block [0:4, 0:4]
5130                    // Complete pivoting picks largest first: 10.0, 10.0, then 0.005, 0.005
5131                    // For ib=4: all 4 eliminated in block, but panel L ~ 2.0/0.005 = 400
5132                    // For ib=2: block [0:2,0:2] → pivots 10.0,10.0 → panel L ~ 2.0/10 = 0.2 (OK)
5133                    //           block [2:4,2:4] → pivots 0.005,0.005 → panel L ~ 2.0/0.005 = 400 (FAIL)
5134                    (0, 0) => 10.0,
5135                    (1, 1) => 10.0,
5136                    (2, 2) => 0.005,
5137                    (3, 3) => 0.005,
5138                    (i, j) if i < 4 && j < 4 && i != j => 0.001,
5139
5140                    // Panel [4:8, 0:4]
5141                    (_, j) if j < 4 => 2.0,
5142
5143                    // Lower-right block [4:8, 4:8]
5144                    (i, _) if i == j => 20.0,
5145                    (_, _) => 0.5,
5146                }
5147            });
5148
5149            // Test APTP delay behavior (TPP disabled so we can observe delays)
5150            let opts_pass = AptpOptions {
5151                inner_block_size: ib,
5152                outer_block_size: 256,
5153                threshold: 0.01,
5154                failed_pivot_method: FailedPivotMethod::Pass,
5155                ..AptpOptions::default()
5156            };
5157
5158            let result = aptp_factor(a.as_ref(), &opts_pass).unwrap();
5159
5160            let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
5161            assert_eq!(sum, n, "ib={}: statistics invariant: {} != {}", ib, sum, n);
5162
5163            // Check reconstruction if fully eliminated
5164            if result.stats.num_delayed == 0 {
5165                let error = dense_reconstruction_error(
5166                    &a,
5167                    &result.l,
5168                    &result.d,
5169                    result.perm.as_ref().arrays().0,
5170                );
5171                assert!(
5172                    error < 1e-12,
5173                    "targeted (ib={}): reconstruction error {:.2e} >= 1e-12 (no delays)",
5174                    ib,
5175                    error
5176                );
5177            }
5178
5179            // Verify that delays happened (at least for ib=4, columns 2,3 should be
5180            // delayed because L panel entries exceed 1/0.01=100)
5181            if ib == 4 || ib == 2 {
5182                // With small pivots 0.005 and panel entries 2.0:
5183                // L = 2.0 / 0.005 = 400, which far exceeds 100
5184                // Expect delays for the small-pivot columns
5185                assert!(
5186                    result.stats.num_delayed > 0,
5187                    "ib={}: expected some delays for small-pivot columns",
5188                    ib
5189                );
5190            }
5191
5192            // Also check partial factorization with contribution block
5193            {
5194                let p = 6; // Only 6 of 8 fully summed
5195                let mut a_copy = a.clone();
5196                let opts_partial = AptpOptions {
5197                    inner_block_size: ib,
5198                    outer_block_size: 256,
5199                    threshold: 0.01,
5200                    failed_pivot_method: FailedPivotMethod::Pass,
5201                    ..AptpOptions::default()
5202                };
5203                let result_p = aptp_factor_in_place(a_copy.as_mut(), p, &opts_partial).unwrap();
5204                let error_p = check_partial_factorization_in_place(&a, &a_copy, p, &result_p);
5205                assert!(
5206                    error_p < 1e-10,
5207                    "targeted partial (ib={}, p={}): error {:.2e} >= 1e-10",
5208                    ib,
5209                    p,
5210                    error_p
5211                );
5212            }
5213        }
5214    }
5215
5216    #[test]
5217    fn test_factor_inner_with_delays_aggressive() {
5218        // Aggressive test using a Lehmer-like matrix structure with perturbations
5219        // designed to trigger many threshold failures.
5220        //
5221        // The matrix is constructed as: A = Q * diag(d_i) * Q^T where d_i are
5222        // alternating +/- with varying magnitudes. Then specific rows are scaled
5223        // to create problematic panel entries.
5224        //
5225        // We test with multiple (n, ib) combos and multiple seeds.
5226
5227        let test_cases: Vec<(usize, usize, u64)> = vec![
5228            (12, 4, 1),
5229            (12, 4, 2),
5230            (12, 4, 3),
5231            (16, 4, 1),
5232            (16, 4, 2),
5233            (16, 8, 1),
5234            (20, 4, 1),
5235            (24, 4, 1),
5236            (24, 8, 1),
5237            (32, 4, 1),
5238            (32, 8, 1),
5239            (32, 16, 1),
5240            (48, 8, 1),
5241            (64, 8, 1),
5242            (64, 16, 1),
5243        ];
5244
5245        let mut _total_delays = 0;
5246        let mut max_error = 0.0_f64;
5247        let mut _worst_config = String::new();
5248
5249        for &(n, ib, seed) in &test_cases {
5250            // Build a random symmetric indefinite matrix
5251            let mut a = deterministic_indefinite_matrix(n, seed * 31337, 5.0);
5252
5253            // Apply cause_delays to create threshold failures
5254            cause_delays(&mut a, seed * 31337 + 7919, 1000.0);
5255
5256            let opts = AptpOptions {
5257                inner_block_size: ib,
5258                outer_block_size: n.max(ib),
5259                threshold: 0.01,
5260                ..AptpOptions::default()
5261            };
5262
5263            let result = aptp_factor(a.as_ref(), &opts).unwrap();
5264
5265            let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
5266            assert_eq!(
5267                sum, n,
5268                "(n={}, ib={}, seed={}): stats invariant {} != {}",
5269                n, ib, seed, sum, n
5270            );
5271
5272            _total_delays += result.stats.num_delayed;
5273
5274            if result.stats.num_delayed == 0 {
5275                let error = dense_reconstruction_error(
5276                    &a,
5277                    &result.l,
5278                    &result.d,
5279                    result.perm.as_ref().arrays().0,
5280                );
5281                if error > max_error {
5282                    max_error = error;
5283                    _worst_config =
5284                        format!("n={}, ib={}, seed={}: error={:.2e}", n, ib, seed, error);
5285                }
5286                assert!(
5287                    error < 1e-10,
5288                    "(n={}, ib={}, seed={}): reconstruction error {:.2e} >= 1e-10",
5289                    n,
5290                    ib,
5291                    seed,
5292                    error
5293                );
5294            }
5295        }
5296    }
5297
5298    #[test]
5299    fn test_factor_inner_cause_delays_then_compare_single_vs_blocked() {
5300        // Compare factor_inner (ib > 1) vs ib=n (effectively single-block).
5301        // Both should produce reconstruction error < 1e-12 if no delays,
5302        // and the same number of eliminations.
5303        //
5304        // This catches bugs where blocking introduces errors that single-block
5305        // factorization does not.
5306
5307        let seeds = [42u64, 137, 271, 314, 997];
5308        let n = 16;
5309
5310        for &seed in &seeds {
5311            let mut a = deterministic_indefinite_matrix(n, seed, 5.0);
5312            cause_delays(&mut a, seed + 5000, 500.0);
5313
5314            // Single-block: ib = n (factor_block_diagonal processes entire matrix)
5315            let opts_single = AptpOptions {
5316                inner_block_size: n,
5317                outer_block_size: n,
5318                threshold: 0.01,
5319                ..AptpOptions::default()
5320            };
5321            let result_single = aptp_factor(a.as_ref(), &opts_single).unwrap();
5322
5323            // Blocked: ib = 4
5324            let opts_blocked = AptpOptions {
5325                inner_block_size: 4,
5326                outer_block_size: n,
5327                threshold: 0.01,
5328                ..AptpOptions::default()
5329            };
5330            let result_blocked = aptp_factor(a.as_ref(), &opts_blocked).unwrap();
5331
5332            // Both should achieve good reconstruction when fully eliminated
5333            if result_single.stats.num_delayed == 0 {
5334                let error_s = dense_reconstruction_error(
5335                    &a,
5336                    &result_single.l,
5337                    &result_single.d,
5338                    result_single.perm.as_ref().arrays().0,
5339                );
5340                assert!(
5341                    error_s < 1e-12,
5342                    "seed={}: single-block error {:.2e}",
5343                    seed,
5344                    error_s
5345                );
5346            }
5347
5348            if result_blocked.stats.num_delayed == 0 {
5349                let error_b = dense_reconstruction_error(
5350                    &a,
5351                    &result_blocked.l,
5352                    &result_blocked.d,
5353                    result_blocked.perm.as_ref().arrays().0,
5354                );
5355                assert!(
5356                    error_b < 1e-12,
5357                    "seed={}: blocked error {:.2e} >= 1e-12",
5358                    seed,
5359                    error_b
5360                );
5361            }
5362        }
5363    }
5364
    /// Check partial factorization: P^T A P = L D L^T + [0; 0; contribution]
    ///
    /// After factor_inner with num_fully_summed = p < m, the matrix contains:
    /// - L entries in a[0..q, 0..q] (lower triangle, unit diagonal implicit)
    /// - L panel in a[q..m, 0..q]
    /// - Schur complement (contribution) in a[q..m, q..m]
    ///
    /// where q = num_eliminated <= p.
    ///
    /// Correctness condition: for the permuted matrix PAP^T,
    ///   PAP^T[0..m, 0..m] = L[0..m, 0..q] * D[0..q, 0..q] * L[0..m, 0..q]^T + [0_{q,q} 0; 0 S]
    /// where S = a[q..m, q..m] after factorization (the Schur complement).
    ///
    /// Returns the residual as a relative Frobenius-norm ratio (absolute
    /// when the reference norm is zero). Only the lower triangle is
    /// compared, matching the storage convention of the frontal matrix.
    fn check_partial_factorization_in_place(
        original: &Mat<f64>,
        factored: &Mat<f64>,
        num_fully_summed: usize,
        result: &AptpFactorResult,
    ) -> f64 {
        let m = original.nrows();
        let q = result.num_eliminated;
        let p = num_fully_summed;
        let perm = &result.perm;

        // Apply deferred contribution GEMM to get the full Schur complement.
        // After our restructuring, aptp_factor_in_place leaves A[p..m, p..m]
        // with only assembled values (no per-block trailing updates).
        // We need to apply the deferred GEMM to get the actual Schur complement.
        let mut factored = factored.clone();
        let nfs = m - p;
        if nfs > 0 && q > 0 {
            let mut contrib_buffer = Mat::zeros(nfs, nfs);
            let mut ld_ws = Mat::new();
            compute_contribution_gemm(
                &factored,
                p,
                q,
                m,
                &result.d,
                &mut contrib_buffer,
                &mut ld_ws,
                Par::Seq,
            );
            // Copy the Schur complement back into the factored matrix
            // (lower triangle only, matching the storage convention).
            for i in 0..nfs {
                for j in 0..=i {
                    factored[(p + i, p + j)] = contrib_buffer[(i, j)];
                }
            }
        }
        let d = &result.d;

        // Build PAP^T by gathering the original entries through the
        // factorization's permutation.
        let mut papt = Mat::zeros(m, m);
        for i in 0..m {
            for j in 0..m {
                papt[(i, j)] = original[(perm[i], perm[j])];
            }
        }

        // Extract L (m x q, unit lower triangular in first q columns)
        let mut l_full = Mat::zeros(m, q);
        for i in 0..q {
            l_full[(i, i)] = 1.0;
        }
        // Walk columns pivot-by-pivot: 2x2 pivots span two columns and the
        // sub-diagonal entry inside the pair belongs to D, not L.
        let mut col = 0;
        while col < q {
            match d.pivot_type(col) {
                PivotType::OneByOne => {
                    for i in (col + 1)..m {
                        l_full[(i, col)] = factored[(i, col)];
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    // a[(col+1, col)] is D off-diagonal, not L
                    for i in (col + 2)..m {
                        l_full[(i, col)] = factored[(i, col)];
                        l_full[(i, col + 1)] = factored[(i, col + 1)];
                    }
                    col += 2;
                }
                _ => {
                    // Defensive: a TwoByTwo whose partner < col should not
                    // be reachable here, since the first column of each pair
                    // advances by 2 above.
                    col += 1;
                }
            }
        }

        // Build D (q x q) as an explicit dense matrix with symmetric 2x2
        // blocks on the diagonal.
        let mut d_mat = Mat::zeros(q, q);
        col = 0;
        while col < q {
            match d.pivot_type(col) {
                PivotType::OneByOne => {
                    d_mat[(col, col)] = d.diagonal_1x1(col);
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    let block = d.diagonal_2x2(col);
                    d_mat[(col, col)] = block.a;
                    d_mat[(col, col + 1)] = block.b;
                    d_mat[(col + 1, col)] = block.b;
                    d_mat[(col + 1, col + 1)] = block.c;
                    col += 2;
                }
                _ => {
                    // Defensive fallthrough, mirroring the L extraction loop.
                    col += 1;
                }
            }
        }

        // Compute L * D * L^T (m x m) naively — O(m^2 q), fine for test sizes.
        // First: W = L * D (m x q)
        let mut w = Mat::zeros(m, q);
        for i in 0..m {
            for j in 0..q {
                let mut sum = 0.0;
                for k in 0..q {
                    sum += l_full[(i, k)] * d_mat[(k, j)];
                }
                w[(i, j)] = sum;
            }
        }
        // Then: LDL^T = W * L^T (m x m)
        let mut ldlt = Mat::zeros(m, m);
        for i in 0..m {
            for j in 0..m {
                let mut sum = 0.0;
                for k in 0..q {
                    sum += w[(i, k)] * l_full[(j, k)];
                }
                ldlt[(i, j)] = sum;
            }
        }

        // Extract Schur complement S = factored[q..m, q..m]
        // The residual should be: PAP^T - LDL^T = [0 0; 0 S]
        // So: PAP^T[i,j] - LDL^T[i,j] should be:
        //   0 if i < q or j < q
        //   S[i-q, j-q] = factored[i,j] if i >= q and j >= q
        let mut max_error = 0.0_f64;
        let mut norm_sq = 0.0_f64;
        let mut diff_sq = 0.0_f64;

        // Only check lower triangle (i >= j) because swap_symmetric only
        // maintains the lower triangle of the dense frontal matrix.
        // Production code (extract_contribution, extend_add) also reads
        // only the lower triangle.
        for i in 0..m {
            for j in 0..=i {
                // Count lower triangle entries twice for norm (symmetric)
                let weight = if i == j { 1.0 } else { 2.0 };
                norm_sq += weight * papt[(i, j)] * papt[(i, j)];
                let residual = papt[(i, j)] - ldlt[(i, j)];
                if i >= q && j >= q {
                    // This should equal factored[i, j] (the Schur complement)
                    let schur_entry = factored[(i, j)];
                    let err = (residual - schur_entry).abs();
                    if err > max_error {
                        max_error = err;
                    }
                    diff_sq += weight * (residual - schur_entry) * (residual - schur_entry);
                } else {
                    // Should be zero
                    diff_sq += weight * residual * residual;
                    if residual.abs() > max_error {
                        max_error = residual.abs();
                    }
                }
            }
        }

        // Relative error when the reference norm is nonzero; absolute
        // otherwise (e.g. an all-zero input matrix).
        if norm_sq == 0.0 {
            diff_sq.sqrt()
        } else {
            (diff_sq / norm_sq).sqrt()
        }
    }
5542
5543    #[test]
5544    fn test_factor_inner_partial_with_delays_schur_check() {
5545        // This test exercises factor_inner on frontal matrices where
5546        // num_fully_summed < m (partial factorization with contribution block),
5547        // AND delays occur. This is exactly the scenario in multifrontal
5548        // factorization where the bug was observed.
5549        //
5550        // The test verifies that:
5551        //   PAP^T = L * D * L^T + [0 0; 0 S]
5552        // where S is the Schur complement stored in the lower-right of the
5553        // factored matrix.
5554        //
5555        // If backup/restore/update_cross_terms is wrong, S will be corrupted.
5556
5557        struct PartialConfig {
5558            m: usize,  // total matrix dimension
5559            p: usize,  // num_fully_summed
5560            ib: usize, // inner block size
5561            seed: u64,
5562            scale: f64,
5563        }
5564
5565        let configs = [
5566            // Small cases: 12x12 with p=8, ib=4 (2 inner blocks, 4 contribution rows)
5567            PartialConfig {
5568                m: 12,
5569                p: 8,
5570                ib: 4,
5571                seed: 42,
5572                scale: 1000.0,
5573            },
5574            PartialConfig {
5575                m: 12,
5576                p: 8,
5577                ib: 4,
5578                seed: 137,
5579                scale: 1000.0,
5580            },
5581            PartialConfig {
5582                m: 12,
5583                p: 8,
5584                ib: 2,
5585                seed: 42,
5586                scale: 1000.0,
5587            },
5588            // Medium: 16x12 (p=12 of 16), ib=4
5589            PartialConfig {
5590                m: 16,
5591                p: 12,
5592                ib: 4,
5593                seed: 42,
5594                scale: 1000.0,
5595            },
5596            PartialConfig {
5597                m: 16,
5598                p: 12,
5599                ib: 4,
5600                seed: 271,
5601                scale: 1000.0,
5602            },
5603            // 20x16, ib=4
5604            PartialConfig {
5605                m: 20,
5606                p: 16,
5607                ib: 4,
5608                seed: 42,
5609                scale: 1000.0,
5610            },
5611            PartialConfig {
5612                m: 20,
5613                p: 16,
5614                ib: 8,
5615                seed: 42,
5616                scale: 1000.0,
5617            },
5618            // Larger: 32x24, ib=8
5619            PartialConfig {
5620                m: 32,
5621                p: 24,
5622                ib: 8,
5623                seed: 42,
5624                scale: 1000.0,
5625            },
5626            PartialConfig {
5627                m: 32,
5628                p: 24,
5629                ib: 4,
5630                seed: 42,
5631                scale: 1000.0,
5632            },
5633            // 48x32, ib=8
5634            PartialConfig {
5635                m: 48,
5636                p: 32,
5637                ib: 8,
5638                seed: 42,
5639                scale: 1000.0,
5640            },
5641            // Extreme scale
5642            PartialConfig {
5643                m: 16,
5644                p: 12,
5645                ib: 4,
5646                seed: 42,
5647                scale: 1e6,
5648            },
5649            // More seeds
5650            PartialConfig {
5651                m: 16,
5652                p: 12,
5653                ib: 4,
5654                seed: 314,
5655                scale: 1000.0,
5656            },
5657            PartialConfig {
5658                m: 16,
5659                p: 12,
5660                ib: 4,
5661                seed: 997,
5662                scale: 1000.0,
5663            },
5664        ];
5665
5666        let mut any_delays = false;
5667        let mut worst_error = 0.0_f64;
5668        let mut worst_config = String::new();
5669
5670        for (idx, config) in configs.iter().enumerate() {
5671            let mut a = deterministic_indefinite_matrix(config.m, config.seed, 5.0);
5672            cause_delays(&mut a, config.seed + 1000, config.scale);
5673
5674            let opts = AptpOptions {
5675                inner_block_size: config.ib,
5676                outer_block_size: config.m.max(config.ib),
5677                threshold: 0.01,
5678                failed_pivot_method: FailedPivotMethod::Pass,
5679                ..AptpOptions::default()
5680            };
5681
5682            let original = a.clone();
5683            let result = aptp_factor_in_place(a.as_mut(), config.p, &opts).unwrap();
5684
5685            let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
5686            assert_eq!(
5687                sum, config.p,
5688                "config {} (m={}, p={}, ib={}): stats invariant {} != {}",
5689                idx, config.m, config.p, config.ib, sum, config.p
5690            );
5691
5692            if result.stats.num_delayed > 0 {
5693                any_delays = true;
5694            }
5695
5696            // Check partial factorization correctness (PAP^T = LDL^T + [0;S])
5697            let error = check_partial_factorization_in_place(&original, &a, config.p, &result);
5698
5699            if error > worst_error {
5700                worst_error = error;
5701                worst_config = format!(
5702                    "config {} (m={}, p={}, ib={}, seed={})",
5703                    idx, config.m, config.p, config.ib, config.seed
5704                );
5705            }
5706        }
5707
5708        assert!(
5709            any_delays,
5710            "No configurations triggered threshold delays. \
5711             Adjust scale or matrix construction to create delays."
5712        );
5713
5714        assert!(
5715            worst_error < 1e-10,
5716            "Worst partial factorization error: {:.2e} at {}\n\
5717             This indicates corrupted Schur complement — bug in \
5718             backup/restore/update_cross_terms logic in factor_inner.",
5719            worst_error,
5720            worst_config
5721        );
5722    }
5723
5724    // ---- NFS boundary in two-level factor tests ----
5725
5726    #[test]
5727    fn test_two_level_nfs_boundary_mid_block() {
5728        // Exercises the local nfs_boundary = p - col_start computation in
5729        // two_level_factor. When p is not a multiple of outer_block_size,
5730        // the last outer block has nfs_boundary < block_cols, meaning the
5731        // NFS region starts inside an inner block. update_trailing must
5732        // correctly restrict its Schur update to the FS region and skip
5733        // the NFS×NFS region (deferred to compute_contribution_gemm).
5734        //
5735        // If nfs_boundary is wrong, the Schur complement (contribution
5736        // block) will be corrupted, failing the reconstruction check.
5737
5738        struct Config {
5739            m: usize,
5740            p: usize,
5741            nb: usize,
5742            ib: usize,
5743            seed: u64,
5744        }
5745
5746        let configs = [
5747            // p=20, nb=16: first block nfs_boundary=20, second block nfs_boundary=4
5748            // (NFS starts at col 4 of the second 16-col outer block)
5749            Config {
5750                m: 48,
5751                p: 20,
5752                nb: 16,
5753                ib: 8,
5754                seed: 42,
5755            },
5756            // p=40, nb=32: first block nfs_boundary=40, second block nfs_boundary=8
5757            Config {
5758                m: 64,
5759                p: 40,
5760                nb: 32,
5761                ib: 8,
5762                seed: 42,
5763            },
5764            // p=10, nb=8: first block nfs_boundary=10, second block nfs_boundary=2
5765            // (very small NFS boundary in inner block)
5766            Config {
5767                m: 24,
5768                p: 10,
5769                nb: 8,
5770                ib: 4,
5771                seed: 137,
5772            },
5773            // p=25, nb=16: blocks at col_start=0 (nfs=25), col_start=16 (nfs=9)
5774            Config {
5775                m: 48,
5776                p: 25,
5777                nb: 16,
5778                ib: 8,
5779                seed: 271,
5780            },
5781            // Deliberately misalign p with ib too: p=13, nb=8, ib=4
5782            // block 0: nfs_boundary=13, block 1: nfs_boundary=5 (mid-ib boundary)
5783            Config {
5784                m: 32,
5785                p: 13,
5786                nb: 8,
5787                ib: 4,
5788                seed: 42,
5789            },
5790        ];
5791
5792        let mut worst_error = 0.0_f64;
5793        let mut worst_label = String::new();
5794
5795        for (idx, c) in configs.iter().enumerate() {
5796            let a = deterministic_indefinite_matrix(c.m, c.seed, 5.0);
5797
5798            let opts = AptpOptions {
5799                outer_block_size: c.nb,
5800                inner_block_size: c.ib,
5801                threshold: 0.01,
5802                failed_pivot_method: FailedPivotMethod::Pass,
5803                ..AptpOptions::default()
5804            };
5805
5806            let original = a.clone();
5807            let mut a_copy = a;
5808            let result = aptp_factor_in_place(a_copy.as_mut(), c.p, &opts).unwrap();
5809
5810            let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
5811            assert_eq!(
5812                sum, c.p,
5813                "config {}: statistics invariant {} != {}",
5814                idx, sum, c.p
5815            );
5816
5817            let error = check_partial_factorization_in_place(&original, &a_copy, c.p, &result);
5818
5819            if error > worst_error {
5820                worst_error = error;
5821                worst_label = format!(
5822                    "config {} (m={}, p={}, nb={}, ib={}, seed={})",
5823                    idx, c.m, c.p, c.nb, c.ib, c.seed
5824                );
5825            }
5826        }
5827
5828        assert!(
5829            worst_error < 1e-10,
5830            "Worst reconstruction error: {:.2e} at {}\n\
5831             This indicates the local nfs_boundary computation in \
5832             two_level_factor is producing incorrect Schur complements.",
5833            worst_error,
5834            worst_label
5835        );
5836    }
5837
5838    // ---- TPP (Threshold Partial Pivoting) fallback tests ----
5839
5840    #[test]
5841    fn test_tpp_helpers() {
5842        // 4x4 lower triangle:
5843        //  [  4.0   .    .    .  ]
5844        //  [  0.5  -3.0  .    .  ]
5845        //  [  0.1   0.2  5.0  .  ]
5846        //  [  0.3  -0.4  0.6  2.0]
5847        let a = symmetric_matrix(4, |i, j| {
5848            let vals = [
5849                [4.0, 0.5, 0.1, 0.3],
5850                [0.5, -3.0, 0.2, -0.4],
5851                [0.1, 0.2, 5.0, 0.6],
5852                [0.3, -0.4, 0.6, 2.0],
5853            ];
5854            vals[i][j]
5855        });
5856
5857        // tpp_is_col_small: col 0, from=0, to=4, small=5.0 → true (all entries < 5.0)
5858        assert!(tpp_is_col_small(a.as_ref(), 0, 0, 4, 5.0));
5859        // With small=0.01 → false (0.5, 0.1, 0.3 ≥ 0.01 and diag 4.0 ≥ 0.01)
5860        assert!(!tpp_is_col_small(a.as_ref(), 0, 0, 4, 0.01));
5861
5862        // Zero column → true
5863        let z = Mat::zeros(4, 4);
5864        assert!(tpp_is_col_small(z.as_ref(), 0, 0, 4, 1e-20));
5865
5866        // tpp_find_row_abs_max: row 3, cols 0..3
5867        // a[(3,0)]=0.3, a[(3,1)]=-0.4, a[(3,2)]=0.6 → max at col 2
5868        let t = tpp_find_row_abs_max(a.as_ref(), 3, 0, 3);
5869        assert_eq!(t, 2, "expected col 2, got {}", t);
5870
5871        // tpp_find_rc_abs_max_exclude: col 1, nelim=0, m=4, exclude=3
5872        // row part: a[(1,0)]=0.5
5873        // col part: a[(2,1)]=0.2 (excluding row 3)
5874        // max = 0.5
5875        let max_exc = tpp_find_rc_abs_max_exclude(a.as_ref(), 1, 0, 4, 3);
5876        assert!(
5877            (max_exc - 0.5).abs() < 1e-15,
5878            "expected 0.5, got {}",
5879            max_exc
5880        );
5881
5882        // tpp_test_2x2: (0, 1) with a11=4, a21=0.5, a22=-3
5883        // det = 4*(-3) - 0.25 = -12.25 → non-singular
5884        // With small=1e-20, u=0.01
5885        let result = tpp_test_2x2(a.as_ref(), 0, 1, 0.6, 0.6, 0.01, 1e-20);
5886        assert!(result.is_some(), "2x2 pivot (0,1) should pass");
5887        let (d11, d12, d22) = result.unwrap();
5888        assert!((d11 - 4.0).abs() < 1e-15);
5889        assert!((d12 - 0.5).abs() < 1e-15);
5890        assert!((d22 - (-3.0)).abs() < 1e-15);
5891
5892        // tpp_test_2x2 with near-singular block
5893        let tiny = symmetric_matrix(2, |_, _| 1e-25);
5894        assert!(tpp_test_2x2(tiny.as_ref(), 0, 1, 1e-25, 1e-25, 0.01, 1e-20).is_none());
5895    }
5896
5897    #[test]
5898    fn test_tpp_basic_1x1() {
5899        // 4x4 diagonal-dominant matrix: all diagonals are acceptable 1x1 pivots
5900        let a = symmetric_matrix(4, |i, j| {
5901            if i == j {
5902                [10.0, -8.0, 5.0, 7.0][i]
5903            } else {
5904                0.1
5905            }
5906        });
5907
5908        let opts = AptpOptions::default();
5909        let result = aptp_factor(a.as_ref(), &opts).unwrap();
5910
5911        assert_eq!(result.stats.num_delayed, 0);
5912        assert_eq!(result.stats.num_1x1 + 2 * result.stats.num_2x2, 4);
5913
5914        let error =
5915            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
5916        assert!(error < 1e-12, "reconstruction error {:.2e}", error);
5917    }
5918
5919    #[test]
5920    fn test_tpp_basic_2x2() {
5921        // 4x4 matrix where diagonal entries are small (fail 1x1 threshold)
5922        // but off-diagonal entries create good 2x2 pivots.
5923        let a = symmetric_matrix(4, |i, j| {
5924            let vals = [
5925                [0.001, 5.0, 0.01, 0.01],
5926                [5.0, 0.001, 0.01, 0.01],
5927                [0.01, 0.01, 0.001, 5.0],
5928                [0.01, 0.01, 5.0, 0.001],
5929            ];
5930            vals[i][j]
5931        });
5932
5933        let opts = AptpOptions::default();
5934        let result = aptp_factor(a.as_ref(), &opts).unwrap();
5935
5936        assert_eq!(result.stats.num_delayed, 0);
5937
5938        let error =
5939            dense_reconstruction_error(&a, &result.l, &result.d, result.perm.as_ref().arrays().0);
5940        assert!(error < 1e-12, "reconstruction error {:.2e}", error);
5941    }
5942
5943    #[test]
5944    fn test_tpp_fallback_after_aptp() {
5945        // 16x16 matrix designed so APTP delays columns that TPP recovers.
5946        // Small inner block size forces many ib-scoped searches that fail,
5947        // but TPP's exhaustive search finds pivots.
5948        //
5949        // Matrix structure:
5950        //   Block [0:8, 0:8]: diagonal 10.0, small off-diag → easy APTP pivots
5951        //   Block [8:12, 8:12]: diagonal 0.005, large off-diag → APTP delays
5952        //   Block [12:16, 12:16]: diagonal 20.0 → easy pivots
5953        //   Cross [8:12, 0:8]: large entries → panel L exceeds threshold
5954        let n = 16;
5955        let a = symmetric_matrix(n, |i, j| {
5956            match (i, j) {
5957                // Top-left 8x8: easy pivots
5958                (i, j) if i < 8 && j < 8 => {
5959                    if i == j {
5960                        10.0
5961                    } else {
5962                        0.01
5963                    }
5964                }
5965                // Middle 4x4: hard pivots (small diag, large off-diag)
5966                (i, j) if (8..12).contains(&i) && (8..12).contains(&j) => {
5967                    if i == j {
5968                        0.005
5969                    } else {
5970                        2.0
5971                    }
5972                }
5973                // Bottom-right 4x4: easy pivots
5974                (i, j) if i >= 12 && j >= 12 => {
5975                    if i == j {
5976                        20.0
5977                    } else {
5978                        0.1
5979                    }
5980                }
5981                // Cross terms: large
5982                (i, j) if (8..12).contains(&i) && j < 8 => 3.0,
5983                (i, j) if i < 8 && (8..12).contains(&j) => 3.0,
5984                // Small cross terms elsewhere
5985                _ => 0.01,
5986            }
5987        });
5988
5989        // With TPP disabled: APTP delays some columns
5990        let opts_pass = AptpOptions {
5991            inner_block_size: 4,
5992            failed_pivot_method: FailedPivotMethod::Pass,
5993            ..AptpOptions::default()
5994        };
5995        let result_pass = aptp_factor(a.as_ref(), &opts_pass).unwrap();
5996
5997        // With TPP enabled: should eliminate more columns
5998        let opts_tpp = AptpOptions {
5999            inner_block_size: 4,
6000            failed_pivot_method: FailedPivotMethod::Tpp,
6001            ..AptpOptions::default()
6002        };
6003        let result_tpp = aptp_factor(a.as_ref(), &opts_tpp).unwrap();
6004
6005        // TPP should recover at least some of APTP's delays
6006        assert!(
6007            result_tpp.stats.num_delayed <= result_pass.stats.num_delayed,
6008            "TPP should not increase delays"
6009        );
6010
6011        // If fully eliminated, check reconstruction
6012        if result_tpp.stats.num_delayed == 0 {
6013            let error = dense_reconstruction_error(
6014                &a,
6015                &result_tpp.l,
6016                &result_tpp.d,
6017                result_tpp.perm.as_ref().arrays().0,
6018            );
6019            assert!(
6020                error < 1e-12,
6021                "TPP reconstruction error {:.2e} >= 1e-12",
6022                error
6023            );
6024        }
6025    }
6026
6027    #[test]
6028    fn test_tpp_reconstruction_stress() {
6029        // 256x256 random indefinite matrix. Factor with TPP and verify reconstruction.
6030        use rand::SeedableRng;
6031        use rand::rngs::StdRng;
6032        use rand_distr::{Distribution, Uniform};
6033
6034        let n = 256;
6035        let mut rng = StdRng::seed_from_u64(42);
6036        let dist = Uniform::new(-1.0f64, 1.0).unwrap();
6037
6038        let mut a = Mat::zeros(n, n);
6039        for i in 0..n {
6040            for j in 0..=i {
6041                let v: f64 = dist.sample(&mut rng);
6042                a[(i, j)] = v;
6043                a[(j, i)] = v;
6044            }
6045            // Strengthen diagonal to avoid excessive delays
6046            let sign = if i % 3 == 0 { -1.0 } else { 1.0 };
6047            a[(i, i)] = sign * (5.0 + dist.sample(&mut rng).abs());
6048        }
6049
6050        let opts = AptpOptions {
6051            failed_pivot_method: FailedPivotMethod::Tpp,
6052            ..AptpOptions::default()
6053        };
6054        let result = aptp_factor(a.as_ref(), &opts).unwrap();
6055
6056        if result.stats.num_delayed == 0 {
6057            let error = dense_reconstruction_error(
6058                &a,
6059                &result.l,
6060                &result.d,
6061                result.perm.as_ref().arrays().0,
6062            );
6063            assert!(
6064                error < 1e-10,
6065                "stress reconstruction error {:.2e} >= 1e-10",
6066                error
6067            );
6068        }
6069
6070        // Invariant check
6071        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
6072        assert_eq!(sum, n, "statistics invariant: {} != {}", sum, n);
6073    }
6074
6075    #[test]
6076    fn test_failed_pivot_method_pass() {
6077        // Verify FailedPivotMethod::Pass skips TPP and preserves delays.
6078        // Use a matrix that APTP cannot fully factor (small diag, large off-diag).
6079        let n = 4;
6080        let a = symmetric_matrix(n, |i, j| {
6081            let vals = [
6082                [0.001, 5.0, 0.01, 0.01],
6083                [5.0, 0.001, 0.01, 0.01],
6084                [0.01, 0.01, 0.001, 5.0],
6085                [0.01, 0.01, 5.0, 0.001],
6086            ];
6087            vals[i][j]
6088        });
6089
6090        // With Pass: APTP's complete pivoting on ib-blocks should still handle
6091        // this via 2x2 pivots. But with a very small block size, it might delay.
6092        let opts_pass = AptpOptions {
6093            inner_block_size: 2,
6094            failed_pivot_method: FailedPivotMethod::Pass,
6095            ..AptpOptions::default()
6096        };
6097        let result_pass = aptp_factor(a.as_ref(), &opts_pass).unwrap();
6098
6099        let opts_tpp = AptpOptions {
6100            inner_block_size: 2,
6101            failed_pivot_method: FailedPivotMethod::Tpp,
6102            ..AptpOptions::default()
6103        };
6104        let result_tpp = aptp_factor(a.as_ref(), &opts_tpp).unwrap();
6105
6106        // TPP should not have more delays than Pass
6107        assert!(
6108            result_tpp.stats.num_delayed <= result_pass.stats.num_delayed,
6109            "TPP delays {} > Pass delays {}",
6110            result_tpp.stats.num_delayed,
6111            result_pass.stats.num_delayed
6112        );
6113
6114        // Both must satisfy invariant
6115        let sum_pass = result_pass.stats.num_1x1
6116            + 2 * result_pass.stats.num_2x2
6117            + result_pass.stats.num_delayed;
6118        let sum_tpp =
6119            result_tpp.stats.num_1x1 + 2 * result_tpp.stats.num_2x2 + result_tpp.stats.num_delayed;
6120        assert_eq!(sum_pass, n);
6121        assert_eq!(sum_tpp, n);
6122    }
6123
6124    #[test]
6125    fn test_tpp_zero_pivot_handling() {
6126        // Matrix with some zero columns — TPP should handle gracefully.
6127        let n = 4;
6128        let a = symmetric_matrix(n, |i, j| {
6129            if i == j {
6130                [5.0, 0.0, 0.0, 3.0][i]
6131            } else if (i == 0 && j == 3) || (i == 3 && j == 0) {
6132                0.1
6133            } else {
6134                0.0
6135            }
6136        });
6137
6138        let opts = AptpOptions {
6139            failed_pivot_method: FailedPivotMethod::Tpp,
6140            ..AptpOptions::default()
6141        };
6142        let result = aptp_factor(a.as_ref(), &opts).unwrap();
6143
6144        // Should handle zero pivots (columns 1, 2) as zero 1x1 pivots
6145        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
6146        assert_eq!(sum, n, "invariant: {} != {}", sum, n);
6147    }
6148
6149    // ---- Rectangular TPP kernel tests ----
6150
6151    #[test]
6152    fn test_tpp_helpers_rect() {
6153        // Test unified helpers on a 5×3 rectangular matrix (m=5, n=3).
6154        // Lower-triangle-like storage:
6155        //   col 0    col 1    col 2
6156        //  [ 4.0      .        .    ]  row 0
6157        //  [ 0.5     -3.0      .    ]  row 1
6158        //  [ 0.1      0.2     5.0   ]  row 2
6159        //  [ 0.3     -0.4     0.6   ]  row 3
6160        //  [ 0.7      0.8    -0.9   ]  row 4
6161        let a = Mat::from_fn(5, 3, |i, j| {
6162            let vals: [[f64; 3]; 5] = [
6163                [4.0, 0.0, 0.0],
6164                [0.5, -3.0, 0.0],
6165                [0.1, 0.2, 5.0],
6166                [0.3, -0.4, 0.6],
6167                [0.7, 0.8, -0.9],
6168            ];
6169            vals[i][j]
6170        });
6171
6172        // tpp_sym_entry: row=3, col=1 → a[(3,1)] = -0.4 (row >= col, col < n)
6173        assert!((tpp_sym_entry(a.as_ref(), 3, 1) - (-0.4)).abs() < 1e-15);
6174        // tpp_sym_entry: row=0, col=2 → a[(2,0)] = 0.1 (row < col, symmetric)
6175        assert!((tpp_sym_entry(a.as_ref(), 0, 2) - 0.1).abs() < 1e-15);
6176        // tpp_sym_entry: row=1, col=0 → a[(1,0)] = 0.5 (row >= col)
6177        assert!((tpp_sym_entry(a.as_ref(), 1, 0) - 0.5).abs() < 1e-15);
6178
6179        // tpp_is_col_small on rect: col 0, from=0, to=5, small=5.0 → true
6180        assert!(tpp_is_col_small(a.as_ref(), 0, 0, 5, 5.0));
6181        // col 0 with small=0.01 → false (0.5, 0.3, 0.7 ≥ 0.01)
6182        assert!(!tpp_is_col_small(a.as_ref(), 0, 0, 5, 0.01));
6183        // col 3 (idx >= ncols=3): row entries a[(3,0)]=0.3, a[(3,1)]=-0.4, a[(3,2)]=0.6
6184        // column entries: idx=3 >= n=3, so no column scan
6185        assert!(!tpp_is_col_small(a.as_ref(), 3, 0, 5, 0.5));
6186        // with larger threshold
6187        assert!(tpp_is_col_small(a.as_ref(), 3, 0, 5, 1.0));
6188
6189        // tpp_find_row_abs_max: row 4, cols 0..3
6190        // a[(4,0)]=0.7, a[(4,1)]=0.8, a[(4,2)]=-0.9 → max at col 2
6191        let t = tpp_find_row_abs_max(a.as_ref(), 4, 0, 3);
6192        assert_eq!(t, 2, "expected col 2, got {t}");
6193
6194        // tpp_find_rc_abs_max_exclude on rect: col 1, nelim=0, m=5, exclude=3
6195        // row: a[(1,0)]=0.5
6196        // col: a[(2,1)]=0.2, a[(4,1)]=0.8 (excluding row 3)
6197        // max = 0.8
6198        let max_exc = tpp_find_rc_abs_max_exclude(a.as_ref(), 1, 0, 5, 3);
6199        assert!((max_exc - 0.8).abs() < 1e-15, "expected 0.8, got {max_exc}");
6200
6201        // tpp_find_rc_abs_max_exclude for col >= ncols: col 3, nelim=0, m=5, exclude=usize::MAX
6202        // row: a[(3,0)]=0.3, a[(3,1)]=-0.4, a[(3,2)]=0.6
6203        // col: col=3 >= n=3, no column scan
6204        // max = 0.6
6205        let max_exc2 = tpp_find_rc_abs_max_exclude(a.as_ref(), 3, 0, 5, usize::MAX);
6206        assert!(
6207            (max_exc2 - 0.6).abs() < 1e-15,
6208            "expected 0.6, got {max_exc2}"
6209        );
6210    }
6211
    /// Reconstruct P^T A_FS P = L D L^T from rectangular factorization results.
    ///
    /// For a rectangular m×k factorization with ne eliminated columns:
    ///   - A_FS = the k×k fully-summed subblock of the original matrix
    ///   - L = ne×ne unit lower triangular (from l_storage[0..ne, 0..ne])
    ///   - D = ne×ne block diagonal
    ///   - P = permutation of the k FS columns
    ///
    /// Returns ||P^T A_FS P - L D L^T|| / ||A_FS|| for the leading ne×ne block.
    /// Uses naive O(ne^3) triple-loop products — fine for test-sized matrices.
    fn rect_reconstruction_error(
        original_rect: &Mat<f64>,
        factored_rect: &Mat<f64>,
        k: usize,
        result: &AptpFactorResult,
    ) -> f64 {
        let ne = result.num_eliminated;
        // Nothing eliminated: nothing to reconstruct, so no error by definition.
        if ne == 0 {
            return 0.0;
        }

        // Build L11 (ne × ne) unit lower triangular
        let mut l = Mat::zeros(ne, ne);
        let mut col = 0;
        while col < ne {
            l[(col, col)] = 1.0;
            match result.d.pivot_type(col) {
                PivotType::OneByOne => {
                    // 1x1 pivot: copy this column's subdiagonal L entries.
                    for r in (col + 1)..ne {
                        l[(r, col)] = factored_rect[(r, col)];
                    }
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    // Leading column of a 2x2 pair: both columns get a unit
                    // diagonal; l[(col+1, col)] is left at its zero init, and
                    // the entries below the pair are copied for both columns.
                    l[(col + 1, col + 1)] = 1.0;
                    for r in (col + 2)..ne {
                        l[(r, col)] = factored_rect[(r, col)];
                        l[(r, col + 1)] = factored_rect[(r, col + 1)];
                    }
                    col += 2;
                }
                _ => {
                    // Trailing column of a 2x2 pair (partner < col): its L
                    // entries were already filled when the leading column
                    // was visited, so just advance.
                    col += 1;
                }
            }
        }

        // Build D (ne × ne)
        let mut d_mat = Mat::zeros(ne, ne);
        col = 0;
        while col < ne {
            match result.d.pivot_type(col) {
                PivotType::OneByOne => {
                    d_mat[(col, col)] = result.d.diagonal_1x1(col);
                    col += 1;
                }
                PivotType::TwoByTwo { partner } if partner > col => {
                    // Symmetric 2x2 block [[a, b], [b, c]] on the diagonal.
                    let block = result.d.diagonal_2x2(col);
                    d_mat[(col, col)] = block.a;
                    d_mat[(col, col + 1)] = block.b;
                    d_mat[(col + 1, col)] = block.b;
                    d_mat[(col + 1, col + 1)] = block.c;
                    col += 2;
                }
                _ => {
                    // Trailing half of a 2x2 pair: handled with its leader.
                    col += 1;
                }
            }
        }

        // L * D
        let mut w = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..ne {
                let mut sum = 0.0;
                for kk in 0..ne {
                    sum += l[(i, kk)] * d_mat[(kk, j)];
                }
                w[(i, j)] = sum;
            }
        }

        // W * L^T = L D L^T
        let mut ldlt = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..ne {
                let mut sum = 0.0;
                for kk in 0..ne {
                    sum += w[(i, kk)] * l[(j, kk)];
                }
                ldlt[(i, j)] = sum;
            }
        }

        // Build P^T A_FS P (ne × ne leading block)
        // The original matrix is m×k (rectangular). The FS block is the k×k
        // symmetric subblock stored in the lower triangle of original_rect[0..k, 0..k].
        let perm = &result.perm;
        let mut pap = Mat::zeros(ne, ne);
        for i in 0..ne {
            for j in 0..ne {
                let pi = perm[i];
                let pj = perm[j];
                // Read from original_rect using symmetric access
                let val = if pi >= pj && pj < k {
                    original_rect[(pi, pj)]
                } else if pj > pi && pi < k {
                    original_rect[(pj, pi)]
                } else {
                    // NOTE(review): permuted FS indices should always hit one
                    // of the branches above (perm entries < k) — confirm;
                    // anything else is treated as an implicit zero.
                    0.0
                };
                pap[(i, j)] = val;
            }
        }

        // ||P^T A P - L D L^T|| / ||A_FS||
        let mut diff_sq = 0.0_f64;
        let mut orig_sq = 0.0_f64;
        for i in 0..ne {
            for j in 0..ne {
                let d = pap[(i, j)] - ldlt[(i, j)];
                diff_sq += d * d;
                orig_sq += pap[(i, j)] * pap[(i, j)];
            }
        }
        // All-zero reference block: avoid dividing by zero and return the
        // absolute residual norm instead of a relative one.
        if orig_sq == 0.0 {
            return diff_sq.sqrt();
        }
        (diff_sq / orig_sq).sqrt()
    }
6341
6342    #[test]
6343    fn test_tpp_factor_rect_basic_1x1() {
6344        // 6×4 rect matrix (m=6, k=4 fully-summed) with diagonal-dominant structure.
6345        // All pivots should be 1x1.
6346        let m = 6;
6347        let k = 4;
6348        let mut a = Mat::zeros(m, k);
6349        // Fill lower-triangle-like data: diag-dominant FS block + NFS rows
6350        let vals: [[f64; 4]; 6] = [
6351            [10.0, 0.0, 0.0, 0.0],
6352            [0.1, -8.0, 0.0, 0.0],
6353            [0.2, 0.1, 12.0, 0.0],
6354            [0.1, -0.2, 0.3, 7.0],
6355            [0.3, 0.1, -0.1, 0.2], // NFS row
6356            [0.1, -0.3, 0.2, 0.1], // NFS row
6357        ];
6358        for i in 0..m {
6359            for j in 0..k {
6360                a[(i, j)] = vals[i][j];
6361            }
6362        }
6363
6364        let original = a.clone();
6365        let opts = AptpOptions::default();
6366        let result = tpp_factor_rect(a.as_mut(), k, &opts).unwrap();
6367
6368        assert_eq!(
6369            result.num_eliminated, 4,
6370            "expected all 4 columns eliminated"
6371        );
6372        assert_eq!(result.stats.num_delayed, 0);
6373
6374        let error = rect_reconstruction_error(&original, &a, k, &result);
6375        assert!(
6376            error < 1e-12,
6377            "rect 1x1 reconstruction error {error:.2e} >= 1e-12"
6378        );
6379    }
6380
6381    #[test]
6382    fn test_tpp_factor_rect_basic_2x2() {
6383        // 6×4 rect matrix with small diagonals and large off-diagonals
6384        // to force 2x2 pivots.
6385        let m = 6;
6386        let k = 4;
6387        let mut a = Mat::zeros(m, k);
6388        let vals: [[f64; 4]; 6] = [
6389            [0.001, 0.0, 0.0, 0.0],
6390            [5.0, 0.001, 0.0, 0.0],
6391            [0.01, 0.01, 0.001, 0.0],
6392            [0.01, 0.01, 5.0, 0.001],
6393            [0.1, 0.2, 0.1, 0.2], // NFS row
6394            [0.3, 0.1, 0.3, 0.1], // NFS row
6395        ];
6396        for i in 0..m {
6397            for j in 0..k {
6398                a[(i, j)] = vals[i][j];
6399            }
6400        }
6401
6402        let original = a.clone();
6403        let opts = AptpOptions::default();
6404        let result = tpp_factor_rect(a.as_mut(), k, &opts).unwrap();
6405
6406        assert_eq!(result.stats.num_delayed, 0, "expected no delays");
6407        assert!(result.stats.num_2x2 > 0, "expected at least one 2x2 pivot");
6408
6409        let error = rect_reconstruction_error(&original, &a, k, &result);
6410        assert!(
6411            error < 1e-12,
6412            "rect 2x2 reconstruction error {error:.2e} >= 1e-12"
6413        );
6414    }
6415
6416    #[test]
6417    fn test_tpp_factor_rect_with_delays() {
6418        // 8×4 rect matrix designed to produce delays.
6419        // All FS diagonals are tiny, off-diagonals create a poorly conditioned block.
6420        let m = 8;
6421        let k = 4;
6422        let mut a = Mat::zeros(m, k);
6423        // Very small diagonals, moderately sized but not pairable off-diags
6424        let vals: [[f64; 4]; 8] = [
6425            [1e-25, 0.0, 0.0, 0.0],
6426            [1e-25, 1e-25, 0.0, 0.0],
6427            [1e-25, 1e-25, 1e-25, 0.0],
6428            [1e-25, 1e-25, 1e-25, 1e-25],
6429            [0.1, 0.2, 0.3, 0.4],
6430            [0.5, 0.6, 0.7, 0.8],
6431            [0.2, 0.3, 0.4, 0.5],
6432            [0.1, 0.1, 0.1, 0.1],
6433        ];
6434        for i in 0..m {
6435            for j in 0..k {
6436                a[(i, j)] = vals[i][j];
6437            }
6438        }
6439
6440        let opts = AptpOptions::default();
6441        let result = tpp_factor_rect(a.as_mut(), k, &opts).unwrap();
6442
6443        // With all-tiny entries, the zero-pivot path should handle most columns
6444        // (set D=0, zero out L entries). This exercises the zero-pivot + rect path.
6445        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
6446        assert_eq!(sum, k, "invariant: {sum} != {k}");
6447    }
6448
6449    #[test]
6450    fn test_tpp_factor_rect_square_matches() {
6451        // Factor a 4×4 matrix with both tpp_factor_rect (4×4 rect, m=n=k=4) and
6452        // tpp_factor (square, start_col=0). Verify bit-identical results.
6453        let n = 4;
6454        let a = symmetric_matrix(n, |i, j| {
6455            let vals = [
6456                [10.0, 0.5, 0.1, 0.3],
6457                [0.5, -8.0, 0.2, -0.4],
6458                [0.1, 0.2, 12.0, 0.6],
6459                [0.3, -0.4, 0.6, -5.0],
6460            ];
6461            vals[i][j]
6462        });
6463
6464        let opts = AptpOptions::default();
6465
6466        // Path 1: tpp_factor_rect on 4×4 (square)
6467        let mut a_rect = a.clone();
6468        let result_rect = tpp_factor_rect(a_rect.as_mut(), n, &opts).unwrap();
6469
6470        // Path 2: tpp_factor (the fallback, with start_col=0, on a square matrix)
6471        let mut a_sq = a.clone();
6472        let mut col_order: Vec<usize> = (0..n).collect();
6473        let mut d_sq = MixedDiagonal::new(n);
6474        let mut stats_sq = AptpStatistics::default();
6475        let mut pivot_log_sq = Vec::with_capacity(n);
6476        let ne_sq = tpp_factor(
6477            a_sq.as_mut(),
6478            0,
6479            n,
6480            &mut col_order,
6481            &mut d_sq,
6482            &mut stats_sq,
6483            &mut pivot_log_sq,
6484            &opts,
6485        );
6486        d_sq.truncate(ne_sq);
6487
6488        // Verify identical num_eliminated
6489        assert_eq!(
6490            result_rect.num_eliminated, ne_sq,
6491            "num_eliminated: rect={} sq={}",
6492            result_rect.num_eliminated, ne_sq
6493        );
6494
6495        // Verify identical permutation
6496        assert_eq!(&result_rect.perm[..n], &col_order[..n], "perm mismatch");
6497
6498        // Verify identical D values
6499        let ne = result_rect.num_eliminated;
6500        let mut col = 0;
6501        while col < ne {
6502            match result_rect.d.pivot_type(col) {
6503                PivotType::OneByOne => {
6504                    let d_r = result_rect.d.diagonal_1x1(col);
6505                    let d_s = d_sq.diagonal_1x1(col);
6506                    assert!(
6507                        (d_r - d_s).abs() < 1e-15,
6508                        "D mismatch at col {col}: rect={d_r} sq={d_s}"
6509                    );
6510                    col += 1;
6511                }
6512                PivotType::TwoByTwo { partner } if partner > col => {
6513                    let br = result_rect.d.diagonal_2x2(col);
6514                    let bs = d_sq.diagonal_2x2(col);
6515                    assert!((br.a - bs.a).abs() < 1e-15, "D.a mismatch at col {col}");
6516                    assert!((br.b - bs.b).abs() < 1e-15, "D.b mismatch at col {col}");
6517                    assert!((br.c - bs.c).abs() < 1e-15, "D.c mismatch at col {col}");
6518                    col += 2;
6519                }
6520                _ => {
6521                    col += 1;
6522                }
6523            }
6524        }
6525
6526        // Verify bit-identical factored matrix contents (lower triangle)
6527        for j in 0..n {
6528            for i in j..n {
6529                let vr = a_rect[(i, j)];
6530                let vs = a_sq[(i, j)];
6531                assert!(
6532                    (vr - vs).abs() == 0.0,
6533                    "matrix mismatch at ({i},{j}): rect={vr} sq={vs}"
6534                );
6535            }
6536        }
6537    }
6538
6539    #[test]
6540    fn test_tpp_factor_rect_reconstruction_stress() {
6541        // 64×32 random indefinite rect matrix. Factor and verify reconstruction.
6542        use rand::SeedableRng;
6543        use rand::rngs::StdRng;
6544        use rand_distr::{Distribution, Uniform};
6545
6546        let m = 64;
6547        let k = 32;
6548        let mut rng = StdRng::seed_from_u64(99);
6549        let dist = Uniform::new(-1.0f64, 1.0).unwrap();
6550
6551        let mut a = Mat::zeros(m, k);
6552        // Fill FS block (k×k lower triangle) as symmetric indefinite
6553        for i in 0..k {
6554            for j in 0..=i {
6555                let v: f64 = dist.sample(&mut rng);
6556                a[(i, j)] = v;
6557            }
6558            // Strengthen diagonal
6559            let sign = if i % 3 == 0 { -1.0 } else { 1.0 };
6560            a[(i, i)] = sign * (5.0 + dist.sample(&mut rng).abs());
6561        }
6562        // Fill NFS rows
6563        for i in k..m {
6564            for j in 0..k {
6565                a[(i, j)] = dist.sample(&mut rng);
6566            }
6567        }
6568
6569        let original = a.clone();
6570        let opts = AptpOptions::default();
6571        let result = tpp_factor_rect(a.as_mut(), k, &opts).unwrap();
6572
6573        let error = rect_reconstruction_error(&original, &a, k, &result);
6574        assert!(
6575            error < 1e-10,
6576            "rect stress reconstruction error {error:.2e} >= 1e-10"
6577        );
6578
6579        // Invariant check
6580        let sum = result.stats.num_1x1 + 2 * result.stats.num_2x2 + result.stats.num_delayed;
6581        assert_eq!(sum, k, "statistics invariant: {sum} != {k}");
6582    }
6583
    #[test]
    fn test_compute_contribution_gemm_rect_matches_square() {
        // Set up identical factored data in both square (m×m) and rect (m×k) layout,
        // run compute_contribution_gemm (square path) and compute_contribution_gemm_rect,
        // and verify the NFS×NFS Schur complements agree to tight tolerance (1e-12).
        let m = 8;
        let k = 4;
        let nfs = m - k;

        // Build a symmetric 8×8 matrix and factor its 4×4 FS block.
        // Strong mixed-sign diagonal keeps pivots stable so the same pivot
        // sequence is chosen on both paths.
        let a_full = symmetric_matrix(m, |i, j| {
            if i == j {
                [10.0, -8.0, 12.0, 7.0, 3.0, -2.0, 5.0, 4.0][i]
            } else {
                0.1 * (i as f64 + j as f64 + 1.0) * if (i + j) % 2 == 0 { 1.0 } else { -1.0 }
            }
        });

        // Factor with tpp_factor_rect on rect view
        let mut a_rect = Mat::zeros(m, k);
        for i in 0..m {
            for j in 0..k {
                a_rect[(i, j)] = a_full[(i, j)];
            }
        }
        let opts = AptpOptions::default();
        let result = tpp_factor_rect(a_rect.as_mut(), k, &opts).unwrap();
        let ne = result.num_eliminated;

        if ne == 0 {
            return; // Can't compare without elimination
        }

        // For a fair comparison, factor the square matrix identically:
        let mut a_sq_factor = a_full.clone();
        let result_sq = tpp_factor_rect(a_sq_factor.as_mut(), k, &opts).unwrap();
        // Since a_full is square (8×8 with ncols=8), this is a valid rect factorization
        // with k=4 FS columns. The L values in columns 0..k are identical.

        // Compute contribution via rect path
        let mut contrib_rect = Mat::zeros(nfs, nfs);
        // Scatter NFS×NFS assembled data from original (via permutation).
        // Only the lower triangle is filled; symmetry is resolved by reading
        // a_full[(max, min)] — the stored (lower) half.
        let perm = &result.perm;
        for i in 0..nfs {
            for j in 0..=i {
                let pi = perm[k + i];
                let pj = perm[k + j];
                let val = if pi >= pj {
                    a_full[(pi, pj)]
                } else {
                    a_full[(pj, pi)]
                };
                contrib_rect[(i, j)] = val;
            }
        }
        // contrib_rect2 is updated in place by the rect GEMM kernel;
        // ld_ws is the L·D workspace it requires (nfs × ne).
        let mut contrib_rect2 = contrib_rect.clone();
        let mut ld_ws = Mat::zeros(nfs, ne);
        compute_contribution_gemm_rect(
            &a_rect,
            k,
            ne,
            m,
            &result.d,
            &mut contrib_rect2,
            &mut ld_ws,
            Par::Seq,
        );

        // Compute contribution via square path.
        // NOTE: contrib_sq serves first as the assembled input block and is
        // then overwritten in place by compute_contribution_gemm below.
        let mut contrib_sq = Mat::zeros(nfs, nfs);
        for i in 0..nfs {
            for j in 0..=i {
                let pi = perm[k + i];
                let pj = perm[k + j];
                let val = if pi >= pj {
                    a_full[(pi, pj)]
                } else {
                    a_full[(pj, pi)]
                };
                contrib_sq[(i, j)] = val;
            }
        }
        // For the square path, we need the frontal data with NFS×NFS in place.
        // Build a square matrix with factored columns and original NFS×NFS
        let mut frontal_sq = Mat::zeros(m, m);
        // Copy factored L columns 0..k
        for j in 0..k {
            for i in 0..m {
                frontal_sq[(i, j)] = a_sq_factor[(i, j)];
            }
        }
        // Copy NFS×NFS block from original (permuted)
        for i in 0..nfs {
            for j in 0..=i {
                frontal_sq[(k + i, k + j)] = contrib_sq[(i, j)];
            }
        }
        let mut ld_ws2 = Mat::zeros(nfs, ne);
        compute_contribution_gemm(
            &frontal_sq,
            k,
            ne,
            m,
            &result_sq.d,
            &mut contrib_sq,
            &mut ld_ws2,
            Par::Seq,
        );

        // Compare: both should produce the same NFS×NFS Schur complement
        // (lower triangle only — the kernels maintain the symmetric half).
        for i in 0..nfs {
            for j in 0..=i {
                let vr = contrib_rect2[(i, j)];
                let vs = contrib_sq[(i, j)];
                assert!(
                    (vr - vs).abs() < 1e-12,
                    "contrib mismatch at ({i},{j}): rect={vr} sq={vs}"
                );
            }
        }
    }
6705
6706    #[test]
6707    fn test_extract_front_factors_rect_round_trip() {
6708        // Factor a small rect matrix, call extract_front_factors_rect, verify
6709        // L11 is unit lower triangular, D11 matches, reconstruction holds.
6710        let m = 6;
6711        let k = 4;
6712        let mut a = Mat::zeros(m, k);
6713        let vals: [[f64; 4]; 6] = [
6714            [10.0, 0.0, 0.0, 0.0],
6715            [0.5, -8.0, 0.0, 0.0],
6716            [0.2, 0.3, 12.0, 0.0],
6717            [0.1, -0.1, 0.4, 7.0],
6718            [0.3, 0.1, -0.2, 0.5],
6719            [0.1, -0.3, 0.2, 0.1],
6720        ];
6721        for i in 0..m {
6722            for j in 0..k {
6723                a[(i, j)] = vals[i][j];
6724            }
6725        }
6726
6727        let opts = AptpOptions::default();
6728        let result = tpp_factor_rect(a.as_mut(), k, &opts).unwrap();
6729        let ne = result.num_eliminated;
6730        assert!(ne > 0, "need at least one eliminated column");
6731
6732        let row_indices: Vec<usize> = (0..m).collect();
6733        let ff = extract_front_factors_rect(&a, m, k, &row_indices, &result);
6734
6735        assert_eq!(ff.num_eliminated, ne);
6736        assert_eq!(ff.l11.nrows(), ne);
6737        assert_eq!(ff.l11.ncols(), ne);
6738
6739        // L11 should be unit lower triangular
6740        for i in 0..ne {
6741            assert!(
6742                (ff.l11[(i, i)] - 1.0).abs() < 1e-15,
6743                "L11 diagonal at {i} is not 1.0: {}",
6744                ff.l11[(i, i)]
6745            );
6746            for j in (i + 1)..ne {
6747                assert!(
6748                    ff.l11[(i, j)].abs() < 1e-15,
6749                    "L11 upper triangle at ({i},{j}) is not 0: {}",
6750                    ff.l11[(i, j)]
6751                );
6752            }
6753        }
6754
6755        // L21 should have (m - ne) rows and ne columns
6756        assert_eq!(ff.l21.nrows(), m - ne);
6757        assert_eq!(ff.l21.ncols(), ne);
6758
6759        // D11 dimension should match ne
6760        assert_eq!(ff.d11.dimension(), ne);
6761    }
6762
6763    #[test]
6764    fn test_extract_contribution_rect_with_delays() {
6765        // Factor a rect matrix that produces some zero-pivot "delays",
6766        // call extract_contribution_rect, verify structure.
6767        let m = 6;
6768        let k = 4;
6769        let mut a = Mat::zeros(m, k);
6770        // Two good pivots (cols 0,1) and two zero-ish pivots (cols 2,3)
6771        let vals: [[f64; 4]; 6] = [
6772            [10.0, 0.0, 0.0, 0.0],
6773            [0.5, -8.0, 0.0, 0.0],
6774            [0.0, 0.0, 0.0, 0.0], // zero column → zero pivot
6775            [0.0, 0.0, 0.0, 0.0], // zero column → zero pivot
6776            [0.3, 0.1, 0.0, 0.0],
6777            [0.1, -0.3, 0.0, 0.0],
6778        ];
6779        for i in 0..m {
6780            for j in 0..k {
6781                a[(i, j)] = vals[i][j];
6782            }
6783        }
6784
6785        let opts = AptpOptions::default();
6786        let result = tpp_factor_rect(a.as_mut(), k, &opts).unwrap();
6787        let ne = result.num_eliminated;
6788
6789        // All 4 should be eliminated (2 real + 2 zero pivots)
6790        assert_eq!(ne, 4, "expected all 4 eliminated (2 real + 2 zero)");
6791
6792        let nfs = m - k;
6793        let mut contrib_buffer = Mat::zeros(nfs, nfs);
6794        let mut ld_ws = Mat::zeros(nfs, ne);
6795        compute_contribution_gemm_rect(
6796            &a,
6797            k,
6798            ne,
6799            m,
6800            &result.d,
6801            &mut contrib_buffer,
6802            &mut ld_ws,
6803            Par::Seq,
6804        );
6805
6806        let row_indices: Vec<usize> = (0..m).collect();
6807        let contrib = extract_contribution_rect(&a, m, k, &row_indices, &result, contrib_buffer);
6808
6809        // With all columns eliminated, num_delayed = 0
6810        assert_eq!(contrib.num_delayed, 0);
6811        // Contribution block size = m - ne = 2
6812        assert_eq!(contrib.data.nrows(), nfs);
6813        assert_eq!(contrib.row_indices.len(), nfs);
6814    }
6815
6816    #[test]
6817    fn test_swap_rect_boundary() {
6818        // Test swap_rect on a 5×3 matrix.
6819        let mut a = Mat::from_fn(5, 3, |i, j| (i * 10 + j) as f64);
6820
6821        // Swap (1, 4) where j=4 >= ncols=3
6822        // Before: row 1 = [10, 11, 12], row 4 = [40, 41, 42]
6823        swap_rect(a.as_mut(), 1, 4);
6824        // After swap: rows 1 and 4 should exchange their entries in columns 0..3
6825        // Row data for cols < min(i,j)=1: a[(1,0)] ↔ a[(4,0)]
6826        assert!((a[(1, 0)] - 40.0).abs() < 1e-15, "a[(1,0)]={}", a[(1, 0)]);
6827        assert!((a[(4, 0)] - 10.0).abs() < 1e-15, "a[(4,0)]={}", a[(4, 0)]);
6828        // Intermediate rows 2,3: a[(k,1)] ↔ a[(4,k)] for k in 2..4
6829        // a[(2,1)] was 21, a[(4,2)] was 42
6830        assert!((a[(2, 1)] - 42.0).abs() < 1e-15, "a[(2,1)]={}", a[(2, 1)]);
6831        assert!((a[(4, 2)] - 21.0).abs() < 1e-15, "a[(4,2)]={}", a[(4, 2)]);
6832
6833        // Now test swap where both < ncols: swap (0, 2) on a fresh matrix
6834        let mut b = Mat::from_fn(5, 3, |i, j| (i * 10 + j) as f64);
6835        // a[(0,0)]=0, a[(2,2)]=22, a[(1,0)]=10, a[(2,0)]=20, a[(2,1)]=21
6836        swap_rect(b.as_mut(), 0, 2);
6837        // Diagonals swap: b[(0,0)] ↔ b[(2,2)]
6838        assert!((b[(0, 0)] - 22.0).abs() < 1e-15, "b[(0,0)]={}", b[(0, 0)]);
6839        assert!((b[(2, 2)] - 0.0).abs() < 1e-15, "b[(2,2)]={}", b[(2, 2)]);
6840        // Rows k > j=2: b[(k,0)] ↔ b[(k,2)]
6841        assert!((b[(3, 0)] - 32.0).abs() < 1e-15, "b[(3,0)]={}", b[(3, 0)]);
6842        assert!((b[(3, 2)] - 30.0).abs() < 1e-15, "b[(3,2)]={}", b[(3, 2)]);
6843    }
6844
6845    // =====================================================================
6846    // SPRAL-Style Torture Tests
6847    // =====================================================================
6848    //
6849    // These tests exercise the dense APTP kernel with randomized adversarial
6850    // perturbations, matching SPRAL's torture test approach.
6851    //
6852    // References:
6853    // - SPRAL tests/ssids/kernels/ldlt_app.cxx (BSD-3): torture test pattern
6854    // - SPRAL tests/ssids/kernels/ldlt_tpp.cxx (BSD-3): TPP torture test pattern
6855
6856    use crate::testing::perturbations::TortureTestConfig;
6857    use rand::RngExt;
6858    use rand::SeedableRng;
6859    use rand::rngs::StdRng;
6860
6861    /// Run APP (complete pivoting) torture test for a single (m, n) configuration.
6862    fn ldlt_app_torture_test(m: usize, n: usize, config: &TortureTestConfig) {
6863        use crate::testing::perturbations;
6864
6865        let options = AptpOptions {
6866            inner_block_size: 32.min(n),
6867            ..AptpOptions::default()
6868        };
6869        let num_fully_summed = n;
6870
6871        let mut rng = StdRng::seed_from_u64(config.seed);
6872        let mut failures = Vec::new();
6873
6874        for instance in 0..config.num_instances {
6875            let mut a = perturbations::generate_dense_symmetric_indefinite(m, &mut rng);
6876
6877            // Apply probabilistic perturbation (70/20/10 split)
6878            let roll: f64 = rng.random::<f64>();
6879            let is_singular;
6880            if roll < config.delay_probability {
6881                perturbations::cause_delays(&mut a, options.inner_block_size, &mut rng);
6882                is_singular = false;
6883            } else if roll < config.delay_probability + config.singular_probability {
6884                if m >= 2 {
6885                    let col1 = RngExt::random_range(&mut rng, 0..m);
6886                    let mut col2 = RngExt::random_range(&mut rng, 0..m);
6887                    while col2 == col1 {
6888                        col2 = RngExt::random_range(&mut rng, 0..m);
6889                    }
6890                    perturbations::make_singular(&mut a, col1, col2);
6891                }
6892                is_singular = true;
6893            } else {
6894                if m >= options.inner_block_size && options.inner_block_size >= 2 {
6895                    let max_start = m - options.inner_block_size;
6896                    let block_row = RngExt::random_range(&mut rng, 0..=max_start);
6897                    perturbations::make_dblk_singular(&mut a, block_row, options.inner_block_size);
6898                }
6899                is_singular = true;
6900            }
6901
6902            // Capture the perturbed matrix BEFORE factorization (which modifies in-place)
6903            let original = a.clone();
6904
6905            let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
6906                aptp_factor_in_place(a.as_mut(), num_fully_summed, &options)
6907            }));
6908
6909            match result {
6910                Err(_) => {
6911                    failures.push(format!("instance {}: PANIC", instance));
6912                }
6913                Ok(Err(e)) => {
6914                    if !is_singular {
6915                        failures.push(format!("instance {}: unexpected error: {}", instance, e));
6916                    }
6917                }
6918                Ok(Ok(ref fr)) => {
6919                    let total = fr.num_eliminated + fr.delayed_cols.len();
6920                    if total != num_fully_summed {
6921                        failures.push(format!(
6922                            "instance {}: elim({}) + delayed({}) = {} != nfs({})",
6923                            instance,
6924                            fr.num_eliminated,
6925                            fr.delayed_cols.len(),
6926                            total,
6927                            num_fully_summed
6928                        ));
6929                        continue;
6930                    }
6931                    // Reconstruction check only for square (m == n) non-singular cases.
6932                    if m == n && !is_singular && fr.num_eliminated == num_fully_summed {
6933                        let l = extract_l(a.as_ref(), &fr.d, fr.num_eliminated);
6934                        let err = dense_reconstruction_error(
6935                            &original,
6936                            &l,
6937                            &fr.d,
6938                            &fr.perm[..fr.num_eliminated],
6939                        );
6940                        if err > config.backward_error_threshold {
6941                            failures.push(format!(
6942                                "instance {}: recon error {:.2e} > {:.2e}",
6943                                instance, err, config.backward_error_threshold
6944                            ));
6945                        }
6946                    }
6947                }
6948            }
6949        }
6950
6951        assert!(
6952            failures.is_empty(),
6953            "APP torture (m={}, n={}) had {} failures:\n{}",
6954            m,
6955            n,
6956            failures.len(),
6957            failures.join("\n")
6958        );
6959    }
6960
6961    /// Run TPP (threshold partial pivoting) torture test for a single (m, n) config.
6962    fn ldlt_tpp_torture_test(m: usize, n: usize, config: &TortureTestConfig) {
6963        use crate::testing::perturbations;
6964
6965        let options = AptpOptions {
6966            inner_block_size: n + 1, // ensure TPP path
6967            ..AptpOptions::default()
6968        };
6969        let num_fully_summed = n;
6970
6971        let mut rng = StdRng::seed_from_u64(config.seed);
6972        let mut failures = Vec::new();
6973
6974        for instance in 0..config.num_instances {
6975            let mut a = perturbations::generate_dense_symmetric_indefinite(m, &mut rng);
6976
6977            // TPP perturbation: 70% delays, 30% singular (no dblk_singular)
6978            let roll: f64 = rng.random::<f64>();
6979            let is_singular;
6980            if roll < config.delay_probability {
6981                perturbations::cause_delays(&mut a, options.inner_block_size, &mut rng);
6982                is_singular = false;
6983            } else {
6984                if m >= 2 {
6985                    let col1 = RngExt::random_range(&mut rng, 0..m);
6986                    let mut col2 = RngExt::random_range(&mut rng, 0..m);
6987                    while col2 == col1 {
6988                        col2 = RngExt::random_range(&mut rng, 0..m);
6989                    }
6990                    perturbations::make_singular(&mut a, col1, col2);
6991                }
6992                is_singular = true;
6993            }
6994
6995            // Capture perturbed matrix BEFORE factorization
6996            let original = a.clone();
6997
6998            let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
6999                aptp_factor_in_place(a.as_mut(), num_fully_summed, &options)
7000            }));
7001
7002            match result {
7003                Err(_) => {
7004                    failures.push(format!("instance {}: PANIC", instance));
7005                }
7006                Ok(Err(e)) => {
7007                    if !is_singular {
7008                        failures.push(format!("instance {}: unexpected error: {}", instance, e));
7009                    }
7010                }
7011                Ok(Ok(ref fr)) => {
7012                    let total = fr.num_eliminated + fr.delayed_cols.len();
7013                    if total != num_fully_summed {
7014                        failures.push(format!(
7015                            "instance {}: elim({}) + delayed({}) = {} != nfs({})",
7016                            instance,
7017                            fr.num_eliminated,
7018                            fr.delayed_cols.len(),
7019                            total,
7020                            num_fully_summed
7021                        ));
7022                        continue;
7023                    }
7024                    // Reconstruction + L growth check only for square non-singular
7025                    if m == n && !is_singular && fr.num_eliminated == num_fully_summed {
7026                        let l = extract_l(a.as_ref(), &fr.d, fr.num_eliminated);
7027                        let err = dense_reconstruction_error(
7028                            &original,
7029                            &l,
7030                            &fr.d,
7031                            &fr.perm[..fr.num_eliminated],
7032                        );
7033                        if err > config.backward_error_threshold {
7034                            failures.push(format!(
7035                                "instance {}: recon error {:.2e} > {:.2e}",
7036                                instance, err, config.backward_error_threshold
7037                            ));
7038                        }
7039                        // L growth bound: max |L_ij| <= 1/threshold
7040                        let l_bound = 1.0 / options.threshold;
7041                        let mut max_l = 0.0_f64;
7042                        for i in 0..l.nrows() {
7043                            for j in 0..fr.num_eliminated {
7044                                if i != j {
7045                                    max_l = max_l.max(l[(i, j)].abs());
7046                                }
7047                            }
7048                        }
7049                        if max_l > l_bound * (1.0 + 1e-10) {
7050                            failures.push(format!(
7051                                "instance {}: L growth |L|={:.4} > 1/u={:.4}",
7052                                instance, max_l, l_bound
7053                            ));
7054                        }
7055                    }
7056                }
7057            }
7058        }
7059
7060        assert!(
7061            failures.is_empty(),
7062            "TPP torture (m={}, n={}) had {} failures:\n{}",
7063            m,
7064            n,
7065            failures.len(),
7066            failures.join("\n")
7067        );
7068    }
7069
7070    // ---- APP Torture Test Entry Points ----
7071
7072    #[test]
7073    #[ignore] // Long-running: ~500 factorizations per test
7074    fn torture_app_32x32() {
7075        let config = TortureTestConfig {
7076            num_instances: 500,
7077            seed: 42_000,
7078            ..TortureTestConfig::default()
7079        };
7080        ldlt_app_torture_test(32, 32, &config);
7081    }
7082
7083    #[test]
7084    #[ignore]
7085    fn torture_app_64x64() {
7086        let config = TortureTestConfig {
7087            num_instances: 500,
7088            seed: 42_001,
7089            ..TortureTestConfig::default()
7090        };
7091        ldlt_app_torture_test(64, 64, &config);
7092    }
7093
7094    #[test]
7095    #[ignore]
7096    fn torture_app_128x128() {
7097        let config = TortureTestConfig {
7098            num_instances: 500,
7099            seed: 42_002,
7100            ..TortureTestConfig::default()
7101        };
7102        ldlt_app_torture_test(128, 128, &config);
7103    }
7104
7105    #[test]
7106    #[ignore]
7107    fn torture_app_128x48() {
7108        let config = TortureTestConfig {
7109            num_instances: 500,
7110            seed: 42_003,
7111            ..TortureTestConfig::default()
7112        };
7113        ldlt_app_torture_test(128, 48, &config);
7114    }
7115
7116    #[test]
7117    #[ignore]
7118    fn torture_app_256x256() {
7119        let config = TortureTestConfig {
7120            num_instances: 500,
7121            seed: 42_004,
7122            ..TortureTestConfig::default()
7123        };
7124        ldlt_app_torture_test(256, 256, &config);
7125    }
7126
7127    // ---- TPP Torture Test Entry Points ----
7128
7129    #[test]
7130    #[ignore]
7131    fn torture_tpp_8x4() {
7132        let config = TortureTestConfig {
7133            num_instances: 500,
7134            seed: 43_000,
7135            ..TortureTestConfig::default()
7136        };
7137        ldlt_tpp_torture_test(8, 4, &config);
7138    }
7139
7140    #[test]
7141    #[ignore]
7142    fn torture_tpp_33x21() {
7143        let config = TortureTestConfig {
7144            num_instances: 500,
7145            seed: 43_001,
7146            ..TortureTestConfig::default()
7147        };
7148        ldlt_tpp_torture_test(33, 21, &config);
7149    }
7150
7151    #[test]
7152    #[ignore]
7153    fn torture_tpp_100x100() {
7154        let config = TortureTestConfig {
7155            num_instances: 500,
7156            seed: 43_002,
7157            ..TortureTestConfig::default()
7158        };
7159        ldlt_tpp_torture_test(100, 100, &config);
7160    }
7161
7162    #[test]
7163    #[ignore]
7164    fn torture_tpp_100x50() {
7165        let config = TortureTestConfig {
7166            num_instances: 500,
7167            seed: 43_003,
7168            ..TortureTestConfig::default()
7169        };
7170        ldlt_tpp_torture_test(100, 50, &config);
7171    }
7172
7173    // =====================================================================
7174    // Property-Based Tests — Kernel Level
7175    // =====================================================================
7176
7177    use crate::testing::strategies;
7178    use proptest::prelude::*;
7179
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(256))]

        // Property: symmetric positive-definite inputs that factor successfully
        // must reconstruct A to near machine precision via P·L·D·L^T·P^T.
        #[test]
        fn property_pd_reconstruction(
            a in strategies::arb_symmetric_pd(5..=100)
        ) {
            let n = a.nrows();
            let options = AptpOptions::default();
            let result = aptp_factor(a.as_ref(), &options);
            if let Ok(ref f) = result {
                // arrays().0 is the forward permutation array.
                let perm_fwd = f.perm.as_ref().arrays().0;
                let err = dense_reconstruction_error(&a, &f.l, &f.d, perm_fwd);
                prop_assert!(
                    err < 1e-12,
                    "PD reconstruction error {:.2e} for n={}", err, n
                );
            }
        }

        // Property: the inertia recorded in D covers exactly the eliminated
        // columns — delayed columns contribute no pivots to D.
        #[test]
        fn property_inertia_sum(
            a in strategies::arb_symmetric_indefinite(5..=100)
        ) {
            let n = a.nrows();
            let options = AptpOptions::default();
            let result = aptp_factor(a.as_ref(), &options);
            if let Ok(ref f) = result {
                let inertia = f.d.compute_inertia();
                let num_eliminated = n - f.delayed_cols.len();
                // Inertia covers exactly the eliminated columns
                prop_assert_eq!(
                    inertia.dimension(), num_eliminated,
                    "inertia sum {} != num_eliminated={} (n={}, delayed={})",
                    inertia.dimension(), num_eliminated, n, f.delayed_cols.len()
                );
            }
        }

        // Property: the returned permutation is a bijection on 0..n and its
        // forward/inverse arrays are mutually consistent.
        #[test]
        fn property_permutation_valid(
            a in strategies::arb_symmetric_indefinite(5..=100)
        ) {
            let n = a.nrows();
            let options = AptpOptions::default();
            let result = aptp_factor(a.as_ref(), &options);
            if let Ok(ref f) = result {
                let (fwd, inv) = f.perm.as_ref().arrays();
                // Check fwd is a valid permutation: in-bounds, no duplicates.
                let mut seen = vec![false; n];
                for &idx in fwd {
                    prop_assert!(idx < n, "perm fwd index {} out of bounds for n={}", idx, n);
                    prop_assert!(!seen[idx], "duplicate in perm fwd: {}", idx);
                    seen[idx] = true;
                }
                // Check fwd and inv are inverses
                for i in 0..n {
                    prop_assert_eq!(
                        inv[fwd[i]], i,
                        "perm fwd/inv not inverses at i={}", i
                    );
                }
            }
        }

        // Property: factoring a randomly perturbed (delay-inducing) matrix
        // never panics — the kernel must fail gracefully via Result.
        #[test]
        fn property_no_panics_perturbed(
            a in strategies::arb_symmetric_indefinite(5..=50),
            seed in any::<u64>()
        ) {
            use crate::testing::perturbations as perturb;
            let mut a_mut = a.clone();
            let mut rng = StdRng::seed_from_u64(seed);
            perturb::cause_delays(&mut a_mut, 32, &mut rng);

            let options = AptpOptions::default();
            // Should not panic — may return Ok or Err
            let _ = aptp_factor(a_mut.as_ref(), &options);
        }
    }
7260}