feral 0.11.1

Sparse symmetric indefinite direct solver in pure Rust, with certified inertia counts.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
//! Sparse unsymmetric LU factorization (left-looking, Gilbert–Peierls style).
//!
//! Factors `P A Q = L U` with threshold partial pivoting. `Q` is the
//! fill-reducing column ordering from [`SparseLuSymbolic`]; `P` is the row
//! permutation from pivoting. `L` is unit lower triangular (strict-lower stored
//! CSC, unit diagonal implicit); `U` is upper triangular stored row-wise (CSR,
//! strict-upper, plus an explicit diagonal) so the Forrest–Tomlin update can do
//! row operations on it. Both are in pivot-position coordinates.
//!
//! Each column `k` (processing original column `qcol[k]`) is computed by a
//! forward substitution `L w = A(:,qcol[k])`, output-sensitive via a
//! Gilbert–Peierls depth-first reach: the numeric solve runs only over the
//! pivot positions reachable from the nonzeros of `A(:,qcol[k])`.

use super::scaling::{compute_lu_scale, LuScale};
use super::sparse_symbolic::SparseLuSymbolic;
use super::{LuParams, LuScaling, LuSingularAction};
use crate::error::FeralError;
use crate::lu::sparse_matrix::SparseColMatrix;

/// One elementary operation of a Forrest–Tomlin update's bump elimination,
/// recorded so it can be replayed on a solve vector (and transposed for btran).
#[derive(Debug, Clone)]
pub(super) enum FtOp {
    /// Swap two pivot positions (partial-pivot row interchange in the bump).
    Swap(usize, usize),
    /// `target_row -= mult · src_row` (Gauss elimination of a sub-diagonal).
    Axpy {
        target: usize,
        src: usize,
        mult: f64,
    },
}

/// One Forrest–Tomlin column-replacement update: the sequence of elementary row
/// operations (partial-pivot swaps + eliminations) that re-triangularized `U`
/// after the spike was inserted. The base `L` is never touched; these ops are
/// replayed on the solve vector between the `L`-solve and the `U`-solve in
/// `ftran` (transposed, in reverse, between `Uᵀ` and `Lᵀ` in `btran`). Because
/// the bump is local and sparse, each eta is `O(bump)` — no dense `τ`.
#[derive(Debug, Clone)]
pub(super) struct FtEta {
    pub ops: Vec<FtOp>,
}

impl FtEta {
    /// Apply this elimination `E` forward to `y` (`y ← E y`).
    pub(super) fn apply_forward(&self, y: &mut [f64]) {
        for op in self.ops.iter() {
            match *op {
                FtOp::Swap(a, b) => y.swap(a, b),
                FtOp::Axpy { target, src, mult } => y[target] -= mult * y[src],
            }
        }
    }

    /// Apply `Eᵀ` to `y` (reverse the ops, transpose each).
    pub(super) fn apply_transpose(&self, y: &mut [f64]) {
        for op in self.ops.iter().rev() {
            match *op {
                FtOp::Swap(a, b) => y.swap(a, b),
                FtOp::Axpy { target, src, mult } => y[src] -= mult * y[target],
            }
        }
    }
}

/// Sparse LU factorization of a square basis.
#[derive(Debug, Clone)]
pub struct SparseLu {
    pub(super) m: usize,
    // L: unit lower triangular, strict-lower stored CSC, pivot-position rows.
    pub(super) l_col_ptr: Vec<usize>,
    pub(super) l_row_idx: Vec<usize>,
    pub(super) l_val: Vec<f64>,
    // U: upper triangular, one sorted row per pivot position, each row a list
    // of `(column_position, value)` with column >= row and the diagonal entry
    // FIRST (column == row). Mutable per-row storage so the Forrest–Tomlin
    // update can do in-place row operations.
    pub(super) u_rows: Vec<Vec<(usize, f64)>>,
    /// `perm[k]` = original row in pivot position `k` (`(P a)[k] = a[perm[k]]`).
    pub(super) perm: Vec<usize>,
    /// Inverse of `perm`: `perm_inv[orig_row] = pivot_position`. Used to seed the
    /// sparse spike solve in the Forrest–Tomlin update.
    pub(super) perm_inv: Vec<usize>,
    /// Column order: factorization column `k` is original column `qcol[k]`.
    pub(super) qcol: Vec<usize>,
    /// Inverse of `qcol`: `qcol_inv[original_col] = column_position`.
    pub(super) qcol_inv: Vec<usize>,
    /// Column-wise index of `U`'s strict-upper part: `u_above[c]` is the sorted
    /// list of rows `i < c` with `U[i,c] != 0`. Lets the FT update find column
    /// `r`'s existing entries (for in-place replacement) without scanning all
    /// rows. Maintained across updates; not used by the solves.
    pub(super) u_above: Vec<Vec<usize>>,
    /// Forrest–Tomlin column-replacement updates applied since the last
    /// factor/refactor. Each is a replayable bump elimination (`O(bump)`), so
    /// warm solves stay sparse (no dense eta chain).
    pub(super) etas: Vec<FtEta>,
    /// Running growth monitor: the ‖U‖∞ element-growth high-water ratio
    /// (largest `max|U|` over the updates ÷ [`Self::u_max0`]). Compounds across
    /// a chain of updates, unlike a max-single-multiplier monitor (L5).
    pub(super) growth: f64,
    /// `max|U|` immediately after factor — denominator of the element-growth
    /// monitor. Floored away from zero.
    pub(super) u_max0: f64,
    /// Total Gilbert–Peierls reach nodes visited during the factor — a
    /// structural scalability witness (`O(nnz(U))`, not `O(n²)`).
    pub(super) reach_visits: usize,
    pub(super) params: LuParams,
    /// Two-sided scaling of the factored matrix (identity when unscaled).
    pub(super) scale: LuScale,
    /// Inverse of `scale.rperm` (original row -> scaled-row position), so the
    /// sparse FT update can map an entering column's nonzeros into spike space
    /// without an `O(n)` scan.
    pub(super) scale_rperm_inv: Vec<usize>,
    pub(super) scratch: Vec<f64>,
    /// Reusable length-`m` boolean marker for the FT update's sparse spike
    /// (tracks touched positions so they can be cleared in `O(touched)`).
    pub(super) scratch_mark: Vec<bool>,
    /// Dedicated length-`m` work buffer for the FT update's sparse spike, kept
    /// zeroed between updates. Separate from `scratch` (which the solves dirty),
    /// so `compute_spike`'s sparse scatter can assume a clean buffer.
    pub(super) ft_work: Vec<f64>,
    /// Pooled length-`m` buffer for the scaled `ftran`/`btran` wrappers' inner
    /// RHS (`bt`); distinct from `scratch`, which the core solve dirties (L3).
    pub(super) scratch_b: Vec<f64>,
    /// Pooled length-`m` residual buffer for iterative refinement (`r`);
    /// distinct from `scratch`/`scratch_b`, which the inner solve uses (L3).
    pub(super) scratch_c: Vec<f64>,
    /// Pooled length-`m` buffer holding the refinement's original-RHS snapshot
    /// (`a`); live across the whole refine loop, so it cannot reuse the others.
    pub(super) scratch_d: Vec<f64>,
}

impl SparseLu {
    /// Factor `a` using the column ordering in `symbolic`.
    pub fn factor(
        a: &SparseColMatrix,
        symbolic: &SparseLuSymbolic,
        params: LuParams,
    ) -> Result<Self, FeralError> {
        params.validate()?;
        let m = a.m;
        if symbolic.m != m {
            return Err(FeralError::DimensionMismatch {
                expected: m,
                got: symbolic.m,
            });
        }
        // Scaling: factor à = D_row Π A D_col (pattern is invariant under row
        // permutation/scaling, so the column ordering `symbolic` still applies).
        let (scale, scaled) = if params.scaling == LuScaling::None {
            (LuScale::identity(m), None)
        } else {
            let scale = compute_lu_scale(a, params.scaling)?;
            let mat = scale.apply_sparse(a)?;
            (scale, Some(mat))
        };
        let a: &SparseColMatrix = scaled.as_ref().unwrap_or(a);

        let qcol = symbolic.qcol.clone();
        let qcol_inv = symbolic.qcol_inv.clone();

        let mut w = vec![0.0_f64; m];
        let mut mark = vec![false; m];
        let mut touched: Vec<usize> = Vec::new();
        let mut pinv: Vec<isize> = vec![-1; m];
        let mut perm = vec![0usize; m]; // pivot pos -> orig row

        let mut l_col_ptr = Vec::with_capacity(m + 1);
        let mut l_row_idx: Vec<usize> = Vec::new(); // original rows, remapped later
        let mut l_val: Vec<f64> = Vec::new();
        l_col_ptr.push(0);
        let mut u_col_ptr = Vec::with_capacity(m + 1);
        let mut u_row_idx: Vec<usize> = Vec::new();
        let mut u_val: Vec<f64> = Vec::new();
        u_col_ptr.push(0);
        let mut u_diag = vec![0.0_f64; m];

        // Gilbert–Peierls symbolic workspace: depth-first reach of each column.
        let mut reach_mark = vec![false; m];
        let mut reach: Vec<usize> = Vec::new();
        let mut dfs_stack: Vec<usize> = Vec::new();

        let utol = params.pivot_threshold;
        // L6 (dev/research/repo-review-2026-06-09.md): scale the zero-pivot
        // tolerance by the matrix magnitude, matching the dense path. An absolute
        // `zero_pivot_tol` declared a uniformly small but perfectly conditioned
        // basis singular and gave a large-magnitude basis effectively no
        // singularity detection. `a_max == 0` only for the (genuinely singular)
        // zero matrix, where the exact-zero test still fires.
        let a_max = a.values.iter().fold(0.0_f64, |acc, &x| acc.max(x.abs()));
        let ztol = params.zero_pivot_tol * a_max;
        let mut reach_visits = 0usize;

        for k in 0..m {
            // Scatter A(:, qcol[k]) into w.
            let (rows, vals) = a.column(qcol[k]);
            for (&i, &v) in rows.iter().zip(vals.iter()) {
                w[i] = v;
                if !mark[i] {
                    mark[i] = true;
                    touched.push(i);
                }
            }

            // Symbolic: depth-first reach of column k over the graph of L.
            // A pivot position p < k contributes to column k iff its pivot row
            // is reachable from the nonzeros of A(:,qcol[k]); edges run from a
            // column to the pivot positions of its (already-pivoted) rows. All
            // edges point to strictly larger positions, so ascending position
            // order is a valid topological order for the numeric solve.
            reach.clear();
            for &i in rows.iter() {
                let pp = pinv[i];
                if pp >= 0 && !reach_mark[pp as usize] {
                    reach_mark[pp as usize] = true;
                    reach.push(pp as usize);
                    dfs_stack.push(pp as usize);
                }
            }
            while let Some(p) = dfs_stack.pop() {
                let (ls, le) = (l_col_ptr[p], l_col_ptr[p + 1]);
                for idx in ls..le {
                    let pp = pinv[l_row_idx[idx]];
                    if pp >= 0 && !reach_mark[pp as usize] {
                        reach_mark[pp as usize] = true;
                        reach.push(pp as usize);
                        dfs_stack.push(pp as usize);
                    }
                }
            }
            reach.sort_unstable();
            reach_visits += reach.len();

            // Numeric forward substitution over the reach; collect U(:,k).
            let mut u_entries: Vec<(usize, f64)> = Vec::with_capacity(reach.len());
            for &p in reach.iter() {
                reach_mark[p] = false; // clear for the next column
                let r = perm[p];
                let xp = w[r];
                if xp == 0.0 {
                    continue;
                }
                u_entries.push((p, xp));
                let (ls, le) = (l_col_ptr[p], l_col_ptr[p + 1]);
                for idx in ls..le {
                    let i = l_row_idx[idx];
                    w[i] -= xp * l_val[idx];
                    if !mark[i] {
                        mark[i] = true;
                        touched.push(i);
                    }
                }
            }

            // Pivot selection among unpivoted touched rows (partial pivoting).
            let mut amax = 0.0_f64;
            let mut ipiv: isize = -1;
            for &i in touched.iter() {
                if pinv[i] < 0 {
                    let av = w[i].abs();
                    if av > amax {
                        amax = av;
                        ipiv = i as isize;
                    }
                }
            }

            let mut piv;
            let pivot_row: usize;
            if amax <= ztol {
                match params.on_singular {
                    LuSingularAction::Fail => {
                        // L9 (dev/research/repo-review-2026-06-09.md): report the
                        // ORIGINAL basis column `qcol[k]`, not the internal
                        // factorization position `k`. The caller (e.g. a simplex
                        // driver) knows original columns, not the AMD-dependent
                        // processing order, so `qcol[k]` is the index it can act on.
                        return Err(FeralError::SingularBasis { column: qcol[k] });
                    }
                    LuSingularAction::PerturbToEps { abs_floor } => {
                        // L13 (dev/research/repo-review-2026-06-09.md): perturb the
                        // largest-|w| unpivoted row `ipiv` (the same row threshold
                        // partial pivoting would select), matching the dense path,
                        // which perturbs its partial-pivoting-selected row — not the
                        // index-first unpivoted row. Reusing `ipiv` also avoids the
                        // O(m) scan whenever the column has any touched unpivoted
                        // entry; the scan remains only as the fallback for a column
                        // that is structurally empty in every unpivoted row
                        // (`ipiv < 0`), where `w` is zero and any row will do.
                        let r = if ipiv >= 0 {
                            ipiv as usize
                        } else {
                            (0..m)
                                .find(|&i| pinv[i] < 0)
                                .ok_or(FeralError::SingularBasis { column: qcol[k] })?
                        };
                        pivot_row = r;
                        let s = if w[r] < 0.0 { -1.0 } else { 1.0 };
                        piv = s * abs_floor.max(w[r].abs());
                    }
                }
            } else {
                // L2 (dev/research/repo-review-2026-06-09.md): threshold partial
                // pivoting. The strict max-magnitude row `ipiv` is the stability
                // baseline; but if the natural diagonal of this column — original
                // row `qcol[k]` — is still unpivoted and is within `utol·amax` of
                // that max, prefer it. The diagonal pivot preserves structure
                // (less fill) without sacrificing more than a factor `utol` of
                // stability. `utol == 1.0` (the default) recovers strict partial
                // pivoting, since the diagonal must then equal the max to qualify.
                // This matches CSparse `cs_lu` (Davis, *Direct Methods for Sparse
                // Linear Systems*, §6.3): `if (pinv[col] < 0 && |x[col]| >= a*tol)
                // ipiv = col`.
                let diag = qcol[k];
                // The diagonal must also clear the singularity floor `ztol`,
                // not merely the threshold `utol·amax` (repo-review verification
                // residual #2). With a loose `utol` (≤ `zero_pivot_tol`) a
                // sub-`ztol` diagonal could otherwise satisfy `|w[diag]| ≥
                // utol·amax` and be preferred over the sound max-magnitude row
                // (`amax > ztol` always holds in this branch), then be silently
                // clamped below — a sub-tolerance perturbation that bypasses
                // `on_singular`. The dense path carries the same `&& diag > ztol`
                // conjunct (`dense_factor.rs`); this keeps the two paths in step.
                pivot_row =
                    if pinv[diag] < 0 && w[diag].abs() >= utol * amax && w[diag].abs() > ztol {
                        diag
                    } else {
                        ipiv as usize
                    };
                piv = w[pivot_row];
                // With the `> ztol` conjunct above and `amax > ztol` in this
                // branch, both pivot choices satisfy `|piv| > ztol`, so this
                // guard is unreachable in practice. It remains as defensive
                // parity with the dense path: should the invariant ever be
                // broken by a future change, a sub-floor pivot is routed through
                // `on_singular` (a loud `SingularBasis` under `Fail`, or an
                // accountable perturbation) rather than silently clamped.
                if piv.abs() <= ztol {
                    match params.on_singular {
                        LuSingularAction::Fail => {
                            return Err(FeralError::SingularBasis { column: qcol[k] });
                        }
                        LuSingularAction::PerturbToEps { abs_floor } => {
                            let s = if piv < 0.0 { -1.0 } else { 1.0 };
                            piv = s * abs_floor.max(piv.abs());
                        }
                    }
                }
            }

            pinv[pivot_row] = k as isize;
            perm[k] = pivot_row;
            u_diag[k] = piv;
            for (p, v) in u_entries.into_iter() {
                u_row_idx.push(p);
                u_val.push(v);
            }
            u_col_ptr.push(u_row_idx.len());

            // L(:,k): unpivoted touched rows (excluding the pivot) / piv.
            let inv = 1.0 / piv;
            for &i in touched.iter() {
                if pinv[i] < 0 && w[i] != 0.0 {
                    l_row_idx.push(i); // original row, remapped after the loop
                    l_val.push(w[i] * inv);
                }
            }
            l_col_ptr.push(l_row_idx.len());

            // Clear w / mark for the next column.
            for &i in touched.iter() {
                w[i] = 0.0;
                mark[i] = false;
            }
            touched.clear();
        }

        // Remap L's stored original rows to pivot positions, then sort columns.
        let perm_inv: Vec<usize> = pinv.iter().map(|&p| p as usize).collect();
        remap_and_sort_l(&l_col_ptr, &mut l_row_idx, &mut l_val, &perm_inv, m);

        // Transpose U from column-wise (built above) to row-wise CSR, then into
        // per-row vectors with the diagonal entry first. Columns were emitted in
        // increasing order, so each row's strict-upper entries are sorted.
        let (u_row_ptr, u_col_idx, u_val) = transpose_to_csr(&u_col_ptr, &u_row_idx, &u_val, m);
        let mut u_rows: Vec<Vec<(usize, f64)>> = Vec::with_capacity(m);
        for i in 0..m {
            let mut row = Vec::with_capacity(1 + (u_row_ptr[i + 1] - u_row_ptr[i]));
            row.push((i, u_diag[i])); // diagonal first
            for idx in u_row_ptr[i]..u_row_ptr[i + 1] {
                row.push((u_col_idx[idx], u_val[idx]));
            }
            u_rows.push(row);
        }
        // Column-wise index of U's strict-upper entries (rows added in
        // increasing order, so each `u_above[c]` is sorted ascending).
        let mut u_above: Vec<Vec<usize>> = vec![Vec::new(); m];
        for (i, row) in u_rows.iter().enumerate() {
            for &(c, _) in row.iter() {
                if c > i {
                    u_above[c].push(i);
                }
            }
        }

        // Inverse of the scaling row permutation, for sparse-spike seeding.
        let mut scale_rperm_inv = vec![0usize; m];
        for (i, &o) in scale.rperm.iter().enumerate() {
            scale_rperm_inv[o] = i;
        }

        // `max|U|` at factor: denominator of the element-growth monitor (L5).
        let mut u_max0 = 0.0_f64;
        for row in u_rows.iter() {
            for &(_, v) in row.iter() {
                u_max0 = u_max0.max(v.abs());
            }
        }
        let u_max0 = u_max0.max(f64::MIN_POSITIVE);

        Ok(SparseLu {
            m,
            l_col_ptr,
            l_row_idx,
            l_val,
            u_rows,
            perm,
            perm_inv,
            qcol,
            qcol_inv,
            u_above,
            etas: Vec::new(),
            growth: 1.0,
            u_max0,
            reach_visits,
            params,
            scale,
            scale_rperm_inv,
            scratch: vec![0.0; m],
            scratch_mark: vec![false; m],
            ft_work: vec![0.0; m],
            scratch_b: vec![0.0; m],
            scratch_c: vec![0.0; m],
            scratch_d: vec![0.0; m],
        })
    }

    /// Convenience: analyze + factor from dense columns.
    pub fn factor_dense_columns(
        m: usize,
        cols: &[Vec<f64>],
        params: LuParams,
    ) -> Result<Self, FeralError> {
        let a = SparseColMatrix::from_dense_columns(m, cols)?;
        let symbolic = SparseLuSymbolic::analyze(&a)?;
        SparseLu::factor(&a, &symbolic, params)
    }

    /// Basis dimension.
    #[inline]
    pub fn dim(&self) -> usize {
        self.m
    }

    /// Row permutation: `perm[k]` = original row in pivot position `k`.
    #[inline]
    pub fn perm(&self) -> &[usize] {
        &self.perm
    }

    /// Column order: `qcol[k]` = original column in position `k`.
    #[inline]
    pub fn qcol(&self) -> &[usize] {
        &self.qcol
    }

    /// Total stored nonzeros in `L` and `U` (including the `U` diagonal).
    pub fn factor_nnz(&self) -> usize {
        self.l_val.len() + self.u_rows.iter().map(|r| r.len()).sum::<usize>()
    }

    /// Total elementary operations across all Forrest–Tomlin update etas — the
    /// work a warm solve replays for the update chain. For bump-local updates
    /// this stays `O(Σ bump)`, not `O(k·n)` (the structural FT witness).
    pub fn eta_ops(&self) -> usize {
        self.etas.iter().map(|e| e.ops.len()).sum()
    }

    /// Operations in the most recent update's eta (its bump cost).
    pub fn last_eta_ops(&self) -> usize {
        self.etas.last().map(|e| e.ops.len()).unwrap_or(0)
    }

    /// Total Gilbert–Peierls reach nodes visited during the factorization.
    /// This is `O(nnz(U))` (output-sensitive); it would be `O(n²)` if the
    /// factor scanned all prior columns. Used by the scalability guard test.
    pub fn reach_visits(&self) -> usize {
        self.reach_visits
    }

    /// Reconstruct dense entry `(i, j)` of `L` (pivot-position coordinates).
    pub fn l_dense(&self, i: usize, j: usize) -> f64 {
        if i == j {
            return 1.0;
        }
        let (s, e) = (self.l_col_ptr[j], self.l_col_ptr[j + 1]);
        for idx in s..e {
            if self.l_row_idx[idx] == i {
                return self.l_val[idx];
            }
        }
        0.0
    }

    /// Reconstruct dense entry `(i, j)` of `U` (pivot-position coordinates).
    pub fn u_dense(&self, i: usize, j: usize) -> f64 {
        if i > j {
            return 0.0;
        }
        for &(c, v) in self.u_rows[i].iter() {
            if c == j {
                return v;
            }
        }
        0.0
    }
}

/// Transpose a strict-upper `U` from column-wise CSC (`col_ptr` over columns,
/// `row_idx` = pivot-row positions) to row-wise CSR. Columns are assumed
/// emitted in ascending order, so each output row is column-sorted.
fn transpose_to_csr(
    col_ptr: &[usize],
    row_idx: &[usize],
    val: &[f64],
    m: usize,
) -> (Vec<usize>, Vec<usize>, Vec<f64>) {
    let nnz = val.len();
    let mut row_cnt = vec![0usize; m];
    for &r in row_idx.iter() {
        row_cnt[r] += 1;
    }
    let mut row_ptr = vec![0usize; m + 1];
    for i in 0..m {
        row_ptr[i + 1] = row_ptr[i] + row_cnt[i];
    }
    let mut col_idx = vec![0usize; nnz];
    let mut out_val = vec![0.0; nnz];
    let mut next: Vec<usize> = row_ptr[..m].to_vec();
    for k in 0..m {
        for idx in col_ptr[k]..col_ptr[k + 1] {
            let r = row_idx[idx];
            let dst = next[r];
            next[r] += 1;
            col_idx[dst] = k;
            out_val[dst] = val[idx];
        }
    }
    (row_ptr, col_idx, out_val)
}

/// Remap L's original row indices to pivot positions and sort each column.
fn remap_and_sort_l(
    col_ptr: &[usize],
    row_idx: &mut [usize],
    val: &mut [f64],
    perm_inv: &[usize],
    m: usize,
) {
    for r in row_idx.iter_mut() {
        *r = perm_inv[*r];
    }
    let mut order: Vec<usize> = Vec::new();
    for j in 0..m {
        let (s, e) = (col_ptr[j], col_ptr[j + 1]);
        order.clear();
        order.extend(s..e);
        order.sort_by_key(|&idx| row_idx[idx]);
        let rows: Vec<usize> = order.iter().map(|&idx| row_idx[idx]).collect();
        let vals: Vec<f64> = order.iter().map(|&idx| val[idx]).collect();
        row_idx[s..e].copy_from_slice(&rows);
        val[s..e].copy_from_slice(&vals);
    }
}