feral 0.4.0

Sparse symmetric indefinite direct solver in pure Rust, with certified inertia counts.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
//! MC64 wrapper: input preprocessing, Hungarian call, symmetric
//! averaging, and output guards.
//!
//! Given a sparse symmetric matrix (lower triangle only in the
//! input CSC), this module produces a symmetric scaling vector
//! `s` such that `D · A · D` (with `D = diag(s)`) has
//! magnitude-bounded off-diagonals and unit-scale diagonals.
//!
//! Algorithm (mirrors `ref/spral/src/scaling.f90::hungarian_wrapper`,
//! lines 597-801, in its non-singular branch):
//!
//!   1. Expand the lower-triangle CSC to a full symmetric pattern,
//!      carrying the original values with the transpose entries.
//!   2. Drop explicit zero entries (log of zero is -∞).
//!   3. Compute `c[k] = log |a[k]|` on the remaining entries.
//!   4. For each column j, compute `cmax[j] = max_k c[k]` and replace
//!      each `c[k]` by `cmax[j] - c[k]`. The cost graph is now
//!      non-negative and has minimum 0 in each column.
//!   5. Run `hungarian_match` on the cost graph.
//!   6. Unwind the normalization (SPRAL scaling.f90:681-682):
//!      - `rscaling[i] = u[i]` (row dual unchanged)
//!      - `cscaling[j] = v[j] - cmax[j]` (column dual minus column max)
//!   7. Symmetric average (SPRAL scaling.f90:169):
//!      `s[i] = exp((rscaling[i] + cscaling[i]) / 2)`.
//!   8. Safety guards: clamp the exponent to avoid overflow,
//!      rewrite any `s[i] == 0` or non-finite result to `1.0`.
//!   9. On partial matching, set `s[i] = 1.0` for unmatched indices
//!      and return `ScalingInfo::PartialSingular { n_unmatched }`.
//!
//! The partial-singular path deviates from SPRAL, which runs a
//! second Hungarian pass on the full-rank submatrix and then
//! applies a Duff-Pralet correction (scaling.f90:688-800). The
//! research note `dev/research/mc64-scaling.md` §"Structurally
//! singular matrices" specifies identity fallback for unmatched
//! rows/columns as the correct behavior for feral, because KKT
//! matrices from IPOPT are occasionally structurally rank-deficient
//! and a hard failure would regress the current `ForceAccept`
//! pathway.

use super::hungarian::{hungarian_match, CostGraph, Matching};
use super::ScalingInfo;
use crate::error::FeralError;
use crate::sparse::csc::CscMatrix;

/// Upper bound on the argument to `exp` before overflow.
/// `ln(f64::MAX) ≈ 709.78`. We use 709.0 as a safe ceiling.
const LOG_HUGE: f64 = 709.0;

/// Compute the MC64 symmetric scaling for a sparse symmetric matrix.
///
/// The input `matrix` stores only the lower triangle (including the
/// diagonal). This function expands to a full symmetric pattern
/// internally before running the matching.
///
/// Returns a length-`n` scaling vector in **user-order** indexing
/// together with a `ScalingInfo` diagnostic. On a non-singular
/// matrix the returned info is `ScalingInfo::Applied`; if the
/// matching is partial the info is
/// `ScalingInfo::PartialSingular { n_unmatched }` and the
/// unmatched positions are filled with `1.0` as an identity
/// fallback.
/// Compute only the MC64 matching, returning `(perm, n_matched)`.
/// Diagnostic helper for Phase 2.6.5; skips the scaling-vector
/// post-processing that `compute_symmetric` does.
pub(crate) fn matching_perm(matrix: &CscMatrix) -> Result<(Vec<usize>, usize), FeralError> {
    let cache = compute_matching(matrix)?;
    Ok((cache.perm, cache.n_matched))
}

/// Cached MC64 output: the full Hungarian matching plus the
/// column-max normalization, from which the scaling vector can be
/// recovered without rerunning the expensive Hungarian kernel.
///
/// Populated by [`compute_matching`] when `LdltCompress` preprocessing
/// runs; consumed by [`scaling_from_cache`] in the numeric phase when
/// the caller's scaling strategy resolves to `Mc64Symmetric`. Moves
/// the ~70% of symbolic overhead (Hungarian + cost graph build) off
/// the critical path for matrices where both compression and MC64
/// scaling run.
#[derive(Debug, Clone)]
pub(crate) struct Mc64Cache {
    pub perm: Vec<usize>,
    pub u: Vec<f64>,
    pub v: Vec<f64>,
    pub cmax: Vec<f64>,
    pub n_matched: usize,
}

/// Probe counter: number of times `compute_matching` ran the full
/// Hungarian pipeline. Process-global; readable from any thread.
/// Used by the value-bounded-cache investigation (see
/// `dev/research/mc64-value-bounded-cache-2026-05-17.md`) to confirm
/// whether MC64 dominates warm IPM wall in the live ipopt-feral path,
/// not just on the pounce-dumped corpus. Set `FERAL_MC64_TRACE=1` in
/// the environment to also stream per-call wall time to stderr.
pub static MC64_RECOMPUTE_COUNT: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);

/// Run the expensive MC64 pipeline — build the cost graph and run
/// the Hungarian kernel — and return the full output. The cheap
/// scaling-vector post-processing is in [`scaling_from_cache`].
pub(crate) fn compute_matching(matrix: &CscMatrix) -> Result<Mc64Cache, FeralError> {
    let n = matrix.n;
    if n == 0 {
        return Ok(Mc64Cache {
            perm: Vec::new(),
            u: Vec::new(),
            v: Vec::new(),
            cmax: Vec::new(),
            n_matched: 0,
        });
    }
    MC64_RECOMPUTE_COUNT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let trace = matches!(
        std::env::var("FERAL_MC64_TRACE").as_deref(),
        Ok("1") | Ok("on")
    );
    let t0 = if trace {
        Some(std::time::Instant::now())
    } else {
        None
    };

    let (cost_graph, cmax) = build_cost_graph(matrix)?;
    let Matching {
        perm,
        u,
        v,
        n_matched,
    } = hungarian_match(&cost_graph);

    if let Some(t0) = t0 {
        let ms = t0.elapsed().as_secs_f64() * 1e3;
        let count = MC64_RECOMPUTE_COUNT.load(std::sync::atomic::Ordering::Relaxed);
        eprintln!(
            "[feral mc64] call #{} n={} nnz={} matching {:.1} ms",
            count,
            n,
            matrix.row_idx.len(),
            ms,
        );
    }

    Ok(Mc64Cache {
        perm,
        u,
        v,
        cmax,
        n_matched,
    })
}

pub(crate) fn compute_symmetric(matrix: &CscMatrix) -> Result<(Vec<f64>, ScalingInfo), FeralError> {
    let cache = compute_matching(matrix)?;
    Ok(scaling_from_cache(&cache))
}

/// Cheap O(n) post-processing that turns a cached MC64 matching into
/// the symmetric scaling vector. Mirrors the body of
/// [`compute_symmetric`] from step 6 onward.
pub(crate) fn scaling_from_cache(cache: &Mc64Cache) -> (Vec<f64>, ScalingInfo) {
    let n = cache.perm.len();
    if n == 0 {
        return (Vec::new(), ScalingInfo::Applied);
    }
    let Mc64Cache {
        perm,
        u,
        v,
        cmax,
        n_matched,
    } = cache;

    // Step 6-7: unwind normalization and form the symmetric average.
    //
    //   rscaling[i] = u[i]
    //   cscaling[i] = v[i] - cmax[i]
    //   s[i]        = exp((rscaling[i] + cscaling[i]) / 2)
    //               = exp((u[i] + v[i] - cmax[i]) / 2)
    //
    // Matches SPRAL scaling.f90:681-682 followed by :169.
    let mut scaling = vec![1.0_f64; n];
    for i in 0..n {
        // If the column had no usable entries at all, cmax[i] is
        // `f64::NEG_INFINITY` (see `build_cost_graph`). Any such
        // index is "empty" — the Hungarian kernel cannot match
        // that column meaningfully — so we fall back to identity
        // scaling for it. This is the structurally empty-column
        // case from the research note.
        if !cmax[i].is_finite() {
            scaling[i] = 1.0;
            continue;
        }

        // For unmatched columns, fall back to identity scaling
        // rather than using the dual variables (which are
        // meaningless on the unmatched part of the graph).
        if perm[i] == usize::MAX {
            scaling[i] = 1.0;
            continue;
        }

        let mut arg = (u[i] + v[i] - cmax[i]) / 2.0;

        // Clamp to avoid overflow on `exp`. A dual variable can
        // grow to ±∞-ish magnitudes on pathological inputs; both
        // MUMPS (dana_aux.F:1797-1816) and SSIDS guard against
        // this. The clamp is symmetric so that a clamped row
        // exponentiates to a very large or very small but finite
        // value rather than `+∞` or `0`.
        if !arg.is_finite() {
            scaling[i] = 1.0;
            continue;
        }
        arg = arg.clamp(-LOG_HUGE, LOG_HUGE);

        let s = arg.exp();

        // Defensive rewrite: a zero or non-finite scaling would
        // annihilate a whole row/column and destroy symmetry.
        // Mirrors MUMPS dana_aux.F:1809-1811.
        if s == 0.0 || !s.is_finite() {
            scaling[i] = 1.0;
        } else {
            scaling[i] = s;
        }
    }

    let info = if *n_matched == n {
        ScalingInfo::Applied
    } else {
        ScalingInfo::PartialSingular {
            n_unmatched: n - *n_matched,
        }
    };

    (scaling, info)
}

/// Build the Hungarian cost graph and per-column maximum (`cmax`).
///
/// Expands the lower-triangle CSC `matrix` to a full symmetric
/// pattern, drops explicit-zero entries, takes the log of the
/// absolute value of each remaining entry, and normalizes each
/// column by subtracting its maximum so that the resulting costs
/// are non-negative.
///
/// Returns `(CostGraph, cmax)` where `cmax[j]` is the pre-
/// normalization column maximum (i.e., `max_i log|a[i,j]|`) used
/// in step 6 of `compute_symmetric` to unwind the normalization.
/// Columns with no finite (non-zero) entries have
/// `cmax[j] = f64::NEG_INFINITY`, which the caller treats as a
/// "fall back to identity" signal.
///
/// Algorithmic mirror: `ref/spral/src/scaling.f90:636-657`.
fn build_cost_graph(matrix: &CscMatrix) -> Result<(CostGraph, Vec<f64>), FeralError> {
    let n = matrix.n;

    // Two-pass expansion: first count the non-zero entries per
    // expanded column, then fill in the rows and values.
    //
    // For each stored lower-triangle entry at (row=i, col=j):
    //   * if val != 0 and i == j: contributes one entry to column j.
    //   * if val != 0 and i > j:  contributes to both column j (row i)
    //                             and column i (row j).
    //
    // Zero entries are dropped at the counting step so `log 0`
    // never appears.
    let mut col_counts = vec![0usize; n];
    for j in 0..n {
        for k in matrix.col_ptr[j]..matrix.col_ptr[j + 1] {
            let i = matrix.row_idx[k];
            let val = matrix.values[k];
            if val == 0.0 {
                continue;
            }
            col_counts[j] += 1;
            if i != j {
                col_counts[i] += 1;
            }
        }
    }

    // Prefix sum to column pointers.
    let mut col_ptr = vec![0usize; n + 1];
    for j in 0..n {
        col_ptr[j + 1] = col_ptr[j] + col_counts[j];
    }
    let nnz_full = col_ptr[n];

    let mut row_idx = vec![0usize; nnz_full];
    let mut cost = vec![0.0_f64; nnz_full];
    let mut offsets: Vec<usize> = col_ptr[..n].to_vec();

    // Second pass: place entries.
    for j in 0..n {
        for k in matrix.col_ptr[j]..matrix.col_ptr[j + 1] {
            let i = matrix.row_idx[k];
            let val = matrix.values[k];
            if val == 0.0 {
                continue;
            }
            let logabs = val.abs().ln();
            // (i, j) stays in column j.
            let p = offsets[j];
            row_idx[p] = i;
            cost[p] = logabs;
            offsets[j] += 1;
            // (j, i) transpose entry, if off-diagonal.
            if i != j {
                let q = offsets[i];
                row_idx[q] = j;
                cost[q] = logabs;
                offsets[i] += 1;
            }
        }
    }

    // Sort each column's rows ascending (Hungarian kernel does not
    // strictly require this, but a predictable order makes the
    // greedy initialization deterministic and matches SPRAL's
    // behaviour after `half_to_full`).
    for j in 0..n {
        let start = col_ptr[j];
        let end = col_ptr[j + 1];
        let mut pairs: Vec<(usize, f64)> = (start..end).map(|k| (row_idx[k], cost[k])).collect();
        pairs.sort_by_key(|&(r, _)| r);
        for (k, (r, c)) in (start..end).zip(pairs) {
            row_idx[k] = r;
            cost[k] = c;
        }
    }

    // Column-max normalization: for each column, find the maximum
    // log-absolute value and subtract it from every entry in that
    // column. Entries of an all-zero column are absent, so the
    // `cmax` for an empty column is `f64::NEG_INFINITY` and its
    // range is already empty — nothing to normalize.
    let mut cmax = vec![f64::NEG_INFINITY; n];
    for j in 0..n {
        let start = col_ptr[j];
        let end = col_ptr[j + 1];
        if start == end {
            continue;
        }
        let mut m = cost[start];
        for &c in &cost[(start + 1)..end] {
            if c > m {
                m = c;
            }
        }
        cmax[j] = m;
        for c in &mut cost[start..end] {
            *c = m - *c;
        }
    }

    let graph = CostGraph {
        n,
        col_ptr,
        row_idx,
        cost,
    };
    Ok((graph, cmax))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Diagonal SPD: expansion is a no-op, cost is all zero after
    /// normalization, Hungarian returns identity matching with
    /// `u = v = 0`, unwinding gives `scaling[i] = exp(-log A_ii / 2)
    /// = 1/sqrt(A_ii)`, and the scaled diagonal is 1.
    #[test]
    fn diagonal_matrix_produces_inverse_sqrt_scaling() {
        let csc = CscMatrix::from_triplets(3, &[0, 1, 2], &[0, 1, 2], &[2.0, 3.0, 5.0]).unwrap();
        let (s, info) = compute_symmetric(&csc).unwrap();
        assert_eq!(info, ScalingInfo::Applied);
        let expected = [
            1.0 / 2.0_f64.sqrt(),
            1.0 / 3.0_f64.sqrt(),
            1.0 / 5.0_f64.sqrt(),
        ];
        for i in 0..3 {
            assert!(
                (s[i] - expected[i]).abs() < 1e-12,
                "s[{}] = {}, expected {}",
                i,
                s[i],
                expected[i]
            );
        }
    }

    /// Empty 0×0 matrix returns an empty scaling vector.
    #[test]
    fn empty_matrix_returns_empty_scaling() {
        let csc = CscMatrix {
            n: 0,
            col_ptr: vec![0],
            row_idx: vec![],
            values: vec![],
        };
        let (s, info) = compute_symmetric(&csc).unwrap();
        assert!(s.is_empty());
        assert_eq!(info, ScalingInfo::Applied);
    }
}