Skip to main content

gam_models/
penalized_vector_glm.rs

1//! Generic penalized vector-response GLM Newton solver (fixed λ).
2//!
3//! This is the shared scaffold extracted from
4//! [`crate::multinomial::fit_penalized_multinomial`] (dense softmax
5//! Fisher block) and
6//! [`crate::binomial_multi::fit_penalized_binomial_multi`]
7//! (row-diagonal independent-binomial Fisher block). Both families fit a
8//! penalized vector-response GLM with a shared design `X ∈ ℝ^{N×P}` and a
9//! shared penalty `S ∈ ℝ^{P×P}` replicated per output, differing **only** in
10//! the per-row Fisher-block algebra and the likelihood/residual. Everything
11//! else — input validation, penalized objective / gradient / Hessian assembly,
12//! damped Newton with backtracking, the relative-step convergence test, and the
13//! final penalized-objective / deviance tally — is written once here.
14//!
15//! # Fit problem
16//!
17//! With `β = [β_0; β_1; …; β_{M-1}]` stacked in output-major order
18//! (`β_a ∈ ℝ^P` is the coefficient block for output `a`), minimise the
19//! penalized negative log-likelihood
20//!
21//! ```text
22//!   F(β) = − log L(β) + ½ Σ_{a=0}^{M-1} λ_a · β_aᵀ S β_a
23//! ```
24//!
25//! where `log L` and its η-derivatives are supplied by the family's
26//! [`VectorLikelihood`] adapter and `λ_a` is a per-output smoothing parameter
27//! scaling the shared penalty `S`. The active linear predictor is
28//! `η_{n,a} = (X β_a)_n`, shape `(N, M)`.
29//!
30//! # Newton step
31//!
32//! Each iteration assembles the coupled penalized Hessian and gradient in
33//! output-major coefficient ordering `flat[a·P + i] = β[i, a]` (matching
34//! [`gam_solve::pirls::dense_block_xtwx`]):
35//!
36//! ```text
37//!   H[a·P + i, b·P + j] = Σ_n W_{n,a,b} · X[n,i] · X[n,j]   (+ δ_{ab} λ_a S[i,j])
38//!   g[a·P + i]          = Σ_n r_{n,a} · X[n,i]              (+ λ_a (S β_a)[i])
39//! ```
40//!
41//! with the per-row Fisher block `W_{n,·,·} = −∂² log L / ∂η ∂η` (the family's
42//! [`VectorLikelihood::hess_block`], or a caller override) and the residual
43//! `r_{n,a} = −∂ log L / ∂η_a` (`−`[`VectorLikelihood::grad_eta`]). The step
44//! `δ = − H^{-1} g` is solved through faer's symmetric-PD-with-fallback
45//! factorisation under an adaptive Levenberg–Marquardt ridge: when a
46//! rank-deficient block (collinear / quasi-separated columns under a small
47//! per-output λ) makes the Bunch–Kaufman fallback back-substitute through
48//! near-zero pivots into a non-finite δ, a diagonal ridge `τ·I` — scaled by the
49//! Hessian's largest diagonal so it is curvature-scale invariant — is added and
50//! the system re-solved, escalating τ geometrically until δ is finite. The
51//! step is then accepted by a backtracking line search on `F` (full step first,
52//! halve up to 8 times). Because the line search validates against the
53//! *unridged* objective `F`, the ridge never biases the converged β̂ (at the
54//! optimum the gradient vanishes and δ → 0 for any τ). Convergence is the
55//! relative coefficient step `‖δ‖ / (1 + ‖β‖) ≤ tol`.
56//!
57//! # Fisher-block override
58//!
59//! When `fisher_w_override` is `Some`, each Newton step uses the supplied
60//! per-row `(N, M, M)` curvature block in place of the analytic
61//! [`VectorLikelihood::hess_block`]; the gradient/residual path stays analytic
62//! (issue #349). The two families differ in what they accept off the diagonal:
63//! multinomial admits a full dense block, while independent-binomial columns
64//! only consume the per-output diagonal (a non-zero cross term cannot be
65//! represented by the separable columns). That family-specific precondition is
66//! enforced by the adapter before it constructs the override view; the engine
67//! consumes whatever block it is given.
68
69use gam_linalg::faer_ndarray::{FaerArrayView, array2_to_matmut, factorize_symmetricwith_fallback};
70use crate::vector_response::VectorLikelihood;
71use crate::model_types::EstimationError;
72use gam_solve::pirls::dense_block_xtwx;
73use faer::Side;
74use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayView3};
75
76/// Base Levenberg–Marquardt ridge as a fraction of the penalized Hessian's
77/// largest diagonal entry (so it is invariant to the problem's overall
78/// curvature scale). At ~1e-10 of the dominant curvature it is negligible
79/// relative to identified-direction curvature — it never biases the identified
80/// optimum (at β̂ the unridged gradient still vanishes there) — yet large
81/// enough to lift an exactly rank-deficient null direction off zero so the
82/// Bunch–Kaufman fallback yields a finite, descent Newton step (gam#856).
83const BASE_RIDGE_FRACTION_OF_MAX_DIAG: f64 = 1.0e-10;
84
85/// Geometric ridge-escalation budget for a single Newton step. 30 doublings
86/// span ~9 orders of magnitude over the base ridge, which covers any
87/// conditioning a finite-curvature softmax/binomial block can present.
88const MAX_RIDGE_ESCALATIONS: usize = 30;
89
90/// Backtracking budget for the damped-Newton line search: full step first, then
91/// halve up to this many times if the penalized objective fails to decrease.
92const MAX_BACKTRACKS: usize = 8;
93
94/// Per-step line-search contraction factor (halving).
95const LINE_SEARCH_SHRINK: f64 = 0.5;
96
97/// Slack on the "objective decreased" acceptance test, absorbing floating-point
98/// round-off so a step that is flat to machine precision is not rejected.
99const OBJECTIVE_DECREASE_SLACK: f64 = 1.0e-12;
100
101/// First-order optimality gate (gam#856) as a fraction of `1 + max_diag`: the
102/// unridged penalized gradient norm must fall below this curvature-scaled
103/// threshold before convergence is declared, certifying stationarity on the
104/// identified subspace rather than a premature step-norm stall.
105const OPTIMALITY_GRAD_FRACTION: f64 = 1.0e-6;
106
107/// Class-space metric of the replicated smoothing penalty (#1587).
108///
109/// * `Diagonal` — the historical `diag_a(λ_a) ⊗ S`: each active output's
110///   coefficient block is penalised independently. Correct for genuinely
111///   independent outputs (independent-binomial columns), but for a *softmax*
112///   multinomial it penalises the reference-anchored log-odds contrasts
113///   `η_a = log(p_a/p_ref)`, so the fit is NOT invariant to the arbitrary
114///   reference-class choice (#1587).
115/// * `Centered` — the reference-symmetric `λ · ((I_{M} − J_{M}/K) ⊗ S)` with a
116///   single shared `λ` (= `lambdas[0]`; the caller must pass uniform `lambdas`)
117///   and `K = M + 1`. This is exactly the symmetric CLR penalty
118///   `Σ_{k=0}^{K-1} β̃_kᵀ S β̃_k` (with `Σ_k β̃_k = 0`) written in the active-class
119///   (ALR) gauge — invariant to which class is the baseline (the multinomial
120///   analogue of #1549's `G^{1/2}` Aitchison whitening). Couples the class
121///   blocks via the `−(λ/K)·S` off-diagonals; the engine already factors a
122///   class-coupled Hessian (the softmax Fisher block is dense), so this is a
123///   penalty-assembly change only.
124#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
125pub enum ClassPenaltyMetric {
126    /// Independent per-output penalty `diag_a(λ_a) ⊗ S` (historical default).
127    #[default]
128    Diagonal,
129    /// Reference-symmetric centered penalty `λ·((I − J/K) ⊗ S)`, `K = M + 1`.
130    Centered,
131}
132
133/// Inputs to [`fit_penalized_vector_glm`].
134///
135/// `M` (the number of active outputs / linear-predictor columns) is taken from
136/// `lambdas.len()`; the engine validates it against the design and override
137/// shapes. The response `y` is passed verbatim to the [`VectorLikelihood`]
138/// adapter, which owns its own `(N, ·)` shape contract (binomial columns use
139/// `K = M`; multinomial one-hot uses `K = M + 1`), so the engine does not
140/// constrain its column count beyond `y.nrows() == N`.
141pub struct PenalizedVectorGlmInputs<'a> {
142    /// Design matrix `X ∈ ℝ^{N×P}` (one row per observation, shared across
143    /// every output column).
144    pub design: ArrayView2<'a, f64>,
145    /// Response `Y ∈ ℝ^{N×·}`, interpreted by the [`VectorLikelihood`].
146    pub y: ArrayView2<'a, f64>,
147    /// Shared smoothing penalty `S ∈ ℝ^{P×P}` (symmetric, PSD).
148    pub penalty: ArrayView2<'a, f64>,
149    /// Per-output smoothing parameter `λ_a`, length `M`.
150    pub lambdas: ArrayView1<'a, f64>,
151    /// Optional per-row Fisher-block override, shape `(N, M, M)`. When `Some`,
152    /// it replaces the analytic [`VectorLikelihood::hess_block`] as the Newton
153    /// curvature; the gradient/residual path stays analytic (issue #349). The
154    /// adapter is responsible for any family-specific structural precondition
155    /// on the block (e.g. zero off-diagonals for independent columns).
156    pub fisher_w_override: Option<ArrayView3<'a, f64>>,
157    /// Maximum Newton iterations; recommend 50.
158    pub max_iter: usize,
159    /// Relative-step convergence tolerance; recommend 1e-7.
160    pub tol: f64,
161    /// Class-space metric of the replicated penalty (#1587). `Diagonal`
162    /// preserves the historical independent-per-output penalty; `Centered`
163    /// selects the reference-symmetric softmax penalty (requires uniform
164    /// `lambdas`). See [`ClassPenaltyMetric`].
165    pub class_penalty_metric: ClassPenaltyMetric,
166}
167
168/// Outputs of [`fit_penalized_vector_glm`].
169pub struct PenalizedVectorGlmOutputs {
170    /// Coefficient matrix, shape `(P, M)` (column `a` is `β_a`).
171    pub coefficients: Array2<f64>,
172    /// Final active linear predictor `η = X β̂`, shape `(N, M)`. The adapter
173    /// turns this into fitted probabilities via its own inverse link.
174    pub eta: Array2<f64>,
175    /// Number of Newton iterations executed (including the final step that
176    /// satisfied the tolerance).
177    pub iterations: usize,
178    /// `true` if the relative-step test was satisfied before `max_iter`.
179    pub converged: bool,
180    /// Unpenalized log-likelihood `log L(β̂)`.
181    pub log_likelihood: f64,
182    /// Penalty term `½ Σ_a λ_a · β̂_aᵀ S β̂_a` at the returned `β̂`.
183    pub penalty_term: f64,
184}
185
186/// Quadratic form `½ β_aᵀ S β_a` accumulated across outputs with per-output
187/// weight `λ_a`. Shared by the objective evaluator and the final tally.
188fn weighted_penalty_sum(
189    beta: &Array2<f64>,
190    penalty: ArrayView2<'_, f64>,
191    lambdas: ArrayView1<'_, f64>,
192    metric: ClassPenaltyMetric,
193) -> f64 {
194    let (p, m) = beta.dim();
195    match metric {
196        ClassPenaltyMetric::Diagonal => {
197            let mut pen = 0.0_f64;
198            for a in 0..m {
199                let la = lambdas[a];
200                if la == 0.0 {
201                    continue;
202                }
203                let beta_col = beta.column(a);
204                let mut quad = 0.0_f64;
205                for i in 0..p {
206                    let mut s_beta_i = 0.0_f64;
207                    for j in 0..p {
208                        s_beta_i += penalty[[i, j]] * beta_col[j];
209                    }
210                    quad += beta_col[i] * s_beta_i;
211                }
212                pen += 0.5 * la * quad;
213            }
214            pen
215        }
216        // Centered (#1587): ½·λ·[ Σ_a β_aᵀSβ_a − (1/K)·gᵀSg ], g = Σ_a β_a,
217        // K = M + 1. Equals the symmetric CLR penalty Σ_k β̃_kᵀSβ̃_k (Σβ̃=0) in
218        // the active-class gauge — reference-invariant. Shared λ = lambdas[0].
219        ClassPenaltyMetric::Centered => {
220            if m == 0 {
221                return 0.0;
222            }
223            let lam = lambdas[0];
224            if lam == 0.0 {
225                return 0.0;
226            }
227            let k = (m + 1) as f64;
228            // g = Σ_a β_a (the active-class coefficient sum, a p-vector).
229            let mut g = vec![0.0_f64; p];
230            for a in 0..m {
231                let col = beta.column(a);
232                for i in 0..p {
233                    g[i] += col[i];
234                }
235            }
236            // Σ_a β_aᵀSβ_a.
237            let mut sum_quad = 0.0_f64;
238            for a in 0..m {
239                let col = beta.column(a);
240                for i in 0..p {
241                    let mut s_beta_i = 0.0_f64;
242                    for j in 0..p {
243                        s_beta_i += penalty[[i, j]] * col[j];
244                    }
245                    sum_quad += col[i] * s_beta_i;
246                }
247            }
248            // gᵀSg.
249            let mut g_quad = 0.0_f64;
250            for i in 0..p {
251                let mut s_g_i = 0.0_f64;
252                for j in 0..p {
253                    s_g_i += penalty[[i, j]] * g[j];
254                }
255                g_quad += g[i] * s_g_i;
256            }
257            0.5 * lam * (sum_quad - g_quad / k)
258        }
259    }
260}
261
262/// Fit a penalized vector-response GLM at fixed `λ` via damped Newton.
263///
264/// The `likelihood` adapter supplies the per-row Fisher block, the residual
265/// gradient, and the log-likelihood; the engine owns the entire optimisation
266/// scaffold. See the module docs for the optimisation problem, the
267/// output-major coefficient ordering, and the convergence semantics.
268///
269/// `context` is woven into every diagnostic message so each family keeps its
270/// own error prefix (e.g. `"fit_penalized_multinomial"`).
271pub fn fit_penalized_vector_glm<L: VectorLikelihood>(
272    inputs: PenalizedVectorGlmInputs<'_>,
273    likelihood: &L,
274    context: &str,
275) -> Result<PenalizedVectorGlmOutputs, EstimationError> {
276    let PenalizedVectorGlmInputs {
277        design,
278        y,
279        penalty,
280        lambdas,
281        fisher_w_override,
282        max_iter,
283        tol,
284        class_penalty_metric,
285    } = inputs;
286
287    // ────────────────────────────── shape checks ──────────────────────────
288    let n_obs = design.nrows();
289    let p = design.ncols();
290    if n_obs == 0 || p == 0 {
291        crate::bail_invalid_estim!("{context}: design must be nonempty (got {n_obs}x{p})");
292    }
293    let m = lambdas.len();
294    if m == 0 {
295        crate::bail_invalid_estim!("{context}: need at least one active output (got M=0)");
296    }
297    if y.nrows() != n_obs {
298        crate::bail_invalid_estim!("{context}: y rows {} ≠ design rows {n_obs}", y.nrows());
299    }
300    if penalty.dim() != (p, p) {
301        crate::bail_invalid_estim!(
302            "{context}: penalty shape {:?} ≠ (P, P) = ({p}, {p})",
303            penalty.dim()
304        );
305    }
306    for (i, &v) in lambdas.iter().enumerate() {
307        if !(v.is_finite() && v >= 0.0) {
308            crate::bail_invalid_estim!("{context}: lambdas[{i}] must be finite and ≥ 0 (got {v})");
309        }
310    }
311    if let Some(fw) = fisher_w_override.as_ref() {
312        if fw.dim() != (n_obs, m, m) {
313            crate::bail_invalid_estim!(
314                "{context}: fisher_w_override shape {:?} ≠ (N, M, M) = ({n_obs}, {m}, {m})",
315                fw.dim()
316            );
317        }
318    }
319    for ((i, j), &v) in design.indexed_iter() {
320        if !v.is_finite() {
321            crate::bail_invalid_estim!("{context}: design[{i},{j}] must be finite (got {v})");
322        }
323    }
324
325    // ────────────────────────── Newton iteration ──────────────────────────
326    // β stored as (P, M) column-major-per-output; flat index uses output-major
327    // ordering `flat[a · P + i] = β[i, a]` to align with `dense_block_xtwx`.
328    let mut beta = Array2::<f64>::zeros((p, m));
329    let mut eta = Array2::<f64>::zeros((n_obs, m));
330    // Reused η scratch for the line-search objective probes (see
331    // `evaluate_objective`): overwritten in full on every call, so it carries
332    // no state between calls and hoisting it out of the backtracking loop is a
333    // pure heap-allocation removal with no effect on the computed objective.
334    let mut eta_objective_scratch = Array2::<f64>::zeros((n_obs, m));
335    let beta_flat_dim = p * m;
336    // Reused penalized-gradient buffer: each Newton iteration writes every entry
337    // `grad_flat[a·p + i] = Xᵀr` (direct assignment over all a∈0..m, i∈0..p)
338    // before adding the penalty term and before any read, so it carries no state
339    // across iterations and hoisting it out of the Newton loop is a pure
340    // heap-allocation removal with no effect on the computed gradient.
341    let mut grad_flat = Array1::<f64>::zeros(beta_flat_dim);
342
343    let mut iterations = 0usize;
344    let mut converged = false;
345    let mut last_objective = f64::INFINITY;
346
347    // η = X · β for the current β, reused by the analytic Fisher / gradient.
348    let recompute_eta = |beta: &Array2<f64>, eta: &mut Array2<f64>| {
349        for a in 0..m {
350            let beta_col = beta.column(a);
351            for row in 0..n_obs {
352                let mut eta_val = 0.0_f64;
353                for i in 0..p {
354                    eta_val += design[[row, i]] * beta_col[i];
355                }
356                eta[[row, a]] = eta_val;
357            }
358        }
359    };
360
361    // Penalized objective F(β) = − log L(X β) + ½ Σ_a λ_a β_aᵀ S β_a.
362    // The caller supplies a reused `(n_obs, m)` scratch for η = X·β so the
363    // backtracking line search (which calls this up to `MAX_BACKTRACKS + 1`
364    // times per Newton iteration) does not heap-allocate a fresh η buffer on
365    // every probe. The scratch is overwritten in full by `recompute_eta` before
366    // it is read, so reusing it is bit-for-bit identical to the prior
367    // allocate-fresh body: `recompute_eta` runs the SAME `Σ_i design·β` loop in
368    // the SAME order this closure used inline.
369    let evaluate_objective = |beta_trial: &Array2<f64>, eta_scratch: &mut Array2<f64>| -> f64 {
370        recompute_eta(beta_trial, eta_scratch);
371        let ll = likelihood.log_lik(eta_scratch.view(), y);
372        let pen = weighted_penalty_sum(beta_trial, penalty, lambdas, class_penalty_metric);
373        -ll + pen
374    };
375
376    for iter in 0..max_iter {
377        iterations = iter + 1;
378
379        recompute_eta(&beta, &mut eta);
380
381        // Per-row dense Fisher block W_{n,a,b} = −∂² log L / ∂η_a ∂η_b: either
382        // the caller-supplied curvature override (issue #349 escape-hatch —
383        // curvature only) or the analytic [`VectorLikelihood::hess_block`]. The
384        // residual r_{n,a} = −∂ log L / ∂η_a stays analytic in both cases.
385        let analytic_fisher = fisher_w_override
386            .as_ref()
387            .map_or_else(|| Some(likelihood.hess_block(eta.view(), y)), |_| None);
388        let fisher_blocks = match fisher_w_override.as_ref() {
389            Some(fw) => *fw,
390            None => analytic_fisher
391                .as_ref()
392                .expect("analytic Fisher computed when no override")
393                .view(),
394        };
395        let residual = likelihood.grad_eta(eta.view(), y).mapv(|v| -v);
396
397        // Penalized Hessian: H = block(XᵀWX) + diag_a(λ_a S).
398        let mut hessian = dense_block_xtwx(design, fisher_blocks, None)?;
399        if hessian.nrows() != beta_flat_dim || hessian.ncols() != beta_flat_dim {
400            crate::bail_invalid_estim!(
401                "{context}: assembled Hessian shape {:?} ≠ ({beta_flat_dim}, {beta_flat_dim})",
402                hessian.dim()
403            );
404        }
405        match class_penalty_metric {
406            ClassPenaltyMetric::Diagonal => {
407                for a in 0..m {
408                    let la = lambdas[a];
409                    if la == 0.0 {
410                        continue;
411                    }
412                    let base = a * p;
413                    for i in 0..p {
414                        for j in 0..p {
415                            hessian[[base + i, base + j]] += la * penalty[[i, j]];
416                        }
417                    }
418                }
419            }
420            // Centered (#1587): H_{ab} += λ·(δ_ab − 1/K)·S, K = M+1, shared
421            // λ = lambdas[0] — couples every class pair via the −(λ/K)·S
422            // off-diagonals. Reference-invariant softmax penalty.
423            ClassPenaltyMetric::Centered if m > 0 && lambdas[0] != 0.0 => {
424                let lam = lambdas[0];
425                let inv_k = 1.0 / ((m + 1) as f64);
426                for a in 0..m {
427                    for b in 0..m {
428                        let coef = lam * (if a == b { 1.0 } else { 0.0 } - inv_k);
429                        let (ba, bb) = (a * p, b * p);
430                        for i in 0..p {
431                            for j in 0..p {
432                                hessian[[ba + i, bb + j]] += coef * penalty[[i, j]];
433                            }
434                        }
435                    }
436                }
437            }
438            ClassPenaltyMetric::Centered => {}
439        }
440
441        // Penalized gradient: g_a = Xᵀ r_{·,a} + (penalty gradient). For the
442        // Diagonal metric that is `λ_a S β_a`; for Centered it is the
443        // reference-symmetric `λ S (β_a − β̄)`, β̄ = (1/K) Σ_b β_b (#1587).
444        // Written into the reused `grad_flat` buffer; the loop below assigns
445        // every entry (`=`) before the penalty `+=`, so no re-zeroing is needed.
446        for a in 0..m {
447            for i in 0..p {
448                let mut acc = 0.0_f64;
449                for row in 0..n_obs {
450                    acc += design[[row, i]] * residual[[row, a]];
451                }
452                grad_flat[a * p + i] = acc;
453            }
454        }
455        match class_penalty_metric {
456            ClassPenaltyMetric::Diagonal => {
457                for a in 0..m {
458                    let la = lambdas[a];
459                    if la == 0.0 {
460                        continue;
461                    }
462                    let beta_col = beta.column(a);
463                    for i in 0..p {
464                        let mut s_beta_i = 0.0_f64;
465                        for j in 0..p {
466                            s_beta_i += penalty[[i, j]] * beta_col[j];
467                        }
468                        grad_flat[a * p + i] += la * s_beta_i;
469                    }
470                }
471            }
472            ClassPenaltyMetric::Centered if m > 0 && lambdas[0] != 0.0 => {
473                let lam = lambdas[0];
474                let inv_k = 1.0 / ((m + 1) as f64);
475                // β̄ = (1/K) Σ_b β_b.
476                let mut bbar = vec![0.0_f64; p];
477                for b in 0..m {
478                    let col = beta.column(b);
479                    for i in 0..p {
480                        bbar[i] += col[i];
481                    }
482                }
483                for v in bbar.iter_mut() {
484                    *v *= inv_k;
485                }
486                for a in 0..m {
487                    let beta_col = beta.column(a);
488                    for i in 0..p {
489                        // (S (β_a − β̄))_i.
490                        let mut s_centered_i = 0.0_f64;
491                        for j in 0..p {
492                            s_centered_i += penalty[[i, j]] * (beta_col[j] - bbar[j]);
493                        }
494                        grad_flat[a * p + i] += lam * s_centered_i;
495                    }
496                }
497            }
498            ClassPenaltyMetric::Centered => {}
499        }
500
501        // δ = − H^{-1} · grad, solved through an adaptive Levenberg–Marquardt
502        // ridge. The penalized Hessian `H = block(XᵀWX) + diag_a(λ_a S)` can be
503        // rank-deficient — a multinomial class block with quasi-separated /
504        // collinear columns and a small per-class λ leaves `XᵀW_aX + λ_a S`
505        // singular. faer's symmetric fallback chain ends at Bunch–Kaufman
506        // (LBLᵀ), which factorizes indefinite/singular matrices "successfully"
507        // and then back-substitutes through near-zero pivots, yielding a
508        // non-finite δ. Rather than aborting the whole fit on one bad block, we
509        // add a small ridge `τ·I` (Levenberg style) to the diagonal and
510        // re-factorize, escalating τ geometrically until the step is finite.
511        //
512        // The base ridge is scaled by the Hessian's largest diagonal entry so
513        // it is invariant to the problem's overall curvature scale: a tiny
514        // nudge relative to the dominant curvature, large enough to lift the
515        // null directions off zero. A finite δ from the ridged system is a
516        // descent direction for the *unridged* penalized objective `F`
517        // (ridging only shrinks the step toward the gradient direction), and
518        // the backtracking line search below validates it against `F` itself,
519        // so the ridge never biases the converged β̂ — at the optimum the
520        // gradient vanishes and the step → 0 regardless of τ.
521        let max_diag =
522            (0..beta_flat_dim).fold(0.0_f64, |acc, idx| acc.max(hessian[[idx, idx]].abs()));
523        // The ridge floors at `base_ridge` (not 0) for every solve. An exactly
524        // rank-deficient block (e.g. duplicate / collinear design columns under
525        // a near-zero λ) leaves `H = block(XᵀWX) + diag_a(λ_a S)` singular along
526        // a null direction. faer's Bunch–Kaufman fallback factorizes a singular
527        // matrix "successfully" and back-substitutes through the zero pivot to a
528        // *finite but arbitrary* component in the null space, so the resulting
529        // Newton direction is not a descent direction in the identified
530        // subspace — the line search then shrinks α toward 0 and the step-norm
531        // test declares a false convergence at a point where the unridged
532        // penalized gradient on identified directions is still large (gam#856).
533        // A minimal Tikhonov ridge `base_ridge·I` resolves the null direction to
534        // its minimum-norm representative, giving a true descent direction.
535        let base_ridge = if max_diag.is_finite() && max_diag > 0.0 {
536            max_diag * BASE_RIDGE_FRACTION_OF_MAX_DIAG
537        } else {
538            BASE_RIDGE_FRACTION_OF_MAX_DIAG
539        };
540        let mut delta = Array1::<f64>::zeros(beta_flat_dim);
541        let mut ridge = base_ridge;
542        let mut solved = false;
543        for attempt in 0..=MAX_RIDGE_ESCALATIONS {
544            let mut ridged = hessian.clone();
545            if ridge > 0.0 {
546                for idx in 0..beta_flat_dim {
547                    ridged[[idx, idx]] += ridge;
548                }
549            }
550            let factor = match factorize_symmetricwith_fallback(
551                FaerArrayView::new(&ridged).as_ref(),
552                Side::Lower,
553            ) {
554                Ok(factor) => factor,
555                Err(err) => {
556                    // A genuine factorization failure (not just a singular
557                    // pivot) — escalate the ridge and retry; only give up after
558                    // exhausting the escalation budget.
559                    if attempt == MAX_RIDGE_ESCALATIONS {
560                        return Err(EstimationError::InvalidInput(format!(
561                            "{context}: Hessian factorization failed at iter {iter} \
562                             even with ridge {ridge:.3e}: {err}"
563                        )));
564                    }
565                    ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
566                    continue;
567                }
568            };
569            let mut rhs = Array2::<f64>::zeros((beta_flat_dim, 1));
570            for i in 0..beta_flat_dim {
571                rhs[[i, 0]] = -grad_flat[i];
572            }
573            {
574                let rhs_view = array2_to_matmut(&mut rhs);
575                factor.solve_in_place(rhs_view);
576            }
577            if (0..beta_flat_dim).all(|i| rhs[[i, 0]].is_finite()) {
578                for i in 0..beta_flat_dim {
579                    delta[i] = rhs[[i, 0]];
580                }
581                solved = true;
582                break;
583            }
584            // Singular pivots back-substituted to ±inf/NaN: escalate the ridge.
585            ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
586        }
587        assert!(
588            solved,
589            "{context}: Newton step remained non-finite at iter {iter} after {} ridge \
590             escalations up to {ridge:.3e}; the penalized Hessian is pathologically \
591             rank-deficient (grad_norm={:.3e}, max_diag={max_diag:.3e})",
592            MAX_RIDGE_ESCALATIONS,
593            grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt(),
594        );
595
596        // Damped acceptance: full step first, halve up to `MAX_BACKTRACKS` times
597        // if the penalized negative log-likelihood fails to decrease. The first
598        // iteration seeds `last_objective` from the initial β.
599        let proposed_beta = |alpha: f64| -> Array2<f64> {
600            let mut out = beta.clone();
601            for a in 0..m {
602                for i in 0..p {
603                    out[[i, a]] += alpha * delta[a * p + i];
604                }
605            }
606            out
607        };
608        if iter == 0 {
609            last_objective = evaluate_objective(&beta, &mut eta_objective_scratch);
610            if !last_objective.is_finite() {
611                crate::bail_invalid_estim!("{context}: non-finite objective at β = 0");
612            }
613        }
614        let mut alpha = 1.0_f64;
615        let mut accepted_beta = proposed_beta(alpha);
616        let mut new_objective = evaluate_objective(&accepted_beta, &mut eta_objective_scratch);
617        let mut backtrack = 0usize;
618        while (!new_objective.is_finite()
619            || new_objective > last_objective + OBJECTIVE_DECREASE_SLACK)
620            && backtrack < MAX_BACKTRACKS
621        {
622            alpha *= LINE_SEARCH_SHRINK;
623            accepted_beta = proposed_beta(alpha);
624            new_objective = evaluate_objective(&accepted_beta, &mut eta_objective_scratch);
625            backtrack += 1;
626        }
627
628        let mut step_norm_sq = 0.0_f64;
629        let mut beta_norm_sq = 0.0_f64;
630        for a in 0..m {
631            for i in 0..p {
632                let d = accepted_beta[[i, a]] - beta[[i, a]];
633                step_norm_sq += d * d;
634                let v = accepted_beta[[i, a]];
635                beta_norm_sq += v * v;
636            }
637        }
638
639        beta = accepted_beta;
640        last_objective = new_objective;
641
642        let step_norm = step_norm_sq.sqrt();
643        let beta_norm = beta_norm_sq.sqrt();
644        // First-order optimality gate (gam#856): the step-norm test alone can
645        // fire prematurely when a backtracking line search has shrunk α on a
646        // poor direction, leaving a point that is NOT stationary. `grad_flat`
647        // is the unridged penalized gradient ∇F(β) at the pre-step β; with a
648        // small step it is ≈ ∇F at the accepted β. Its norm reflects only
649        // identified directions (it is exactly zero along an unidentified null
650        // direction such as a duplicate-column e₁−e₂ split), so requiring it to
651        // be small certifies first-order optimality on the identified subspace
652        // without penalizing legitimate non-identifiability. Scale the gate by
653        // the data magnitude so it is invariant to problem scale.
654        let grad_norm = grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt();
655        // Curvature-scaled optimality threshold: `max_diag` is the dominant
656        // penalized-Hessian diagonal entry, so `OPTIMALITY_GRAD_FRACTION·max_diag`
657        // is a tiny gradient relative to the problem's curvature scale and is
658        // reached by a few quadratically-converging Newton steps on this smooth,
659        // bounded softmax/binomial likelihood.
660        let grad_optimal = grad_norm <= OPTIMALITY_GRAD_FRACTION * (1.0 + max_diag);
661        if step_norm <= tol * (1.0 + beta_norm) && grad_optimal {
662            converged = true;
663            break;
664        }
665    }
666
667    // ──────────────────────────── post-process ────────────────────────────
668    recompute_eta(&beta, &mut eta);
669    let log_likelihood = likelihood.log_lik(eta.view(), y);
670    let penalty_term = weighted_penalty_sum(&beta, penalty, lambdas, class_penalty_metric);
671
672    Ok(PenalizedVectorGlmOutputs {
673        coefficients: beta,
674        eta,
675        iterations,
676        converged,
677        log_likelihood,
678        penalty_term,
679    })
680}
681
682#[cfg(test)]
683mod parity_tests {
684    //! Parity tests for the shared scaffold across both Fisher-block families
685    //! (issue #409). The engine is exercised through the two public adapters —
686    //! [`crate::binomial_multi::fit_penalized_binomial_multi`]
687    //! (row-diagonal block) and
688    //! [`crate::multinomial::fit_penalized_multinomial`] (dense
689    //! softmax block) — and we assert, with un-weakened bounds, that:
690    //!
691    //!   1. each fit hits the first-order optimality condition `∇F(β̂) = 0`,
692    //!      verified by a central finite difference of the penalized objective
693    //!      (the engine never sees this gradient, so this is an independent
694    //!      check that the shared Newton scaffold converged correctly);
695    //!   2. the reported fitted probabilities are consistent with `β̂` and the
696    //!      reported deviance equals `−2 · log L(β̂)`;
697    //!   3. for the binomial family, the `K`-column joint solve reproduces a
698    //!      from-scratch single-column penalized logistic Newton solve column
699    //!      for column (the row-diagonal block must decouple exactly).
700
701    use super::{ClassPenaltyMetric, weighted_penalty_sum};
702    use crate::binomial_multi::{BinomialMultiFitInputs, fit_penalized_binomial_multi};
703    use crate::multinomial::{MultinomialFitInputs, fit_penalized_multinomial};
704    use ndarray::{Array1, Array2};
705
706    /// #1587: the `Centered` class-penalty metric is invariant to the arbitrary
707    /// reference-class choice. Penalizing the `K−1` ALR contrasts under ANY of
708    /// the `K` baselines yields the same value (the symmetric CLR penalty
709    /// `Σ_k β̃_kᵀSβ̃_k`), whereas the historical `Diagonal` metric does not — that
710    /// non-invariance is exactly the #1587 defect. Pure-algebra check on the
711    /// penalty form (no fit), so it pins the engine foundation the production
712    /// wiring (REML per-term λ re-key) will build on.
713    #[test]
714    fn centered_penalty_is_reference_class_invariant_1587() {
715        // K = 3 classes, p = 2 coefficients; symmetric PSD penalty S.
716        let s = ndarray::array![[2.0_f64, 0.5], [0.5, 1.0]];
717        // A CLR (sum-to-zero) coefficient set: β̃_0 + β̃_1 + β̃_2 = 0.
718        let bt = [[1.0_f64, 0.5], [-0.3, 0.2], [-0.7, -0.7]];
719        for j in 0..2 {
720            let colsum: f64 = (0..3).map(|k| bt[k][j]).sum();
721            assert!(colsum.abs() < 1e-12, "test CLR set must sum to zero");
722        }
723        // Direct symmetric penalty Σ_k β̃_kᵀ S β̃_k.
724        let mut symmetric = 0.0_f64;
725        for k in 0..3 {
726            for i in 0..2 {
727                for j in 0..2 {
728                    symmetric += bt[k][i] * s[[i, j]] * bt[k][j];
729                }
730            }
731        }
732        let lambdas = Array1::from(vec![1.0_f64, 1.0]);
733        let mut centered_vals = Vec::new();
734        let mut diagonal_vals = Vec::new();
735        // For each reference class r, the two ALR contrasts are β̃_a − β̃_r (a≠r).
736        for r in 0..3 {
737            let others: Vec<usize> = (0..3).filter(|&k| k != r).collect();
738            let mut beta = Array2::<f64>::zeros((2, 2));
739            for (a, &o) in others.iter().enumerate() {
740                for i in 0..2 {
741                    beta[[i, a]] = bt[o][i] - bt[r][i];
742                }
743            }
744            let c =
745                weighted_penalty_sum(&beta, s.view(), lambdas.view(), ClassPenaltyMetric::Centered);
746            let d =
747                weighted_penalty_sum(&beta, s.view(), lambdas.view(), ClassPenaltyMetric::Diagonal);
748            assert!(
749                (c - 0.5 * symmetric).abs() < 1e-12,
750                "ref {r}: Centered penalty {c} must equal ½·symmetric {}",
751                0.5 * symmetric
752            );
753            centered_vals.push(c);
754            diagonal_vals.push(d);
755        }
756        let cspread = centered_vals.iter().cloned().fold(f64::MIN, f64::max)
757            - centered_vals.iter().cloned().fold(f64::MAX, f64::min);
758        assert!(
759            cspread < 1e-12,
760            "Centered must be reference-invariant; got {centered_vals:?}"
761        );
762        let dspread = diagonal_vals.iter().cloned().fold(f64::MIN, f64::max)
763            - diagonal_vals.iter().cloned().fold(f64::MAX, f64::min);
764        assert!(
765            dspread > 1e-6,
766            "Diagonal is the non-invariant #1587 path; references must disagree, got {diagonal_vals:?}"
767        );
768    }
769
770    fn sigmoid(eta: f64) -> f64 {
771        if eta >= 0.0 {
772            1.0 / (1.0 + (-eta).exp())
773        } else {
774            let e = eta.exp();
775            e / (1.0 + e)
776        }
777    }
778
779    /// Softmax with implicit reference column (η_ref = 0) over `M` active η.
780    fn softmax_ref(eta_active: &[f64]) -> Vec<f64> {
781        let m = eta_active.len();
782        let mut out = vec![0.0_f64; m + 1];
783        let mut max_eta = 0.0_f64;
784        for &v in eta_active {
785            if v > max_eta {
786                max_eta = v;
787            }
788        }
789        let baseline = (-max_eta).exp();
790        let mut denom = baseline;
791        for (idx, &v) in eta_active.iter().enumerate() {
792            let e = (v - max_eta).exp();
793            out[idx] = e;
794            denom += e;
795        }
796        for v in out.iter_mut().take(m) {
797            *v /= denom;
798        }
799        out[m] = baseline / denom;
800        out
801    }
802
803    /// Penalized negative log-likelihood for the independent-binomial family at
804    /// a candidate coefficient matrix `β ∈ ℝ^{P×K}`, computed directly from the
805    /// definition (no engine internals).
806    fn binomial_objective(
807        design: &Array2<f64>,
808        y: &Array2<f64>,
809        penalty: &Array2<f64>,
810        lambdas: &Array1<f64>,
811        beta: &Array2<f64>,
812    ) -> f64 {
813        let (n, p) = design.dim();
814        let k = y.ncols();
815        let mut ll = 0.0_f64;
816        for row in 0..n {
817            for a in 0..k {
818                let mut eta = 0.0_f64;
819                for i in 0..p {
820                    eta += design[[row, i]] * beta[[i, a]];
821                }
822                let mu = sigmoid(eta).clamp(1.0e-12, 1.0 - 1.0e-12);
823                let yv = y[[row, a]];
824                ll += yv * mu.ln() + (1.0 - yv) * (1.0 - mu).ln();
825            }
826        }
827        let mut pen = 0.0_f64;
828        for a in 0..k {
829            let la = lambdas[a];
830            for i in 0..p {
831                let mut sbi = 0.0_f64;
832                for j in 0..p {
833                    sbi += penalty[[i, j]] * beta[[j, a]];
834                }
835                pen += 0.5 * la * beta[[i, a]] * sbi;
836            }
837        }
838        -ll + pen
839    }
840
841    /// Penalized negative log-likelihood for the multinomial family at a
842    /// candidate active-class coefficient matrix `β ∈ ℝ^{P×(K-1)}`.
843    fn multinomial_objective(
844        design: &Array2<f64>,
845        y_one_hot: &Array2<f64>,
846        penalty: &Array2<f64>,
847        lambdas: &Array1<f64>,
848        beta: &Array2<f64>,
849    ) -> f64 {
850        let (n, p) = design.dim();
851        let k = y_one_hot.ncols();
852        let m = k - 1;
853        let mut ll = 0.0_f64;
854        let mut eta_active = vec![0.0_f64; m];
855        for row in 0..n {
856            for a in 0..m {
857                let mut eta = 0.0_f64;
858                for i in 0..p {
859                    eta += design[[row, i]] * beta[[i, a]];
860                }
861                eta_active[a] = eta;
862            }
863            let probs = softmax_ref(&eta_active);
864            for c in 0..k {
865                let yc = y_one_hot[[row, c]];
866                if yc != 0.0 {
867                    ll += yc * probs[c].max(1.0e-300).ln();
868                }
869            }
870        }
871        let mut pen = 0.0_f64;
872        for a in 0..m {
873            let la = lambdas[a];
874            for i in 0..p {
875                let mut sbi = 0.0_f64;
876                for j in 0..p {
877                    sbi += penalty[[i, j]] * beta[[j, a]];
878                }
879                pen += 0.5 * la * beta[[i, a]] * sbi;
880            }
881        }
882        -ll + pen
883    }
884
885    /// Central finite-difference gradient of an objective over every entry of a
886    /// `(P, C)` coefficient matrix. The optimum must drive every component to
887    /// ~0; we assert the max |component| against an un-weakened bound.
888    fn fd_grad<F: Fn(&Array2<f64>) -> f64>(beta: &Array2<f64>, f: F) -> f64 {
889        let (p, c) = beta.dim();
890        let h = 1.0e-6;
891        let mut max_abs = 0.0_f64;
892        for i in 0..p {
893            for a in 0..c {
894                let mut up = beta.clone();
895                let mut dn = beta.clone();
896                up[[i, a]] += h;
897                dn[[i, a]] -= h;
898                let g = (f(&up) - f(&dn)) / (2.0 * h);
899                max_abs = max_abs.max(g.abs());
900            }
901        }
902        max_abs
903    }
904
905    fn binomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
906        let n = 40;
907        let p = 3;
908        let k = 3;
909        let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
910            0 => 1.0,
911            1 => ((i + 1) as f64 * 0.37).sin(),
912            _ => ((i + 1) as f64 * 0.11).cos(),
913        });
914        let y = Array2::<f64>::from_shape_fn((n, k), |(i, a)| {
915            // Deterministic but non-degenerate {0,1} labels per column.
916            if ((i * 7 + a * 13 + 3) % 5) < 3 {
917                1.0
918            } else {
919                0.0
920            }
921        });
922        let penalty = Array2::<f64>::eye(p);
923        let lambdas = Array1::from(vec![0.3_f64, 1.2, 2.5]);
924        (design, y, penalty, lambdas)
925    }
926
927    fn multinomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
928        let n = 45;
929        let p = 3;
930        let k = 4;
931        let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
932            0 => 1.0,
933            1 => ((i + 2) as f64 * 0.29).sin(),
934            _ => ((i + 2) as f64 * 0.17).cos(),
935        });
936        let mut y = Array2::<f64>::zeros((n, k));
937        for i in 0..n {
938            y[[i, (i * 3 + 1) % k]] = 1.0;
939        }
940        let penalty = Array2::<f64>::eye(p);
941        let lambdas = Array1::from(vec![0.5_f64, 1.0, 2.0]);
942        (design, y, penalty, lambdas)
943    }
944
945    #[test]
946    fn binomial_engine_hits_optimum_and_is_self_consistent() {
947        let (design, y, penalty, lambdas) = binomial_fixture();
948        let fit = fit_penalized_binomial_multi(BinomialMultiFitInputs {
949            design: design.view(),
950            y: y.view(),
951            penalty: penalty.view(),
952            lambdas: lambdas.view(),
953            row_weights: None,
954            fisher_w_override: None,
955            max_iter: 100,
956            tol: 1.0e-12,
957        })
958        .expect("binomial fit must succeed");
959        assert!(fit.converged, "binomial fit must converge");
960
961        // First-order optimality: ∇F(β̂) = 0 (engine never used this gradient).
962        let g = fd_grad(&fit.coefficients, |b| {
963            binomial_objective(&design, &y, &penalty, &lambdas, b)
964        });
965        assert!(
966            g < 1.0e-6,
967            "binomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
968        );
969
970        // Fitted probabilities reproduce σ(X β̂) and deviance = −2 log L.
971        let (n, p) = design.dim();
972        let k = y.ncols();
973        let mut log_lik = 0.0_f64;
974        for row in 0..n {
975            for a in 0..k {
976                let mut eta = 0.0_f64;
977                for i in 0..p {
978                    eta += design[[row, i]] * fit.coefficients[[i, a]];
979                }
980                let mu = sigmoid(eta);
981                assert!(
982                    (fit.fitted_probabilities[[row, a]] - mu).abs() < 1.0e-10,
983                    "fitted probability must equal σ(X β̂)"
984                );
985                let muc = mu.clamp(1.0e-12, 1.0 - 1.0e-12);
986                let yv = y[[row, a]];
987                log_lik += yv * muc.ln() + (1.0 - yv) * (1.0 - muc).ln();
988            }
989        }
990        assert!(
991            (fit.deviance - (-2.0 * log_lik)).abs() < 1.0e-9,
992            "deviance must equal −2 log L"
993        );
994    }
995
996    #[test]
997    fn binomial_joint_solve_decouples_into_single_column_solves() {
998        // Parity: the row-diagonal Fisher block means the K-column joint solve
999        // must reproduce, column for column, an independent single-column
1000        // penalized logistic Newton solve. This is the defining property the
1001        // shared engine preserves for the independent-binomial family.
1002        let (design, y, penalty, lambdas) = binomial_fixture();
1003        let joint = fit_penalized_binomial_multi(BinomialMultiFitInputs {
1004            design: design.view(),
1005            y: y.view(),
1006            penalty: penalty.view(),
1007            lambdas: lambdas.view(),
1008            row_weights: None,
1009            fisher_w_override: None,
1010            max_iter: 100,
1011            tol: 1.0e-12,
1012        })
1013        .expect("joint fit must succeed");
1014
1015        let k = y.ncols();
1016        for a in 0..k {
1017            // Single-column problem: one binomial response, one λ.
1018            let y_col = y.column(a).to_owned().insert_axis(ndarray::Axis(1));
1019            let lam = Array1::from(vec![lambdas[a]]);
1020            let single = fit_penalized_binomial_multi(BinomialMultiFitInputs {
1021                design: design.view(),
1022                y: y_col.view(),
1023                penalty: penalty.view(),
1024                lambdas: lam.view(),
1025                row_weights: None,
1026                fisher_w_override: None,
1027                max_iter: 100,
1028                tol: 1.0e-12,
1029            })
1030            .expect("single-column fit must succeed");
1031            for i in 0..design.ncols() {
1032                let dj = joint.coefficients[[i, a]];
1033                let ds = single.coefficients[[i, 0]];
1034                assert!(
1035                    (dj - ds).abs() < 1.0e-8,
1036                    "joint column {a} coef {i} ({dj}) must match single-column solve ({ds})"
1037                );
1038            }
1039        }
1040    }
1041
1042    #[test]
1043    fn multinomial_engine_hits_optimum_and_is_self_consistent() {
1044        let (design, y, penalty, lambdas) = multinomial_fixture();
1045        let fit = fit_penalized_multinomial(MultinomialFitInputs {
1046            design: design.view(),
1047            y_one_hot: y.view(),
1048            penalty: penalty.view(),
1049            lambdas: lambdas.view(),
1050            row_weights: None,
1051            fisher_w_override: None,
1052            max_iter: 100,
1053            tol: 1.0e-12,
1054        })
1055        .expect("multinomial fit must succeed");
1056        assert!(fit.converged, "multinomial fit must converge");
1057
1058        // First-order optimality: ∇F(β̂) = 0.
1059        let g = fd_grad(&fit.coefficients_active, |b| {
1060            multinomial_objective(&design, &y, &penalty, &lambdas, b)
1061        });
1062        assert!(
1063            g < 1.0e-6,
1064            "multinomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
1065        );
1066
1067        // Fitted probabilities are a valid simplex per row and reproduce the
1068        // softmax of X β̂; deviance = −2 log L.
1069        let (n, p) = design.dim();
1070        let k = y.ncols();
1071        let m = k - 1;
1072        let mut log_lik = 0.0_f64;
1073        let mut eta_active = vec![0.0_f64; m];
1074        for row in 0..n {
1075            for a in 0..m {
1076                let mut eta = 0.0_f64;
1077                for i in 0..p {
1078                    eta += design[[row, i]] * fit.coefficients_active[[i, a]];
1079                }
1080                eta_active[a] = eta;
1081            }
1082            let probs = softmax_ref(&eta_active);
1083            let mut row_sum = 0.0_f64;
1084            for c in 0..k {
1085                assert!(
1086                    (fit.fitted_probabilities[[row, c]] - probs[c]).abs() < 1.0e-10,
1087                    "fitted probability must equal softmax(X β̂)"
1088                );
1089                row_sum += fit.fitted_probabilities[[row, c]];
1090                let yc = y[[row, c]];
1091                if yc != 0.0 {
1092                    log_lik += yc * probs[c].max(1.0e-300).ln();
1093                }
1094            }
1095            assert!(
1096                (row_sum - 1.0).abs() < 1.0e-10,
1097                "fitted probabilities must sum to 1 per row"
1098            );
1099        }
1100        assert!(
1101            (fit.deviance - (-2.0 * log_lik)).abs() < 1.0e-9,
1102            "deviance must equal −2 log L"
1103        );
1104    }
1105
1106    #[test]
1107    fn multinomial_rank_deficient_block_recovers_via_ridge_not_crash() {
1108        // Issue #557: a rank-deficient class block under a tiny per-class λ used
1109        // to make faer's Bunch–Kaufman fallback back-substitute through near-zero
1110        // pivots into a non-finite Newton step δ, and the solver aborted with
1111        // "Newton step is non-finite". The adaptive Levenberg–Marquardt ridge
1112        // must instead lift the null direction off zero, keep δ finite, and let
1113        // the backtracking line search converge to the penalized optimum.
1114        //
1115        // Construct an exactly rank-deficient design: column 2 is a perfect
1116        // duplicate of column 1, so XᵀWX is singular along (e₁ − e₂) for every
1117        // class, and we drive the corresponding λ to a tiny value so the penalty
1118        // cannot regularize that null direction. A non-robust solver crashes
1119        // here; the ridge path must produce a finite, self-consistent fit.
1120        let n = 50;
1121        let p = 4;
1122        let k = 4;
1123        let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
1124            0 => 1.0,
1125            1 => ((i + 1) as f64 * 0.23).sin(),
1126            2 => ((i + 1) as f64 * 0.23).sin(), // exact duplicate of column 1
1127            _ => ((i + 1) as f64 * 0.19).cos(),
1128        });
1129        let mut y = Array2::<f64>::zeros((n, k));
1130        for i in 0..n {
1131            y[[i, (i * 5 + 2) % k]] = 1.0;
1132        }
1133        // Penalty touches only the smooth-ish columns 1..p; columns 0/1/2 share
1134        // the collinearity, and a near-zero λ leaves the (e₁ − e₂) null direction
1135        // unregularized — exactly the rank-deficient regime that triggered #557.
1136        let mut penalty = Array2::<f64>::zeros((p, p));
1137        penalty[[3, 3]] = 1.0;
1138        let lambdas = Array1::from(vec![1.0e-10_f64, 1.0e-10, 1.0e-10]);
1139
1140        let fit = fit_penalized_multinomial(MultinomialFitInputs {
1141            design: design.view(),
1142            y_one_hot: y.view(),
1143            penalty: penalty.view(),
1144            lambdas: lambdas.view(),
1145            row_weights: None,
1146            fisher_w_override: None,
1147            max_iter: 200,
1148            tol: 1.0e-10,
1149        })
1150        .expect("rank-deficient multinomial fit must NOT crash (#557): the ridge path recovers it");
1151
1152        // Every coefficient and fitted probability must be finite (no inf/NaN
1153        // leaked from the near-singular solve).
1154        for &c in fit.coefficients_active.iter() {
1155            assert!(c.is_finite(), "coefficient must be finite, got {c}");
1156        }
1157        for &pr in fit.fitted_probabilities.iter() {
1158            assert!(
1159                pr.is_finite() && (-1.0e-9..=1.0 + 1.0e-9).contains(&pr),
1160                "fitted probability must be a finite simplex entry, got {pr}"
1161            );
1162        }
1163        // Rows must remain on the simplex.
1164        let (nn, kk) = fit.fitted_probabilities.dim();
1165        for row in 0..nn {
1166            let s: f64 = (0..kk).map(|c| fit.fitted_probabilities[[row, c]]).sum();
1167            assert!(
1168                (s - 1.0).abs() < 1.0e-9,
1169                "row {row} probabilities must sum to 1, got {s}"
1170            );
1171        }
1172
1173        // The recovered fit must satisfy first-order optimality of the penalized
1174        // objective along every NON-NULL coordinate. The (e₁ − e₂) null
1175        // direction is unidentified (the ridge picks the minimum-norm split
1176        // between the duplicate columns), so the gradient is exactly zero along
1177        // every identified direction; a central finite difference of F over the
1178        // full coefficient matrix is dominated by the identified part and must be
1179        // small. We assert the penalized objective gradient is near-zero — the
1180        // ridge biases the step but never the optimum (at β̂ the unridged
1181        // gradient vanishes for any τ).
1182        let g = fd_grad(&fit.coefficients_active, |b| {
1183            multinomial_objective(&design, &y, &penalty, &lambdas, b)
1184        });
1185        assert!(
1186            g < 1.0e-4,
1187            "penalized objective gradient at the ridge-recovered β̂ must (near-)vanish \
1188             along identified directions (max |∂F| = {g})"
1189        );
1190    }
1191}