gam_models/penalized_vector_glm.rs
1//! Generic penalized vector-response GLM Newton solver (fixed λ).
2//!
3//! This is the shared scaffold extracted from
4//! [`crate::multinomial::fit_penalized_multinomial`] (dense softmax
5//! Fisher block) and
6//! [`crate::binomial_multi::fit_penalized_binomial_multi`]
7//! (row-diagonal independent-binomial Fisher block). Both families fit a
8//! penalized vector-response GLM with a shared design `X ∈ ℝ^{N×P}` and a
9//! shared penalty `S ∈ ℝ^{P×P}` replicated per output, differing **only** in
10//! the per-row Fisher-block algebra and the likelihood/residual. Everything
11//! else — input validation, penalized objective / gradient / Hessian assembly,
12//! damped Newton with backtracking, the relative-step convergence test, and the
13//! final penalized-objective / deviance tally — is written once here.
14//!
15//! # Fit problem
16//!
17//! With `β = [β_0; β_1; …; β_{M-1}]` stacked in output-major order
18//! (`β_a ∈ ℝ^P` is the coefficient block for output `a`), minimise the
19//! penalized negative log-likelihood
20//!
21//! ```text
22//! F(β) = − log L(β) + ½ Σ_{a=0}^{M-1} λ_a · β_aᵀ S β_a
23//! ```
24//!
25//! where `log L` and its η-derivatives are supplied by the family's
26//! [`VectorLikelihood`] adapter and `λ_a` is a per-output smoothing parameter
27//! scaling the shared penalty `S`. The active linear predictor is
28//! `η_{n,a} = (X β_a)_n`, shape `(N, M)`.
29//!
30//! # Newton step
31//!
32//! Each iteration assembles the coupled penalized Hessian and gradient in
33//! output-major coefficient ordering `flat[a·P + i] = β[i, a]` (matching
34//! [`gam_solve::pirls::dense_block_xtwx`]):
35//!
36//! ```text
37//! H[a·P + i, b·P + j] = Σ_n W_{n,a,b} · X[n,i] · X[n,j] (+ δ_{ab} λ_a S[i,j])
38//! g[a·P + i] = Σ_n r_{n,a} · X[n,i] (+ λ_a (S β_a)[i])
39//! ```
40//!
41//! with the per-row Fisher block `W_{n,·,·} = −∂² log L / ∂η ∂η` (the family's
42//! [`VectorLikelihood::hess_block`], or a caller override) and the residual
43//! `r_{n,a} = −∂ log L / ∂η_a` (`−`[`VectorLikelihood::grad_eta`]). The step
44//! `δ = − H^{-1} g` is solved through faer's symmetric-PD-with-fallback
45//! factorisation under an adaptive Levenberg–Marquardt ridge: when a
46//! rank-deficient block (collinear / quasi-separated columns under a small
47//! per-output λ) makes the Bunch–Kaufman fallback back-substitute through
48//! near-zero pivots into a non-finite δ, a diagonal ridge `τ·I` — scaled by the
49//! Hessian's largest diagonal so it is curvature-scale invariant — is added and
50//! the system re-solved, escalating τ geometrically until δ is finite. The
51//! step is then accepted by a backtracking line search on `F` (full step first,
52//! halve up to 8 times). Because the line search validates against the
53//! *unridged* objective `F`, the ridge never biases the converged β̂ (at the
54//! optimum the gradient vanishes and δ → 0 for any τ). Convergence is the
55//! relative coefficient step `‖δ‖ / (1 + ‖β‖) ≤ tol`.
56//!
57//! # Fisher-block override
58//!
59//! When `fisher_w_override` is `Some`, each Newton step uses the supplied
60//! per-row `(N, M, M)` curvature block in place of the analytic
61//! [`VectorLikelihood::hess_block`]; the gradient/residual path stays analytic
62//! (issue #349). The two families differ in what they accept off the diagonal:
63//! multinomial admits a full dense block, while independent-binomial columns
64//! only consume the per-output diagonal (a non-zero cross term cannot be
65//! represented by the separable columns). That family-specific precondition is
66//! enforced by the adapter before it constructs the override view; the engine
67//! consumes whatever block it is given.
68
69use gam_linalg::faer_ndarray::{FaerArrayView, array2_to_matmut, factorize_symmetricwith_fallback};
70use crate::vector_response::VectorLikelihood;
71use crate::model_types::EstimationError;
72use gam_solve::pirls::dense_block_xtwx;
73use faer::Side;
74use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayView3};
75
76/// Base Levenberg–Marquardt ridge as a fraction of the penalized Hessian's
77/// largest diagonal entry (so it is invariant to the problem's overall
78/// curvature scale). At ~1e-10 of the dominant curvature it is negligible
79/// relative to identified-direction curvature — it never biases the identified
80/// optimum (at β̂ the unridged gradient still vanishes there) — yet large
81/// enough to lift an exactly rank-deficient null direction off zero so the
82/// Bunch–Kaufman fallback yields a finite, descent Newton step (gam#856).
83const BASE_RIDGE_FRACTION_OF_MAX_DIAG: f64 = 1.0e-10;
84
85/// Geometric ridge-escalation budget for a single Newton step. 30 doublings
86/// span ~9 orders of magnitude over the base ridge, which covers any
87/// conditioning a finite-curvature softmax/binomial block can present.
88const MAX_RIDGE_ESCALATIONS: usize = 30;
89
90/// Backtracking budget for the damped-Newton line search: full step first, then
91/// halve up to this many times if the penalized objective fails to decrease.
92const MAX_BACKTRACKS: usize = 8;
93
94/// Per-step line-search contraction factor (halving).
95const LINE_SEARCH_SHRINK: f64 = 0.5;
96
97/// Slack on the "objective decreased" acceptance test, absorbing floating-point
98/// round-off so a step that is flat to machine precision is not rejected.
99const OBJECTIVE_DECREASE_SLACK: f64 = 1.0e-12;
100
101/// First-order optimality gate (gam#856) as a fraction of `1 + max_diag`: the
102/// unridged penalized gradient norm must fall below this curvature-scaled
103/// threshold before convergence is declared, certifying stationarity on the
104/// identified subspace rather than a premature step-norm stall.
105const OPTIMALITY_GRAD_FRACTION: f64 = 1.0e-6;
106
107/// Class-space metric of the replicated smoothing penalty (#1587).
108///
109/// * `Diagonal` — the historical `diag_a(λ_a) ⊗ S`: each active output's
110/// coefficient block is penalised independently. Correct for genuinely
111/// independent outputs (independent-binomial columns), but for a *softmax*
112/// multinomial it penalises the reference-anchored log-odds contrasts
113/// `η_a = log(p_a/p_ref)`, so the fit is NOT invariant to the arbitrary
114/// reference-class choice (#1587).
115/// * `Centered` — the reference-symmetric `λ · ((I_{M} − J_{M}/K) ⊗ S)` with a
116/// single shared `λ` (= `lambdas[0]`; the caller must pass uniform `lambdas`)
117/// and `K = M + 1`. This is exactly the symmetric CLR penalty
118/// `Σ_{k=0}^{K-1} β̃_kᵀ S β̃_k` (with `Σ_k β̃_k = 0`) written in the active-class
119/// (ALR) gauge — invariant to which class is the baseline (the multinomial
120/// analogue of #1549's `G^{1/2}` Aitchison whitening). Couples the class
121/// blocks via the `−(λ/K)·S` off-diagonals; the engine already factors a
122/// class-coupled Hessian (the softmax Fisher block is dense), so this is a
123/// penalty-assembly change only.
124#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
125pub enum ClassPenaltyMetric {
126 /// Independent per-output penalty `diag_a(λ_a) ⊗ S` (historical default).
127 #[default]
128 Diagonal,
129 /// Reference-symmetric centered penalty `λ·((I − J/K) ⊗ S)`, `K = M + 1`.
130 Centered,
131}
132
133/// Inputs to [`fit_penalized_vector_glm`].
134///
135/// `M` (the number of active outputs / linear-predictor columns) is taken from
136/// `lambdas.len()`; the engine validates it against the design and override
137/// shapes. The response `y` is passed verbatim to the [`VectorLikelihood`]
138/// adapter, which owns its own `(N, ·)` shape contract (binomial columns use
139/// `K = M`; multinomial one-hot uses `K = M + 1`), so the engine does not
140/// constrain its column count beyond `y.nrows() == N`.
141pub struct PenalizedVectorGlmInputs<'a> {
142 /// Design matrix `X ∈ ℝ^{N×P}` (one row per observation, shared across
143 /// every output column).
144 pub design: ArrayView2<'a, f64>,
145 /// Response `Y ∈ ℝ^{N×·}`, interpreted by the [`VectorLikelihood`].
146 pub y: ArrayView2<'a, f64>,
147 /// Shared smoothing penalty `S ∈ ℝ^{P×P}` (symmetric, PSD).
148 pub penalty: ArrayView2<'a, f64>,
149 /// Per-output smoothing parameter `λ_a`, length `M`.
150 pub lambdas: ArrayView1<'a, f64>,
151 /// Optional per-row Fisher-block override, shape `(N, M, M)`. When `Some`,
152 /// it replaces the analytic [`VectorLikelihood::hess_block`] as the Newton
153 /// curvature; the gradient/residual path stays analytic (issue #349). The
154 /// adapter is responsible for any family-specific structural precondition
155 /// on the block (e.g. zero off-diagonals for independent columns).
156 pub fisher_w_override: Option<ArrayView3<'a, f64>>,
157 /// Maximum Newton iterations; recommend 50.
158 pub max_iter: usize,
159 /// Relative-step convergence tolerance; recommend 1e-7.
160 pub tol: f64,
161 /// Class-space metric of the replicated penalty (#1587). `Diagonal`
162 /// preserves the historical independent-per-output penalty; `Centered`
163 /// selects the reference-symmetric softmax penalty (requires uniform
164 /// `lambdas`). See [`ClassPenaltyMetric`].
165 pub class_penalty_metric: ClassPenaltyMetric,
166}
167
168/// Outputs of [`fit_penalized_vector_glm`].
169pub struct PenalizedVectorGlmOutputs {
170 /// Coefficient matrix, shape `(P, M)` (column `a` is `β_a`).
171 pub coefficients: Array2<f64>,
172 /// Final active linear predictor `η = X β̂`, shape `(N, M)`. The adapter
173 /// turns this into fitted probabilities via its own inverse link.
174 pub eta: Array2<f64>,
175 /// Number of Newton iterations executed (including the final step that
176 /// satisfied the tolerance).
177 pub iterations: usize,
178 /// `true` if the relative-step test was satisfied before `max_iter`.
179 pub converged: bool,
180 /// Unpenalized log-likelihood `log L(β̂)`.
181 pub log_likelihood: f64,
182 /// Penalty term `½ Σ_a λ_a · β̂_aᵀ S β̂_a` at the returned `β̂`.
183 pub penalty_term: f64,
184}
185
186/// Quadratic form `½ β_aᵀ S β_a` accumulated across outputs with per-output
187/// weight `λ_a`. Shared by the objective evaluator and the final tally.
188fn weighted_penalty_sum(
189 beta: &Array2<f64>,
190 penalty: ArrayView2<'_, f64>,
191 lambdas: ArrayView1<'_, f64>,
192 metric: ClassPenaltyMetric,
193) -> f64 {
194 let (p, m) = beta.dim();
195 match metric {
196 ClassPenaltyMetric::Diagonal => {
197 let mut pen = 0.0_f64;
198 for a in 0..m {
199 let la = lambdas[a];
200 if la == 0.0 {
201 continue;
202 }
203 let beta_col = beta.column(a);
204 let mut quad = 0.0_f64;
205 for i in 0..p {
206 let mut s_beta_i = 0.0_f64;
207 for j in 0..p {
208 s_beta_i += penalty[[i, j]] * beta_col[j];
209 }
210 quad += beta_col[i] * s_beta_i;
211 }
212 pen += 0.5 * la * quad;
213 }
214 pen
215 }
216 // Centered (#1587): ½·λ·[ Σ_a β_aᵀSβ_a − (1/K)·gᵀSg ], g = Σ_a β_a,
217 // K = M + 1. Equals the symmetric CLR penalty Σ_k β̃_kᵀSβ̃_k (Σβ̃=0) in
218 // the active-class gauge — reference-invariant. Shared λ = lambdas[0].
219 ClassPenaltyMetric::Centered => {
220 if m == 0 {
221 return 0.0;
222 }
223 let lam = lambdas[0];
224 if lam == 0.0 {
225 return 0.0;
226 }
227 let k = (m + 1) as f64;
228 // g = Σ_a β_a (the active-class coefficient sum, a p-vector).
229 let mut g = vec![0.0_f64; p];
230 for a in 0..m {
231 let col = beta.column(a);
232 for i in 0..p {
233 g[i] += col[i];
234 }
235 }
236 // Σ_a β_aᵀSβ_a.
237 let mut sum_quad = 0.0_f64;
238 for a in 0..m {
239 let col = beta.column(a);
240 for i in 0..p {
241 let mut s_beta_i = 0.0_f64;
242 for j in 0..p {
243 s_beta_i += penalty[[i, j]] * col[j];
244 }
245 sum_quad += col[i] * s_beta_i;
246 }
247 }
248 // gᵀSg.
249 let mut g_quad = 0.0_f64;
250 for i in 0..p {
251 let mut s_g_i = 0.0_f64;
252 for j in 0..p {
253 s_g_i += penalty[[i, j]] * g[j];
254 }
255 g_quad += g[i] * s_g_i;
256 }
257 0.5 * lam * (sum_quad - g_quad / k)
258 }
259 }
260}
261
262/// Fit a penalized vector-response GLM at fixed `λ` via damped Newton.
263///
264/// The `likelihood` adapter supplies the per-row Fisher block, the residual
265/// gradient, and the log-likelihood; the engine owns the entire optimisation
266/// scaffold. See the module docs for the optimisation problem, the
267/// output-major coefficient ordering, and the convergence semantics.
268///
269/// `context` is woven into every diagnostic message so each family keeps its
270/// own error prefix (e.g. `"fit_penalized_multinomial"`).
271pub fn fit_penalized_vector_glm<L: VectorLikelihood>(
272 inputs: PenalizedVectorGlmInputs<'_>,
273 likelihood: &L,
274 context: &str,
275) -> Result<PenalizedVectorGlmOutputs, EstimationError> {
276 let PenalizedVectorGlmInputs {
277 design,
278 y,
279 penalty,
280 lambdas,
281 fisher_w_override,
282 max_iter,
283 tol,
284 class_penalty_metric,
285 } = inputs;
286
287 // ────────────────────────────── shape checks ──────────────────────────
288 let n_obs = design.nrows();
289 let p = design.ncols();
290 if n_obs == 0 || p == 0 {
291 crate::bail_invalid_estim!("{context}: design must be nonempty (got {n_obs}x{p})");
292 }
293 let m = lambdas.len();
294 if m == 0 {
295 crate::bail_invalid_estim!("{context}: need at least one active output (got M=0)");
296 }
297 if y.nrows() != n_obs {
298 crate::bail_invalid_estim!("{context}: y rows {} ≠ design rows {n_obs}", y.nrows());
299 }
300 if penalty.dim() != (p, p) {
301 crate::bail_invalid_estim!(
302 "{context}: penalty shape {:?} ≠ (P, P) = ({p}, {p})",
303 penalty.dim()
304 );
305 }
306 for (i, &v) in lambdas.iter().enumerate() {
307 if !(v.is_finite() && v >= 0.0) {
308 crate::bail_invalid_estim!("{context}: lambdas[{i}] must be finite and ≥ 0 (got {v})");
309 }
310 }
311 if let Some(fw) = fisher_w_override.as_ref() {
312 if fw.dim() != (n_obs, m, m) {
313 crate::bail_invalid_estim!(
314 "{context}: fisher_w_override shape {:?} ≠ (N, M, M) = ({n_obs}, {m}, {m})",
315 fw.dim()
316 );
317 }
318 }
319 for ((i, j), &v) in design.indexed_iter() {
320 if !v.is_finite() {
321 crate::bail_invalid_estim!("{context}: design[{i},{j}] must be finite (got {v})");
322 }
323 }
324
325 // ────────────────────────── Newton iteration ──────────────────────────
326 // β stored as (P, M) column-major-per-output; flat index uses output-major
327 // ordering `flat[a · P + i] = β[i, a]` to align with `dense_block_xtwx`.
328 let mut beta = Array2::<f64>::zeros((p, m));
329 let mut eta = Array2::<f64>::zeros((n_obs, m));
330 // Reused η scratch for the line-search objective probes (see
331 // `evaluate_objective`): overwritten in full on every call, so it carries
332 // no state between calls and hoisting it out of the backtracking loop is a
333 // pure heap-allocation removal with no effect on the computed objective.
334 let mut eta_objective_scratch = Array2::<f64>::zeros((n_obs, m));
335 let beta_flat_dim = p * m;
336 // Reused penalized-gradient buffer: each Newton iteration writes every entry
337 // `grad_flat[a·p + i] = Xᵀr` (direct assignment over all a∈0..m, i∈0..p)
338 // before adding the penalty term and before any read, so it carries no state
339 // across iterations and hoisting it out of the Newton loop is a pure
340 // heap-allocation removal with no effect on the computed gradient.
341 let mut grad_flat = Array1::<f64>::zeros(beta_flat_dim);
342
343 let mut iterations = 0usize;
344 let mut converged = false;
345 let mut last_objective = f64::INFINITY;
346
347 // η = X · β for the current β, reused by the analytic Fisher / gradient.
348 let recompute_eta = |beta: &Array2<f64>, eta: &mut Array2<f64>| {
349 for a in 0..m {
350 let beta_col = beta.column(a);
351 for row in 0..n_obs {
352 let mut eta_val = 0.0_f64;
353 for i in 0..p {
354 eta_val += design[[row, i]] * beta_col[i];
355 }
356 eta[[row, a]] = eta_val;
357 }
358 }
359 };
360
361 // Penalized objective F(β) = − log L(X β) + ½ Σ_a λ_a β_aᵀ S β_a.
362 // The caller supplies a reused `(n_obs, m)` scratch for η = X·β so the
363 // backtracking line search (which calls this up to `MAX_BACKTRACKS + 1`
364 // times per Newton iteration) does not heap-allocate a fresh η buffer on
365 // every probe. The scratch is overwritten in full by `recompute_eta` before
366 // it is read, so reusing it is bit-for-bit identical to the prior
367 // allocate-fresh body: `recompute_eta` runs the SAME `Σ_i design·β` loop in
368 // the SAME order this closure used inline.
369 let evaluate_objective = |beta_trial: &Array2<f64>, eta_scratch: &mut Array2<f64>| -> f64 {
370 recompute_eta(beta_trial, eta_scratch);
371 let ll = likelihood.log_lik(eta_scratch.view(), y);
372 let pen = weighted_penalty_sum(beta_trial, penalty, lambdas, class_penalty_metric);
373 -ll + pen
374 };
375
376 for iter in 0..max_iter {
377 iterations = iter + 1;
378
379 recompute_eta(&beta, &mut eta);
380
381 // Per-row dense Fisher block W_{n,a,b} = −∂² log L / ∂η_a ∂η_b: either
382 // the caller-supplied curvature override (issue #349 escape-hatch —
383 // curvature only) or the analytic [`VectorLikelihood::hess_block`]. The
384 // residual r_{n,a} = −∂ log L / ∂η_a stays analytic in both cases.
385 let analytic_fisher = fisher_w_override
386 .as_ref()
387 .map_or_else(|| Some(likelihood.hess_block(eta.view(), y)), |_| None);
388 let fisher_blocks = match fisher_w_override.as_ref() {
389 Some(fw) => *fw,
390 None => analytic_fisher
391 .as_ref()
392 .expect("analytic Fisher computed when no override")
393 .view(),
394 };
395 let residual = likelihood.grad_eta(eta.view(), y).mapv(|v| -v);
396
397 // Penalized Hessian: H = block(XᵀWX) + diag_a(λ_a S).
398 let mut hessian = dense_block_xtwx(design, fisher_blocks, None)?;
399 if hessian.nrows() != beta_flat_dim || hessian.ncols() != beta_flat_dim {
400 crate::bail_invalid_estim!(
401 "{context}: assembled Hessian shape {:?} ≠ ({beta_flat_dim}, {beta_flat_dim})",
402 hessian.dim()
403 );
404 }
405 match class_penalty_metric {
406 ClassPenaltyMetric::Diagonal => {
407 for a in 0..m {
408 let la = lambdas[a];
409 if la == 0.0 {
410 continue;
411 }
412 let base = a * p;
413 for i in 0..p {
414 for j in 0..p {
415 hessian[[base + i, base + j]] += la * penalty[[i, j]];
416 }
417 }
418 }
419 }
420 // Centered (#1587): H_{ab} += λ·(δ_ab − 1/K)·S, K = M+1, shared
421 // λ = lambdas[0] — couples every class pair via the −(λ/K)·S
422 // off-diagonals. Reference-invariant softmax penalty.
423 ClassPenaltyMetric::Centered if m > 0 && lambdas[0] != 0.0 => {
424 let lam = lambdas[0];
425 let inv_k = 1.0 / ((m + 1) as f64);
426 for a in 0..m {
427 for b in 0..m {
428 let coef = lam * (if a == b { 1.0 } else { 0.0 } - inv_k);
429 let (ba, bb) = (a * p, b * p);
430 for i in 0..p {
431 for j in 0..p {
432 hessian[[ba + i, bb + j]] += coef * penalty[[i, j]];
433 }
434 }
435 }
436 }
437 }
438 ClassPenaltyMetric::Centered => {}
439 }
440
441 // Penalized gradient: g_a = Xᵀ r_{·,a} + (penalty gradient). For the
442 // Diagonal metric that is `λ_a S β_a`; for Centered it is the
443 // reference-symmetric `λ S (β_a − β̄)`, β̄ = (1/K) Σ_b β_b (#1587).
444 // Written into the reused `grad_flat` buffer; the loop below assigns
445 // every entry (`=`) before the penalty `+=`, so no re-zeroing is needed.
446 for a in 0..m {
447 for i in 0..p {
448 let mut acc = 0.0_f64;
449 for row in 0..n_obs {
450 acc += design[[row, i]] * residual[[row, a]];
451 }
452 grad_flat[a * p + i] = acc;
453 }
454 }
455 match class_penalty_metric {
456 ClassPenaltyMetric::Diagonal => {
457 for a in 0..m {
458 let la = lambdas[a];
459 if la == 0.0 {
460 continue;
461 }
462 let beta_col = beta.column(a);
463 for i in 0..p {
464 let mut s_beta_i = 0.0_f64;
465 for j in 0..p {
466 s_beta_i += penalty[[i, j]] * beta_col[j];
467 }
468 grad_flat[a * p + i] += la * s_beta_i;
469 }
470 }
471 }
472 ClassPenaltyMetric::Centered if m > 0 && lambdas[0] != 0.0 => {
473 let lam = lambdas[0];
474 let inv_k = 1.0 / ((m + 1) as f64);
475 // β̄ = (1/K) Σ_b β_b.
476 let mut bbar = vec![0.0_f64; p];
477 for b in 0..m {
478 let col = beta.column(b);
479 for i in 0..p {
480 bbar[i] += col[i];
481 }
482 }
483 for v in bbar.iter_mut() {
484 *v *= inv_k;
485 }
486 for a in 0..m {
487 let beta_col = beta.column(a);
488 for i in 0..p {
489 // (S (β_a − β̄))_i.
490 let mut s_centered_i = 0.0_f64;
491 for j in 0..p {
492 s_centered_i += penalty[[i, j]] * (beta_col[j] - bbar[j]);
493 }
494 grad_flat[a * p + i] += lam * s_centered_i;
495 }
496 }
497 }
498 ClassPenaltyMetric::Centered => {}
499 }
500
501 // δ = − H^{-1} · grad, solved through an adaptive Levenberg–Marquardt
502 // ridge. The penalized Hessian `H = block(XᵀWX) + diag_a(λ_a S)` can be
503 // rank-deficient — a multinomial class block with quasi-separated /
504 // collinear columns and a small per-class λ leaves `XᵀW_aX + λ_a S`
505 // singular. faer's symmetric fallback chain ends at Bunch–Kaufman
506 // (LBLᵀ), which factorizes indefinite/singular matrices "successfully"
507 // and then back-substitutes through near-zero pivots, yielding a
508 // non-finite δ. Rather than aborting the whole fit on one bad block, we
509 // add a small ridge `τ·I` (Levenberg style) to the diagonal and
510 // re-factorize, escalating τ geometrically until the step is finite.
511 //
512 // The base ridge is scaled by the Hessian's largest diagonal entry so
513 // it is invariant to the problem's overall curvature scale: a tiny
514 // nudge relative to the dominant curvature, large enough to lift the
515 // null directions off zero. A finite δ from the ridged system is a
516 // descent direction for the *unridged* penalized objective `F`
517 // (ridging only shrinks the step toward the gradient direction), and
518 // the backtracking line search below validates it against `F` itself,
519 // so the ridge never biases the converged β̂ — at the optimum the
520 // gradient vanishes and the step → 0 regardless of τ.
521 let max_diag =
522 (0..beta_flat_dim).fold(0.0_f64, |acc, idx| acc.max(hessian[[idx, idx]].abs()));
523 // The ridge floors at `base_ridge` (not 0) for every solve. An exactly
524 // rank-deficient block (e.g. duplicate / collinear design columns under
525 // a near-zero λ) leaves `H = block(XᵀWX) + diag_a(λ_a S)` singular along
526 // a null direction. faer's Bunch–Kaufman fallback factorizes a singular
527 // matrix "successfully" and back-substitutes through the zero pivot to a
528 // *finite but arbitrary* component in the null space, so the resulting
529 // Newton direction is not a descent direction in the identified
530 // subspace — the line search then shrinks α toward 0 and the step-norm
531 // test declares a false convergence at a point where the unridged
532 // penalized gradient on identified directions is still large (gam#856).
533 // A minimal Tikhonov ridge `base_ridge·I` resolves the null direction to
534 // its minimum-norm representative, giving a true descent direction.
535 let base_ridge = if max_diag.is_finite() && max_diag > 0.0 {
536 max_diag * BASE_RIDGE_FRACTION_OF_MAX_DIAG
537 } else {
538 BASE_RIDGE_FRACTION_OF_MAX_DIAG
539 };
540 let mut delta = Array1::<f64>::zeros(beta_flat_dim);
541 let mut ridge = base_ridge;
542 let mut solved = false;
543 for attempt in 0..=MAX_RIDGE_ESCALATIONS {
544 let mut ridged = hessian.clone();
545 if ridge > 0.0 {
546 for idx in 0..beta_flat_dim {
547 ridged[[idx, idx]] += ridge;
548 }
549 }
550 let factor = match factorize_symmetricwith_fallback(
551 FaerArrayView::new(&ridged).as_ref(),
552 Side::Lower,
553 ) {
554 Ok(factor) => factor,
555 Err(err) => {
556 // A genuine factorization failure (not just a singular
557 // pivot) — escalate the ridge and retry; only give up after
558 // exhausting the escalation budget.
559 if attempt == MAX_RIDGE_ESCALATIONS {
560 return Err(EstimationError::InvalidInput(format!(
561 "{context}: Hessian factorization failed at iter {iter} \
562 even with ridge {ridge:.3e}: {err}"
563 )));
564 }
565 ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
566 continue;
567 }
568 };
569 let mut rhs = Array2::<f64>::zeros((beta_flat_dim, 1));
570 for i in 0..beta_flat_dim {
571 rhs[[i, 0]] = -grad_flat[i];
572 }
573 {
574 let rhs_view = array2_to_matmut(&mut rhs);
575 factor.solve_in_place(rhs_view);
576 }
577 if (0..beta_flat_dim).all(|i| rhs[[i, 0]].is_finite()) {
578 for i in 0..beta_flat_dim {
579 delta[i] = rhs[[i, 0]];
580 }
581 solved = true;
582 break;
583 }
584 // Singular pivots back-substituted to ±inf/NaN: escalate the ridge.
585 ridge = if ridge > 0.0 { ridge * 2.0 } else { base_ridge };
586 }
587 assert!(
588 solved,
589 "{context}: Newton step remained non-finite at iter {iter} after {} ridge \
590 escalations up to {ridge:.3e}; the penalized Hessian is pathologically \
591 rank-deficient (grad_norm={:.3e}, max_diag={max_diag:.3e})",
592 MAX_RIDGE_ESCALATIONS,
593 grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt(),
594 );
595
596 // Damped acceptance: full step first, halve up to `MAX_BACKTRACKS` times
597 // if the penalized negative log-likelihood fails to decrease. The first
598 // iteration seeds `last_objective` from the initial β.
599 let proposed_beta = |alpha: f64| -> Array2<f64> {
600 let mut out = beta.clone();
601 for a in 0..m {
602 for i in 0..p {
603 out[[i, a]] += alpha * delta[a * p + i];
604 }
605 }
606 out
607 };
608 if iter == 0 {
609 last_objective = evaluate_objective(&beta, &mut eta_objective_scratch);
610 if !last_objective.is_finite() {
611 crate::bail_invalid_estim!("{context}: non-finite objective at β = 0");
612 }
613 }
614 let mut alpha = 1.0_f64;
615 let mut accepted_beta = proposed_beta(alpha);
616 let mut new_objective = evaluate_objective(&accepted_beta, &mut eta_objective_scratch);
617 let mut backtrack = 0usize;
618 while (!new_objective.is_finite()
619 || new_objective > last_objective + OBJECTIVE_DECREASE_SLACK)
620 && backtrack < MAX_BACKTRACKS
621 {
622 alpha *= LINE_SEARCH_SHRINK;
623 accepted_beta = proposed_beta(alpha);
624 new_objective = evaluate_objective(&accepted_beta, &mut eta_objective_scratch);
625 backtrack += 1;
626 }
627
628 let mut step_norm_sq = 0.0_f64;
629 let mut beta_norm_sq = 0.0_f64;
630 for a in 0..m {
631 for i in 0..p {
632 let d = accepted_beta[[i, a]] - beta[[i, a]];
633 step_norm_sq += d * d;
634 let v = accepted_beta[[i, a]];
635 beta_norm_sq += v * v;
636 }
637 }
638
639 beta = accepted_beta;
640 last_objective = new_objective;
641
642 let step_norm = step_norm_sq.sqrt();
643 let beta_norm = beta_norm_sq.sqrt();
644 // First-order optimality gate (gam#856): the step-norm test alone can
645 // fire prematurely when a backtracking line search has shrunk α on a
646 // poor direction, leaving a point that is NOT stationary. `grad_flat`
647 // is the unridged penalized gradient ∇F(β) at the pre-step β; with a
648 // small step it is ≈ ∇F at the accepted β. Its norm reflects only
649 // identified directions (it is exactly zero along an unidentified null
650 // direction such as a duplicate-column e₁−e₂ split), so requiring it to
651 // be small certifies first-order optimality on the identified subspace
652 // without penalizing legitimate non-identifiability. Scale the gate by
653 // the data magnitude so it is invariant to problem scale.
654 let grad_norm = grad_flat.iter().map(|v| v * v).sum::<f64>().sqrt();
655 // Curvature-scaled optimality threshold: `max_diag` is the dominant
656 // penalized-Hessian diagonal entry, so `OPTIMALITY_GRAD_FRACTION·max_diag`
657 // is a tiny gradient relative to the problem's curvature scale and is
658 // reached by a few quadratically-converging Newton steps on this smooth,
659 // bounded softmax/binomial likelihood.
660 let grad_optimal = grad_norm <= OPTIMALITY_GRAD_FRACTION * (1.0 + max_diag);
661 if step_norm <= tol * (1.0 + beta_norm) && grad_optimal {
662 converged = true;
663 break;
664 }
665 }
666
667 // ──────────────────────────── post-process ────────────────────────────
668 recompute_eta(&beta, &mut eta);
669 let log_likelihood = likelihood.log_lik(eta.view(), y);
670 let penalty_term = weighted_penalty_sum(&beta, penalty, lambdas, class_penalty_metric);
671
672 Ok(PenalizedVectorGlmOutputs {
673 coefficients: beta,
674 eta,
675 iterations,
676 converged,
677 log_likelihood,
678 penalty_term,
679 })
680}
681
682#[cfg(test)]
683mod parity_tests {
684 //! Parity tests for the shared scaffold across both Fisher-block families
685 //! (issue #409). The engine is exercised through the two public adapters —
686 //! [`crate::binomial_multi::fit_penalized_binomial_multi`]
687 //! (row-diagonal block) and
688 //! [`crate::multinomial::fit_penalized_multinomial`] (dense
689 //! softmax block) — and we assert, with un-weakened bounds, that:
690 //!
691 //! 1. each fit hits the first-order optimality condition `∇F(β̂) = 0`,
692 //! verified by a central finite difference of the penalized objective
693 //! (the engine never sees this gradient, so this is an independent
694 //! check that the shared Newton scaffold converged correctly);
695 //! 2. the reported fitted probabilities are consistent with `β̂` and the
696 //! reported deviance equals `−2 · log L(β̂)`;
697 //! 3. for the binomial family, the `K`-column joint solve reproduces a
698 //! from-scratch single-column penalized logistic Newton solve column
699 //! for column (the row-diagonal block must decouple exactly).
700
701 use super::{ClassPenaltyMetric, weighted_penalty_sum};
702 use crate::binomial_multi::{BinomialMultiFitInputs, fit_penalized_binomial_multi};
703 use crate::multinomial::{MultinomialFitInputs, fit_penalized_multinomial};
704 use ndarray::{Array1, Array2};
705
706 /// #1587: the `Centered` class-penalty metric is invariant to the arbitrary
707 /// reference-class choice. Penalizing the `K−1` ALR contrasts under ANY of
708 /// the `K` baselines yields the same value (the symmetric CLR penalty
709 /// `Σ_k β̃_kᵀSβ̃_k`), whereas the historical `Diagonal` metric does not — that
710 /// non-invariance is exactly the #1587 defect. Pure-algebra check on the
711 /// penalty form (no fit), so it pins the engine foundation the production
712 /// wiring (REML per-term λ re-key) will build on.
713 #[test]
714 fn centered_penalty_is_reference_class_invariant_1587() {
715 // K = 3 classes, p = 2 coefficients; symmetric PSD penalty S.
716 let s = ndarray::array![[2.0_f64, 0.5], [0.5, 1.0]];
717 // A CLR (sum-to-zero) coefficient set: β̃_0 + β̃_1 + β̃_2 = 0.
718 let bt = [[1.0_f64, 0.5], [-0.3, 0.2], [-0.7, -0.7]];
719 for j in 0..2 {
720 let colsum: f64 = (0..3).map(|k| bt[k][j]).sum();
721 assert!(colsum.abs() < 1e-12, "test CLR set must sum to zero");
722 }
723 // Direct symmetric penalty Σ_k β̃_kᵀ S β̃_k.
724 let mut symmetric = 0.0_f64;
725 for k in 0..3 {
726 for i in 0..2 {
727 for j in 0..2 {
728 symmetric += bt[k][i] * s[[i, j]] * bt[k][j];
729 }
730 }
731 }
732 let lambdas = Array1::from(vec![1.0_f64, 1.0]);
733 let mut centered_vals = Vec::new();
734 let mut diagonal_vals = Vec::new();
735 // For each reference class r, the two ALR contrasts are β̃_a − β̃_r (a≠r).
736 for r in 0..3 {
737 let others: Vec<usize> = (0..3).filter(|&k| k != r).collect();
738 let mut beta = Array2::<f64>::zeros((2, 2));
739 for (a, &o) in others.iter().enumerate() {
740 for i in 0..2 {
741 beta[[i, a]] = bt[o][i] - bt[r][i];
742 }
743 }
744 let c =
745 weighted_penalty_sum(&beta, s.view(), lambdas.view(), ClassPenaltyMetric::Centered);
746 let d =
747 weighted_penalty_sum(&beta, s.view(), lambdas.view(), ClassPenaltyMetric::Diagonal);
748 assert!(
749 (c - 0.5 * symmetric).abs() < 1e-12,
750 "ref {r}: Centered penalty {c} must equal ½·symmetric {}",
751 0.5 * symmetric
752 );
753 centered_vals.push(c);
754 diagonal_vals.push(d);
755 }
756 let cspread = centered_vals.iter().cloned().fold(f64::MIN, f64::max)
757 - centered_vals.iter().cloned().fold(f64::MAX, f64::min);
758 assert!(
759 cspread < 1e-12,
760 "Centered must be reference-invariant; got {centered_vals:?}"
761 );
762 let dspread = diagonal_vals.iter().cloned().fold(f64::MIN, f64::max)
763 - diagonal_vals.iter().cloned().fold(f64::MAX, f64::min);
764 assert!(
765 dspread > 1e-6,
766 "Diagonal is the non-invariant #1587 path; references must disagree, got {diagonal_vals:?}"
767 );
768 }
769
770 fn sigmoid(eta: f64) -> f64 {
771 if eta >= 0.0 {
772 1.0 / (1.0 + (-eta).exp())
773 } else {
774 let e = eta.exp();
775 e / (1.0 + e)
776 }
777 }
778
779 /// Softmax with implicit reference column (η_ref = 0) over `M` active η.
780 fn softmax_ref(eta_active: &[f64]) -> Vec<f64> {
781 let m = eta_active.len();
782 let mut out = vec![0.0_f64; m + 1];
783 let mut max_eta = 0.0_f64;
784 for &v in eta_active {
785 if v > max_eta {
786 max_eta = v;
787 }
788 }
789 let baseline = (-max_eta).exp();
790 let mut denom = baseline;
791 for (idx, &v) in eta_active.iter().enumerate() {
792 let e = (v - max_eta).exp();
793 out[idx] = e;
794 denom += e;
795 }
796 for v in out.iter_mut().take(m) {
797 *v /= denom;
798 }
799 out[m] = baseline / denom;
800 out
801 }
802
803 /// Penalized negative log-likelihood for the independent-binomial family at
804 /// a candidate coefficient matrix `β ∈ ℝ^{P×K}`, computed directly from the
805 /// definition (no engine internals).
806 fn binomial_objective(
807 design: &Array2<f64>,
808 y: &Array2<f64>,
809 penalty: &Array2<f64>,
810 lambdas: &Array1<f64>,
811 beta: &Array2<f64>,
812 ) -> f64 {
813 let (n, p) = design.dim();
814 let k = y.ncols();
815 let mut ll = 0.0_f64;
816 for row in 0..n {
817 for a in 0..k {
818 let mut eta = 0.0_f64;
819 for i in 0..p {
820 eta += design[[row, i]] * beta[[i, a]];
821 }
822 let mu = sigmoid(eta).clamp(1.0e-12, 1.0 - 1.0e-12);
823 let yv = y[[row, a]];
824 ll += yv * mu.ln() + (1.0 - yv) * (1.0 - mu).ln();
825 }
826 }
827 let mut pen = 0.0_f64;
828 for a in 0..k {
829 let la = lambdas[a];
830 for i in 0..p {
831 let mut sbi = 0.0_f64;
832 for j in 0..p {
833 sbi += penalty[[i, j]] * beta[[j, a]];
834 }
835 pen += 0.5 * la * beta[[i, a]] * sbi;
836 }
837 }
838 -ll + pen
839 }
840
841 /// Penalized negative log-likelihood for the multinomial family at a
842 /// candidate active-class coefficient matrix `β ∈ ℝ^{P×(K-1)}`.
843 fn multinomial_objective(
844 design: &Array2<f64>,
845 y_one_hot: &Array2<f64>,
846 penalty: &Array2<f64>,
847 lambdas: &Array1<f64>,
848 beta: &Array2<f64>,
849 ) -> f64 {
850 let (n, p) = design.dim();
851 let k = y_one_hot.ncols();
852 let m = k - 1;
853 let mut ll = 0.0_f64;
854 let mut eta_active = vec![0.0_f64; m];
855 for row in 0..n {
856 for a in 0..m {
857 let mut eta = 0.0_f64;
858 for i in 0..p {
859 eta += design[[row, i]] * beta[[i, a]];
860 }
861 eta_active[a] = eta;
862 }
863 let probs = softmax_ref(&eta_active);
864 for c in 0..k {
865 let yc = y_one_hot[[row, c]];
866 if yc != 0.0 {
867 ll += yc * probs[c].max(1.0e-300).ln();
868 }
869 }
870 }
871 let mut pen = 0.0_f64;
872 for a in 0..m {
873 let la = lambdas[a];
874 for i in 0..p {
875 let mut sbi = 0.0_f64;
876 for j in 0..p {
877 sbi += penalty[[i, j]] * beta[[j, a]];
878 }
879 pen += 0.5 * la * beta[[i, a]] * sbi;
880 }
881 }
882 -ll + pen
883 }
884
885 /// Central finite-difference gradient of an objective over every entry of a
886 /// `(P, C)` coefficient matrix. The optimum must drive every component to
887 /// ~0; we assert the max |component| against an un-weakened bound.
888 fn fd_grad<F: Fn(&Array2<f64>) -> f64>(beta: &Array2<f64>, f: F) -> f64 {
889 let (p, c) = beta.dim();
890 let h = 1.0e-6;
891 let mut max_abs = 0.0_f64;
892 for i in 0..p {
893 for a in 0..c {
894 let mut up = beta.clone();
895 let mut dn = beta.clone();
896 up[[i, a]] += h;
897 dn[[i, a]] -= h;
898 let g = (f(&up) - f(&dn)) / (2.0 * h);
899 max_abs = max_abs.max(g.abs());
900 }
901 }
902 max_abs
903 }
904
905 fn binomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
906 let n = 40;
907 let p = 3;
908 let k = 3;
909 let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
910 0 => 1.0,
911 1 => ((i + 1) as f64 * 0.37).sin(),
912 _ => ((i + 1) as f64 * 0.11).cos(),
913 });
914 let y = Array2::<f64>::from_shape_fn((n, k), |(i, a)| {
915 // Deterministic but non-degenerate {0,1} labels per column.
916 if ((i * 7 + a * 13 + 3) % 5) < 3 {
917 1.0
918 } else {
919 0.0
920 }
921 });
922 let penalty = Array2::<f64>::eye(p);
923 let lambdas = Array1::from(vec![0.3_f64, 1.2, 2.5]);
924 (design, y, penalty, lambdas)
925 }
926
927 fn multinomial_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>, Array1<f64>) {
928 let n = 45;
929 let p = 3;
930 let k = 4;
931 let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
932 0 => 1.0,
933 1 => ((i + 2) as f64 * 0.29).sin(),
934 _ => ((i + 2) as f64 * 0.17).cos(),
935 });
936 let mut y = Array2::<f64>::zeros((n, k));
937 for i in 0..n {
938 y[[i, (i * 3 + 1) % k]] = 1.0;
939 }
940 let penalty = Array2::<f64>::eye(p);
941 let lambdas = Array1::from(vec![0.5_f64, 1.0, 2.0]);
942 (design, y, penalty, lambdas)
943 }
944
945 #[test]
946 fn binomial_engine_hits_optimum_and_is_self_consistent() {
947 let (design, y, penalty, lambdas) = binomial_fixture();
948 let fit = fit_penalized_binomial_multi(BinomialMultiFitInputs {
949 design: design.view(),
950 y: y.view(),
951 penalty: penalty.view(),
952 lambdas: lambdas.view(),
953 row_weights: None,
954 fisher_w_override: None,
955 max_iter: 100,
956 tol: 1.0e-12,
957 })
958 .expect("binomial fit must succeed");
959 assert!(fit.converged, "binomial fit must converge");
960
961 // First-order optimality: ∇F(β̂) = 0 (engine never used this gradient).
962 let g = fd_grad(&fit.coefficients, |b| {
963 binomial_objective(&design, &y, &penalty, &lambdas, b)
964 });
965 assert!(
966 g < 1.0e-6,
967 "binomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
968 );
969
970 // Fitted probabilities reproduce σ(X β̂) and deviance = −2 log L.
971 let (n, p) = design.dim();
972 let k = y.ncols();
973 let mut log_lik = 0.0_f64;
974 for row in 0..n {
975 for a in 0..k {
976 let mut eta = 0.0_f64;
977 for i in 0..p {
978 eta += design[[row, i]] * fit.coefficients[[i, a]];
979 }
980 let mu = sigmoid(eta);
981 assert!(
982 (fit.fitted_probabilities[[row, a]] - mu).abs() < 1.0e-10,
983 "fitted probability must equal σ(X β̂)"
984 );
985 let muc = mu.clamp(1.0e-12, 1.0 - 1.0e-12);
986 let yv = y[[row, a]];
987 log_lik += yv * muc.ln() + (1.0 - yv) * (1.0 - muc).ln();
988 }
989 }
990 assert!(
991 (fit.deviance - (-2.0 * log_lik)).abs() < 1.0e-9,
992 "deviance must equal −2 log L"
993 );
994 }
995
996 #[test]
997 fn binomial_joint_solve_decouples_into_single_column_solves() {
998 // Parity: the row-diagonal Fisher block means the K-column joint solve
999 // must reproduce, column for column, an independent single-column
1000 // penalized logistic Newton solve. This is the defining property the
1001 // shared engine preserves for the independent-binomial family.
1002 let (design, y, penalty, lambdas) = binomial_fixture();
1003 let joint = fit_penalized_binomial_multi(BinomialMultiFitInputs {
1004 design: design.view(),
1005 y: y.view(),
1006 penalty: penalty.view(),
1007 lambdas: lambdas.view(),
1008 row_weights: None,
1009 fisher_w_override: None,
1010 max_iter: 100,
1011 tol: 1.0e-12,
1012 })
1013 .expect("joint fit must succeed");
1014
1015 let k = y.ncols();
1016 for a in 0..k {
1017 // Single-column problem: one binomial response, one λ.
1018 let y_col = y.column(a).to_owned().insert_axis(ndarray::Axis(1));
1019 let lam = Array1::from(vec![lambdas[a]]);
1020 let single = fit_penalized_binomial_multi(BinomialMultiFitInputs {
1021 design: design.view(),
1022 y: y_col.view(),
1023 penalty: penalty.view(),
1024 lambdas: lam.view(),
1025 row_weights: None,
1026 fisher_w_override: None,
1027 max_iter: 100,
1028 tol: 1.0e-12,
1029 })
1030 .expect("single-column fit must succeed");
1031 for i in 0..design.ncols() {
1032 let dj = joint.coefficients[[i, a]];
1033 let ds = single.coefficients[[i, 0]];
1034 assert!(
1035 (dj - ds).abs() < 1.0e-8,
1036 "joint column {a} coef {i} ({dj}) must match single-column solve ({ds})"
1037 );
1038 }
1039 }
1040 }
1041
1042 #[test]
1043 fn multinomial_engine_hits_optimum_and_is_self_consistent() {
1044 let (design, y, penalty, lambdas) = multinomial_fixture();
1045 let fit = fit_penalized_multinomial(MultinomialFitInputs {
1046 design: design.view(),
1047 y_one_hot: y.view(),
1048 penalty: penalty.view(),
1049 lambdas: lambdas.view(),
1050 row_weights: None,
1051 fisher_w_override: None,
1052 max_iter: 100,
1053 tol: 1.0e-12,
1054 })
1055 .expect("multinomial fit must succeed");
1056 assert!(fit.converged, "multinomial fit must converge");
1057
1058 // First-order optimality: ∇F(β̂) = 0.
1059 let g = fd_grad(&fit.coefficients_active, |b| {
1060 multinomial_objective(&design, &y, &penalty, &lambdas, b)
1061 });
1062 assert!(
1063 g < 1.0e-6,
1064 "multinomial penalized gradient at β̂ must vanish (max |∂F| = {g})"
1065 );
1066
1067 // Fitted probabilities are a valid simplex per row and reproduce the
1068 // softmax of X β̂; deviance = −2 log L.
1069 let (n, p) = design.dim();
1070 let k = y.ncols();
1071 let m = k - 1;
1072 let mut log_lik = 0.0_f64;
1073 let mut eta_active = vec![0.0_f64; m];
1074 for row in 0..n {
1075 for a in 0..m {
1076 let mut eta = 0.0_f64;
1077 for i in 0..p {
1078 eta += design[[row, i]] * fit.coefficients_active[[i, a]];
1079 }
1080 eta_active[a] = eta;
1081 }
1082 let probs = softmax_ref(&eta_active);
1083 let mut row_sum = 0.0_f64;
1084 for c in 0..k {
1085 assert!(
1086 (fit.fitted_probabilities[[row, c]] - probs[c]).abs() < 1.0e-10,
1087 "fitted probability must equal softmax(X β̂)"
1088 );
1089 row_sum += fit.fitted_probabilities[[row, c]];
1090 let yc = y[[row, c]];
1091 if yc != 0.0 {
1092 log_lik += yc * probs[c].max(1.0e-300).ln();
1093 }
1094 }
1095 assert!(
1096 (row_sum - 1.0).abs() < 1.0e-10,
1097 "fitted probabilities must sum to 1 per row"
1098 );
1099 }
1100 assert!(
1101 (fit.deviance - (-2.0 * log_lik)).abs() < 1.0e-9,
1102 "deviance must equal −2 log L"
1103 );
1104 }
1105
1106 #[test]
1107 fn multinomial_rank_deficient_block_recovers_via_ridge_not_crash() {
1108 // Issue #557: a rank-deficient class block under a tiny per-class λ used
1109 // to make faer's Bunch–Kaufman fallback back-substitute through near-zero
1110 // pivots into a non-finite Newton step δ, and the solver aborted with
1111 // "Newton step is non-finite". The adaptive Levenberg–Marquardt ridge
1112 // must instead lift the null direction off zero, keep δ finite, and let
1113 // the backtracking line search converge to the penalized optimum.
1114 //
1115 // Construct an exactly rank-deficient design: column 2 is a perfect
1116 // duplicate of column 1, so XᵀWX is singular along (e₁ − e₂) for every
1117 // class, and we drive the corresponding λ to a tiny value so the penalty
1118 // cannot regularize that null direction. A non-robust solver crashes
1119 // here; the ridge path must produce a finite, self-consistent fit.
1120 let n = 50;
1121 let p = 4;
1122 let k = 4;
1123 let design = Array2::<f64>::from_shape_fn((n, p), |(i, j)| match j {
1124 0 => 1.0,
1125 1 => ((i + 1) as f64 * 0.23).sin(),
1126 2 => ((i + 1) as f64 * 0.23).sin(), // exact duplicate of column 1
1127 _ => ((i + 1) as f64 * 0.19).cos(),
1128 });
1129 let mut y = Array2::<f64>::zeros((n, k));
1130 for i in 0..n {
1131 y[[i, (i * 5 + 2) % k]] = 1.0;
1132 }
1133 // Penalty touches only the smooth-ish columns 1..p; columns 0/1/2 share
1134 // the collinearity, and a near-zero λ leaves the (e₁ − e₂) null direction
1135 // unregularized — exactly the rank-deficient regime that triggered #557.
1136 let mut penalty = Array2::<f64>::zeros((p, p));
1137 penalty[[3, 3]] = 1.0;
1138 let lambdas = Array1::from(vec![1.0e-10_f64, 1.0e-10, 1.0e-10]);
1139
1140 let fit = fit_penalized_multinomial(MultinomialFitInputs {
1141 design: design.view(),
1142 y_one_hot: y.view(),
1143 penalty: penalty.view(),
1144 lambdas: lambdas.view(),
1145 row_weights: None,
1146 fisher_w_override: None,
1147 max_iter: 200,
1148 tol: 1.0e-10,
1149 })
1150 .expect("rank-deficient multinomial fit must NOT crash (#557): the ridge path recovers it");
1151
1152 // Every coefficient and fitted probability must be finite (no inf/NaN
1153 // leaked from the near-singular solve).
1154 for &c in fit.coefficients_active.iter() {
1155 assert!(c.is_finite(), "coefficient must be finite, got {c}");
1156 }
1157 for &pr in fit.fitted_probabilities.iter() {
1158 assert!(
1159 pr.is_finite() && (-1.0e-9..=1.0 + 1.0e-9).contains(&pr),
1160 "fitted probability must be a finite simplex entry, got {pr}"
1161 );
1162 }
1163 // Rows must remain on the simplex.
1164 let (nn, kk) = fit.fitted_probabilities.dim();
1165 for row in 0..nn {
1166 let s: f64 = (0..kk).map(|c| fit.fitted_probabilities[[row, c]]).sum();
1167 assert!(
1168 (s - 1.0).abs() < 1.0e-9,
1169 "row {row} probabilities must sum to 1, got {s}"
1170 );
1171 }
1172
1173 // The recovered fit must satisfy first-order optimality of the penalized
1174 // objective along every NON-NULL coordinate. The (e₁ − e₂) null
1175 // direction is unidentified (the ridge picks the minimum-norm split
1176 // between the duplicate columns), so the gradient is exactly zero along
1177 // every identified direction; a central finite difference of F over the
1178 // full coefficient matrix is dominated by the identified part and must be
1179 // small. We assert the penalized objective gradient is near-zero — the
1180 // ridge biases the step but never the optimum (at β̂ the unridged
1181 // gradient vanishes for any τ).
1182 let g = fd_grad(&fit.coefficients_active, |b| {
1183 multinomial_objective(&design, &y, &penalty, &lambdas, b)
1184 });
1185 assert!(
1186 g < 1.0e-4,
1187 "penalized objective gradient at the ridge-recovered β̂ must (near-)vanish \
1188 along identified directions (max |∂F| = {g})"
1189 );
1190 }
1191}