gam_solve/estimate/smoothing_correction.rs
1use super::*;
2
3/// Default inner P-IRLS tolerance floor.
4///
5/// The inner Newton iteration certifies the coefficient mode against this
6/// (scale-aware) tolerance independently of the outer REML tolerance. Coupling
7/// the two collapses two unrelated convergence concepts: when a user dials the
8/// outer tolerance up to e.g. 1e-3 to make the smoothing-parameter search
9/// coarser, the inner solve becomes coarse too, returning betas whose
10/// stationarity residual is ~1e-3·scale rather than the floating-point noise
11/// floor. Outer derivatives then read those imprecise betas as if they were
12/// the true mode and accumulate error. Keeping the inner floor at 1e-6 lets
13/// the outer loop relax without contaminating the coefficient certificate.
14pub(crate) const PIRLS_INNER_TOLERANCE_FLOOR: f64 = 1e-6;
15
16#[derive(Clone)]
17pub(crate) struct RemlConfig {
18 pub(crate) likelihood: GlmLikelihoodSpec,
19 pub(crate) link_kind: InverseLink,
20 pub(crate) pirls_convergence_tolerance: f64,
21 pub(crate) max_iterations: usize,
22 pub(crate) reml_convergence_tolerance: f64,
23 pub(crate) firth_bias_reduction: bool,
24 /// Forwarded to `pirls::PirlsConfig::geodesic_acceleration`. Off by default.
25 pub(crate) geodesic_acceleration: bool,
26}
27
28impl RemlConfig {
29 pub(crate) fn external(
30 likelihood: GlmLikelihoodSpec,
31 reml_tol: f64,
32 firth_bias_reduction: bool,
33 ) -> Self {
34 // Inner P-IRLS certifies the coefficient mode against
35 // `pirls_convergence_tolerance`; the outer REML iteration certifies
36 // the smoothing-parameter optimum against `reml_convergence_tolerance`.
37 // These are different concepts and must not be coupled. The inner
38 // tolerance is at most the outer tolerance (so a user who *tightens*
39 // the outer also tightens the inner), but never coarser than the
40 // floor — a coarse outer must not silently pollute the inner mode.
41 let pirls_tol = reml_tol.min(PIRLS_INNER_TOLERANCE_FLOOR);
42 let link_kind = likelihood.spec.link.clone();
43 Self {
44 likelihood,
45 link_kind,
46 pirls_convergence_tolerance: pirls_tol,
47 max_iterations: 0,
48 reml_convergence_tolerance: reml_tol,
49 firth_bias_reduction,
50 geodesic_acceleration: false,
51 }
52 .with_max_iterations(300)
53 }
54
55 pub(crate) fn with_max_iterations(mut self, max_iterations: usize) -> Self {
56 self.max_iterations = max_iterations;
57 self
58 }
59
60 pub(crate) fn link_function(&self) -> LinkFunction {
61 self.link_kind.link_function()
62 }
63
64 pub(crate) fn as_pirls_config(&self) -> pirls::PirlsConfig {
65 pirls::PirlsConfig {
66 likelihood: self.likelihood.clone(),
67 link_kind: self.link_kind.clone(),
68 max_iterations: self.max_iterations,
69 convergence_tolerance: self.pirls_convergence_tolerance,
70 firth_bias_reduction: self.firth_bias_reduction,
71 // Caller (the REML runtime) populates this hint just before
72 // each `execute_pirls_if_needed` call from the cached final
73 // λ of the previous successful PIRLS solve.
74 initial_lm_lambda: None,
75 geodesic_acceleration: self.geodesic_acceleration,
76 // Arrow-Schur structured-inner-solve descriptor. Not used by
77 // the standard REML→PIRLS path (β-only); set by the latent
78 // driver (`crate::latent_inner::LatentInnerSolver`)
79 // which assembles the per-row (t, β) bordered system
80 // externally. Default `None` preserves back-compat.
81 arrow_schur: None,
82 }
83 }
84}
85pub(crate) const MAX_FACTORIZATION_ATTEMPTS: usize = 4;
86
87/// Small ridge added to the rho-space LAML Hessian before inversion, for
88/// numerical stability when smoothing parameters are weakly identified.
89///
90/// **Stabilization semantics:** this ridge is a
91/// [`gam_problem::StabilizationKind::NumericalPerturbation`] (not an
92/// `ExplicitPrior`). It enters only the inverse used to build `V_rho` for
93/// the smoothing-correction propagation step. It does NOT enter the LAML
94/// objective, its gradient, the saved coefficients, or any user-visible
95/// summary — the rho-Hessian itself is recomputed from first principles
96/// in every place that consults it. Classified as
97/// [`gam_problem::StabilizationKind::NumericalPerturbation`]; no ledger
98/// record is emitted at this site because the perturbation never escapes the
99/// local `V_rho` inverse (it touches no saved coefficient, objective, or
100/// user-visible summary).
101const LAML_RIDGE: f64 = 1e-8;
102/// Minimum penalized-deviance floor, expressed as a fraction of the
103/// problem's own deviance scale (the weighted null deviance `D₀`, see
104/// [`smooth_floor_dp`]). The floor exists only to keep the profiled
105/// dispersion `φ̂ = D_p/(n−M_p)` strictly positive when a smooth fits the
106/// data essentially perfectly (`D_p ↓ 0`), so it must trigger on the
107/// *relative* smallness `D_p/D₀`, never on an absolute magnitude — an
108/// absolute floor silently breaks the exact scale-equivariance of the
109/// Gaussian REML fit under a response rescale `y → a·y` (#1127).
110pub(crate) const DP_FLOOR: f64 = 1e-12;
111/// Width of the smooth transition region for the deviance floor, also as a
112/// fraction of the deviance scale `D₀`.
113const DP_FLOOR_SMOOTH_WIDTH: f64 = 1e-8;
114
115// Unified rho bound corresponding to lambda in [exp(-RHO_BOUND), exp(RHO_BOUND)].
116// Additional headroom reduces frequent contact with the hard box constraints.
117pub const RHO_BOUND: f64 = 30.0;
118// Soft interior prior on rho near the box boundaries.
119pub(crate) const RHO_SOFT_PRIOR_WEIGHT: f64 = 1e-6;
120pub(crate) const RHO_SOFT_PRIOR_SHARPNESS: f64 = 4.0;
121// Adaptive cubature guardrails for bounded correction latency.
122pub(crate) const AUTO_CUBATURE_MAX_RHO_DIM: usize = 12;
123pub(crate) const AUTO_CUBATURE_MAX_EIGENVECTORS: usize = 4;
124pub(crate) const AUTO_CUBATURE_TARGET_VAR_FRAC: f64 = 0.95;
125pub(crate) const AUTO_CUBATURE_MAX_BETA_DIM: usize = 1600;
126pub(crate) const AUTO_CUBATURE_BOUNDARY_MARGIN: f64 = 2.0;
127
128/// Smooth, differentiable approximation of `max(dp, floor)` where the floor
129/// and the width of the smoothing band are taken **relative to the supplied
130/// deviance `scale`** (the weighted null deviance `D₀` of the response).
131///
132/// Returns the smoothed value, first derivative, and second derivative with
133/// respect to `dp`.
134///
135/// # Why the floor must be relative (issue #1127)
136///
137/// The penalized deviance `D_p = Σ wᵢ(yᵢ−μ̂ᵢ)² + β̂ᵀSβ̂` is exactly quadratic
138/// in the response, so under a multiplicative rescale `y → a·y` it scales as
139/// `D_p → a²·D_p`. The profiled Gaussian REML criterion depends on `D_p` only
140/// through `log D_p` (the `(ν/2)·log(2πφ̂)` term, `φ̂ = D_p/ν`), so the rescale
141/// shifts the cost by the *additive constant* `ν·log a` and leaves the
142/// ρ-gradient — hence the selected `λ̂`, the EDF, and `ŝ(x)/a` — exactly
143/// invariant. An **absolute** floor destroys this: when `a` is small enough
144/// that `D_p` enters the fixed band (e.g. `D_p ≈ 3.6e-11` at `a = 1e-6` with
145/// a band of width `1e-8`), `dp_c` is spuriously inflated toward the absolute
146/// floor, `log dp_c` stops tracking `2·log a + const`, and the optimizer
147/// converges at an over-smoothed `λ̂` — reshaping, not merely rescaling, the
148/// smooth. Scaling both the floor and its width by `D₀ ∝ a²` makes the band a
149/// fixed *fraction* of the deviance, so `smooth_floor_dp(a²·dp, a²·D₀) =
150/// a²·smooth_floor_dp(dp, D₀)` exactly and equivariance is restored.
151///
152/// `scale = 1.0` recovers the historical absolute floor byte-for-byte, which
153/// is the correct default for callers without a Gaussian response scale in
154/// hand (the floor is consumed only on the profiled-Gaussian path).
155pub(crate) fn smooth_floor_dp(dp: f64, scale: f64) -> (f64, f64, f64) {
156 let scale = if scale.is_finite() && scale > 0.0 {
157 scale
158 } else {
159 1.0
160 };
161 let floor = DP_FLOOR * scale;
162 let tau = (DP_FLOOR_SMOOTH_WIDTH * scale).max(f64::MIN_POSITIVE);
163 let scaled = (dp - floor) / tau;
164
165 let softplus = if scaled > 20.0 {
166 scaled + (-scaled).exp()
167 } else if scaled < -20.0 {
168 scaled.exp()
169 } else {
170 (1.0 + scaled.exp()).ln()
171 };
172
173 let sigma = if scaled >= 0.0 {
174 let exp_neg = (-scaled).exp();
175 1.0 / (1.0 + exp_neg)
176 } else {
177 let exp_pos = scaled.exp();
178 exp_pos / (1.0 + exp_pos)
179 };
180
181 let dp_c = floor + tau * softplus;
182 let dp_cgrad2 = sigma * (1.0 - sigma) / tau;
183 (dp_c, sigma, dp_cgrad2)
184}
185
186/// Compute the smoothing parameter uncertainty correction matrix `V_corr = J * V_rho * J^T`.
187///
188/// This implements the Wood et al. (2016) correction for smoothing parameter uncertainty.
189/// The corrected covariance for `beta` is: `V*_beta = V_beta + J * V_rho * J^T`.
190/// where:
191/// - `V_beta = H^{-1}` (conditional covariance treating `lambda` as fixed)
192/// - `J = d(beta)/d(rho)` (Jacobian wrt log-smoothing parameters)
193/// - `V_rho = (d^2 LAML / d rho^2)^{-1}` (outer covariance)
194///
195/// Returns the correction matrix in the ORIGINAL coefficient basis.
196///
197/// Full correction reference.
198/// Let `rho ~ N(mu, Sigma)` with `mu = rho_hat`, `Sigma = V_rho`,
199/// and define:
200/// - `A(rho) = H_rho^{-1}`
201/// - `b(rho) = beta_hat_rho`
202///
203/// The exact Gaussian-mixture identity is:
204/// `Var(beta) = E[A(rho)] + Var(b(rho))`.
205///
206/// Around `mu`, this routine keeps the first-order terms:
207///
208/// `E[A(rho)] ~= A(mu) = Hmu^{-1}`
209/// `Var(b(rho)) ~= J Sigma J^T`
210/// `Var(beta) ~= Hmu^{-1} + J V_rho J^T`.
211///
212/// Equivalent first-order propagation around the outer optimum `rho*`:
213///
214/// `Var(beta_hat) ~= Var(beta_hat | rho_hat) + (d beta_hat / d rho) Var(rho_hat) (d beta_hat / d rho)^T`
215/// `= V_beta + J V_rho J^T`.
216///
217/// Components:
218/// `J[:,k] = d(beta_hat)/d(rho_k) = -H^{-1}(A_k beta_hat), A_k = exp(rho_k) S_k`
219/// `V_rho = (d^2 V / d rho^2 at rho*)^{-1}`
220///
221/// Exact non-Gaussian V_ρ^{-1} requires the full Hessian with:
222/// - tr(H^{-1}H_{kℓ})
223/// - tr(H^{-1}H_k H^{-1}H_ℓ)
224/// - pseudo-det second derivatives in S
225/// - and H_{kℓ} terms containing fourth-likelihood derivatives.
226///
227/// This routine obtains V_ρ^{-1} from the analytic rho-space Hessian selected
228/// by `compute_lamlhessian_consistent`, then regularizes before inversion.
229/// If that analytic Hessian is unavailable, the correction is skipped rather
230/// than synthesized numerically.
231///
232/// Notes on omitted higher-order terms:
233/// - The exact `E[A(rho)]` and `Var(b(rho))` can be written with the Gaussian
234/// smoothing/heat operator `exp(0.5 * Delta_Sigma)` (equivalently Wick/Isserlis
235/// contractions of high-order derivatives).
236/// - Those infinite-series corrections are not expanded in this routine.
237pub(crate) struct SmoothingCorrectionComputation {
238 pub correction: Option<Array2<f64>>,
239 pub hessian_rho: Option<Array2<f64>>,
240 /// Regularized inverse outer Hessian `Cov(rho_hat)` in the same rho ordering
241 /// as the fitted smoothing-parameter vector. This exposes the #740 quantity
242 /// to LR Bartlett inference without changing the production algebra that
243 /// computes it.
244 pub rho_covariance: Option<Array2<f64>>,
245 /// Identified-subspace rank of the rho-Hessian inverse used to build
246 /// `correction`. `Some(n)` if the matrix was SPD and fully inverted;
247 /// `Some(r)` with `r < n` if the pseudo-inverse dropped non-identified
248 /// directions; `None` when no inversion was attempted or it failed before
249 /// producing a usable V_ρ. Downstream consumers (e.g. auto-cubature)
250 /// use this to decide whether higher-order corrections are even
251 /// meaningful — they aren't when V_ρ is rank-deficient.
252 pub active_rank: Option<usize>,
253}
254
255/// Result of pseudo-inverting the rho-space LAML Hessian on the identified subspace.
256///
257/// When the outer rho-Hessian has negative or near-zero eigenvalues at convergence
258/// (genuine non-convexity, Z₂-saddles from redundant penalty blocks, or weakly
259/// identified ρ directions on no-signal data), inverting it naively would either
260/// fail or yield an arbitrarily large V_ρ along those directions. Instead we
261/// split the spectrum into an identified subspace (eigenvalues strictly above an
262/// identifiability floor) and a non-identified subspace (negative, numerical zero,
263/// or marginally positive but below the floor). The returned `inverse` is the
264/// rank-deficient pseudo-inverse: 1/σ on the identified directions, 0 on the rest.
265/// `J · V_ρ · J^T` is then a valid rank-deficient inflation along well-identified
266/// rho directions, with zero contribution from non-identified directions.
267pub(crate) struct InvertedRhoHessian {
268 /// Pseudo-inverse projected onto the identified subspace.
269 pub inverse: Array2<f64>,
270 /// Number of eigenpairs retained (σ_i > tau).
271 pub active_rank: usize,
272 /// Eigenpairs dropped for σ_i < −neg_tol (genuine negative curvature).
273 pub dropped_negative: usize,
274 /// Eigenpairs dropped for marginally positive σ_i in (neg_tol, tau].
275 pub dropped_small_positive: usize,
276 /// Eigenpairs dropped for |σ_i| ≤ neg_tol (numerical zero).
277 pub dropped_numerical_zero: usize,
278 /// Smallest eigenvalue (signed) of the input Hessian.
279 pub min_eigenvalue: f64,
280 /// True whenever active_rank < n (i.e. anything was dropped). Cholesky fast
281 /// path always returns false.
282 pub repaired_hessian: bool,
283 /// Per-eigenvalue classification (length n), aligned with the input matrix's
284 /// eigendecomposition order from `eigh`. Used by the [INDEF-HESS] diagnostic.
285 /// Empty on the Cholesky fast path (matrix was SPD, no classification needed).
286 pub eigenvalues: Array1<f64>,
287 /// Eigenvectors as columns, aligned with `eigenvalues`. Empty on the Cholesky
288 /// fast path. Carrying these here eliminates a second `eigh` call in the
289 /// `[INDEF-HESS]` diagnostic — the slow path computes them once and shares.
290 pub eigenvectors: Array2<f64>,
291 /// Per-eigenvalue classification labels parallel to `eigenvalues`.
292 pub classifications: Vec<EigenClassification>,
293}
294
295#[derive(Debug, Clone, Copy, PartialEq, Eq)]
296pub(crate) enum EigenClassification {
297 Active,
298 DroppedNegative,
299 DroppedSmallPositive,
300 DroppedNumericalZero,
301}
302
303/// Invert the rho-space LAML Hessian onto the identified subspace.
304///
305/// Fast path: when the matrix is strictly positive-definite, returns the full
306/// Cholesky inverse with `active_rank = n` and `repaired_hessian = false`.
307///
308/// Slow path: eigendecompose, classify each eigenpair, and assemble the
309/// rank-deficient pseudo-inverse. Returns `None` only when the eigendecomposition
310/// itself fails (non-finite eigenvalues or eigenvectors). An all-bad spectrum
311/// (active_rank == 0) still returns `Some`; the caller is responsible for
312/// deciding whether to use a zero-rank covariance.
313pub(crate) fn invert_regularized_rho_hessian(
314 hessian_rho: &Array2<f64>,
315) -> Option<InvertedRhoHessian> {
316 let n = hessian_rho.nrows();
317 if let Ok(chol) = hessian_rho.cholesky(faer::Side::Lower) {
318 let mut inverse = Array2::<f64>::eye(n);
319 for col in 0..n {
320 let colvec = inverse.column(col).to_owned();
321 let solved = chol.solvevec(&colvec);
322 inverse.column_mut(col).assign(&solved);
323 }
324 // Spectral scale / min eigenvalue are not needed when Cholesky succeeds,
325 // but we surface coherent placeholders so downstream consumers can rely
326 // on the struct fields unconditionally.
327 return Some(InvertedRhoHessian {
328 inverse,
329 active_rank: n,
330 dropped_negative: 0,
331 dropped_small_positive: 0,
332 dropped_numerical_zero: 0,
333 min_eigenvalue: f64::NAN,
334 repaired_hessian: false,
335 eigenvalues: Array1::<f64>::zeros(0),
336 eigenvectors: Array2::<f64>::zeros((0, 0)),
337 classifications: Vec::new(),
338 });
339 }
340
341 let (eigenvalues, eigenvectors) = hessian_rho.eigh(faer::Side::Lower).ok()?;
342 if eigenvalues.iter().any(|v| !v.is_finite()) || !eigenvectors.iter().all(|v| v.is_finite()) {
343 return None;
344 }
345
346 let spectral_scale = eigenvalues
347 .iter()
348 .copied()
349 .map(f64::abs)
350 .fold(0.0_f64, f64::max)
351 .max(1.0);
352 let min_eigenvalue = eigenvalues.iter().copied().fold(f64::INFINITY, f64::min);
353 let neg_tol = 64.0 * f64::EPSILON * (n.max(1) as f64) * spectral_scale;
354 let tau = (spectral_scale * 1e-10).max(LAML_RIDGE);
355
356 let mut inverse = Array2::<f64>::zeros((n, n));
357 let mut classifications = Vec::with_capacity(n);
358 let mut active_rank = 0usize;
359 let mut dropped_negative = 0usize;
360 let mut dropped_small_positive = 0usize;
361 let mut dropped_numerical_zero = 0usize;
362
363 for i in 0..n {
364 let sigma = eigenvalues[i];
365 let class = if sigma > tau {
366 EigenClassification::Active
367 } else if sigma.abs() <= neg_tol {
368 EigenClassification::DroppedNumericalZero
369 } else if sigma > 0.0 {
370 // 0 < sigma <= tau and |sigma| > neg_tol: marginally positive but
371 // below the identifiability floor; 1/sigma would explode.
372 EigenClassification::DroppedSmallPositive
373 } else {
374 // sigma < -neg_tol: genuine negative curvature.
375 EigenClassification::DroppedNegative
376 };
377 classifications.push(class);
378 match class {
379 EigenClassification::Active => {
380 active_rank += 1;
381 let inv_lambda = 1.0 / sigma;
382 let v = eigenvectors.column(i);
383 for row in 0..n {
384 for col in 0..n {
385 inverse[[row, col]] += inv_lambda * v[row] * v[col];
386 }
387 }
388 }
389 EigenClassification::DroppedNegative => dropped_negative += 1,
390 EigenClassification::DroppedSmallPositive => dropped_small_positive += 1,
391 EigenClassification::DroppedNumericalZero => dropped_numerical_zero += 1,
392 }
393 }
394
395 Some(InvertedRhoHessian {
396 inverse,
397 active_rank,
398 dropped_negative,
399 dropped_small_positive,
400 dropped_numerical_zero,
401 min_eigenvalue,
402 repaired_hessian: active_rank < n,
403 eigenvalues,
404 eigenvectors,
405 classifications,
406 })
407}
408
409/// Cosine threshold above which two penalty matrices are treated as the
410/// structural-redundancy signature in [INDEF-HESS] diagnostics. Pairs with
411/// cosine above this AND a dominant-negative eigenvector concentrated on
412/// the pair's antisymmetric direction trigger the headline
413/// `structural_redundancy_detected` line.
414const INDEF_HESS_STRUCTURAL_REDUNDANCY_COS: f64 = 0.999;
415
416/// Penalty-count crossover at which the [INDEF-HESS] pair dump switches from
417/// the full O(k²) grid to top-3 pairs only. Bounds log volume on large-scale
418/// rho_dim while keeping the per-pair detail useful for small models.
419const INDEF_HESS_PAIR_DUMP_GRID_MAX_K: usize = 16;
420
421/// Number of top-cosine pairs to dump when `n_pen > INDEF_HESS_PAIR_DUMP_GRID_MAX_K`.
422const INDEF_HESS_PAIR_DUMP_TOP_N: usize = 3;
423
424/// Diagnostic emitted whenever the post-fit rho-Hessian has at least one
425/// non-identified direction (active_rank < n). Reports the eigendecomposition,
426/// the dominant-negative eigenvector, per-eigenpair classification, and pairwise
427/// penalty cosines tr(SᵢSⱼ)/√(tr(Sᵢ²)·tr(Sⱼ²)). A pair cosine ≈ 1.0 combined
428/// with the negative eigenvector concentrated on that pair's antisymmetric
429/// direction is the structural Z₂-saddle signature.
430///
431/// Output is capped: when the penalty count exceeds 16, only the top-3
432/// highest-cosine pairs are dumped instead of the full O(k²) grid. When the
433/// structural-redundancy signature is detected, a single headline line is
434/// emitted with the offending pair, cosine, and antisymmetric projection.
435fn dump_indefinite_rho_hessian_diagnostic(
436 hessian_rho: &Array2<f64>,
437 final_rho: &Array1<f64>,
438 canonical: &[gam_terms::construction::CanonicalPenalty],
439 inverted: Option<&InvertedRhoHessian>,
440) {
441 let k = hessian_rho.nrows();
442 if k == 0 {
443 return;
444 }
445
446 // Reuse the eigendecomposition already computed by the inverter when present
447 // (the slow path always populates it). Only recompute on the rare paths
448 // where the diagnostic is called without an `InvertedRhoHessian` (e.g. the
449 // eigendecomposition-failed bail in `compute_smoothing_correction`).
450 let (eigenvalues_owned, eigenvectors_owned);
451 let (eigenvalues_ref, eigenvectors_ref) = match inverted {
452 Some(inv) if !inv.eigenvalues.is_empty() && !inv.eigenvectors.is_empty() => {
453 (&inv.eigenvalues, &inv.eigenvectors)
454 }
455 _ => match hessian_rho.eigh(faer::Side::Lower) {
456 Ok((evals, evecs)) => {
457 eigenvalues_owned = evals;
458 eigenvectors_owned = evecs;
459 (&eigenvalues_owned, &eigenvectors_owned)
460 }
461 Err(err) => {
462 log::warn!("[INDEF-HESS] eigendecomposition failed: {err}");
463 return;
464 }
465 },
466 };
467
468 log::warn!("[INDEF-HESS] rho={:?}", final_rho.as_slice().unwrap_or(&[]),);
469 log::warn!(
470 "[INDEF-HESS] eigenvalues={:?}",
471 eigenvalues_ref.as_slice().unwrap_or(&[]),
472 );
473 if let Some(inv) = inverted {
474 log::warn!(
475 "[INDEF-HESS] active_rank={}/{} dropped_negative={} dropped_small_positive={} dropped_numerical_zero={}",
476 inv.active_rank,
477 k,
478 inv.dropped_negative,
479 inv.dropped_small_positive,
480 inv.dropped_numerical_zero,
481 );
482 if !inv.classifications.is_empty() {
483 let labels: Vec<&'static str> = inv
484 .classifications
485 .iter()
486 .map(|c| match c {
487 EigenClassification::Active => "A",
488 EigenClassification::DroppedNegative => "N",
489 EigenClassification::DroppedSmallPositive => "P",
490 EigenClassification::DroppedNumericalZero => "Z",
491 })
492 .collect();
493 log::warn!(
494 "[INDEF-HESS] classifications={:?} (A=active N=neg P=small_pos Z=numerical_zero)",
495 labels,
496 );
497 }
498 }
499
500 let mut neg_idx = 0usize;
501 let mut min_eig = f64::INFINITY;
502 for (i, &v) in eigenvalues_ref.iter().enumerate() {
503 if v < min_eig {
504 min_eig = v;
505 neg_idx = i;
506 }
507 }
508 let v_neg = eigenvectors_ref.column(neg_idx);
509 log::warn!(
510 "[INDEF-HESS] negative_eigenvalue={:.4e} eigenvector={:?}",
511 min_eig,
512 v_neg.as_slice().unwrap_or(&[]),
513 );
514
515 let n_pen = canonical.len();
516 let mut tr_aa = vec![0.0_f64; n_pen];
517 for i in 0..n_pen {
518 let local = &canonical[i].local;
519 let mut s = 0.0;
520 for r in 0..local.nrows() {
521 for c in 0..local.ncols() {
522 s += local[[r, c]] * local[[r, c]];
523 }
524 }
525 tr_aa[i] = s;
526 }
527 log::warn!(
528 "[INDEF-HESS] penalty_count={} ranges={:?} ranks={:?}",
529 n_pen,
530 (0..n_pen)
531 .map(|i| (canonical[i].col_range.start, canonical[i].col_range.end))
532 .collect::<Vec<_>>(),
533 (0..n_pen).map(|i| canonical[i].rank()).collect::<Vec<_>>(),
534 );
535
536 // Collect compatible pairs with their cosines.
537 struct PairCos {
538 i: usize,
539 j: usize,
540 cos: f64,
541 antisym_proj: f64,
542 }
543 let mut pairs: Vec<PairCos> = Vec::new();
544 for i in 0..n_pen {
545 for j in (i + 1)..n_pen {
546 let ci = &canonical[i];
547 let cj = &canonical[j];
548 if ci.col_range != cj.col_range {
549 continue;
550 }
551 let local_i = &ci.local;
552 let local_j = &cj.local;
553 let mut dot = 0.0;
554 for r in 0..local_i.nrows() {
555 for c in 0..local_i.ncols() {
556 dot += local_i[[r, c]] * local_j[[r, c]];
557 }
558 }
559 let cos = if tr_aa[i] > 0.0 && tr_aa[j] > 0.0 {
560 dot / (tr_aa[i].sqrt() * tr_aa[j].sqrt())
561 } else {
562 f64::NAN
563 };
564 let antisym_proj = if v_neg.len() == n_pen {
565 (v_neg[i] - v_neg[j]) / std::f64::consts::SQRT_2
566 } else {
567 f64::NAN
568 };
569 pairs.push(PairCos {
570 i,
571 j,
572 cos,
573 antisym_proj,
574 });
575 }
576 }
577
578 // Headline: structural redundancy detection. Pair cosine above the
579 // structural-redundancy threshold AND the dominant-negative eigenvector's
580 // top-2 absolute components on indices (i, j) with opposite signs.
581 if min_eig < 0.0 && v_neg.len() == n_pen {
582 for p in &pairs {
583 if !(p.cos > INDEF_HESS_STRUCTURAL_REDUNDANCY_COS) {
584 continue;
585 }
586 let mut indexed: Vec<(usize, f64)> = v_neg
587 .iter()
588 .enumerate()
589 .map(|(idx, &val)| (idx, val))
590 .collect();
591 indexed.sort_by(|a, b| {
592 b.1.abs()
593 .partial_cmp(&a.1.abs())
594 .unwrap_or(std::cmp::Ordering::Equal)
595 });
596 if indexed.len() < 2 {
597 continue;
598 }
599 let top0 = indexed[0].0;
600 let top1 = indexed[1].0;
601 let (a, b) = if top0 == p.i && top1 == p.j {
602 (indexed[0].1, indexed[1].1)
603 } else if top0 == p.j && top1 == p.i {
604 (indexed[1].1, indexed[0].1)
605 } else {
606 continue;
607 };
608 if a * b >= 0.0 {
609 continue;
610 }
611 log::warn!(
612 "[INDEF-HESS] structural_redundancy_detected pair=({},{}) cos={:.6} antisym_proj={:.4e}",
613 p.i,
614 p.j,
615 p.cos,
616 p.antisym_proj,
617 );
618 break;
619 }
620 }
621
622 // Cap output: dump the full grid when small, otherwise only the top-N
623 // highest-cosine pairs.
624 if n_pen <= INDEF_HESS_PAIR_DUMP_GRID_MAX_K {
625 for p in &pairs {
626 log::warn!(
627 "[INDEF-HESS] pair=({},{}) cos={:.6} tr_ii={:.4e} tr_jj={:.4e} v_neg[i]-v_neg[j]/sqrt2={:.4e}",
628 p.i,
629 p.j,
630 p.cos,
631 tr_aa[p.i],
632 tr_aa[p.j],
633 p.antisym_proj,
634 );
635 }
636 // Note: we no longer log a "ranges_differ" line per skipped pair to
637 // keep the diagnostic O(k). The headline pair already captures intent.
638 } else {
639 let mut top: Vec<&PairCos> = pairs.iter().filter(|p| p.cos.is_finite()).collect();
640 top.sort_by(|a, b| {
641 b.cos
642 .abs()
643 .partial_cmp(&a.cos.abs())
644 .unwrap_or(std::cmp::Ordering::Equal)
645 });
646 for p in top.iter().take(INDEF_HESS_PAIR_DUMP_TOP_N) {
647 log::warn!(
648 "[INDEF-HESS] top_pair=({},{}) cos={:.6} tr_ii={:.4e} tr_jj={:.4e} v_neg[i]-v_neg[j]/sqrt2={:.4e}",
649 p.i,
650 p.j,
651 p.cos,
652 tr_aa[p.i],
653 tr_aa[p.j],
654 p.antisym_proj,
655 );
656 }
657 }
658}
659
660pub(crate) fn compute_smoothing_correction(
661 reml_state: &RemlState<'_>,
662 final_rho: &Array1<f64>,
663 final_fit: &pirls::PirlsResult,
664) -> SmoothingCorrectionComputation {
665 use gam_linalg::faer_ndarray::{FaerCholesky, FaerEigh};
666
667 let n_rho = final_rho.len();
668 if n_rho == 0 {
669 return SmoothingCorrectionComputation {
670 correction: None,
671 hessian_rho: None,
672 rho_covariance: None,
673 active_rank: None,
674 };
675 }
676
677 let n_coeffs_trans = final_fit.beta_transformed.len();
678 let n_coeffs_orig = final_fit.reparam_result.qs.nrows();
679 let lambdas: Array1<f64> = final_rho.mapv(f64::exp);
680
681 // Step 1: Compute the Jacobian J = d(beta)/d(rho) in transformed space.
682 //
683 // Exact implicit-function identity at the inner optimum:
684 // dβ̂/dρ_k = -H^{-1}(S_k^ρ (β̂ - μ_k)), S_k^ρ = λ_k S_k,
685 // λ_k = exp(ρ_k).
686 //
687 // In transformed coordinates with root penalties S_k = R_kᵀR_k:
688 // S_k (β̂ - μ_k) = R_kᵀ(R_k (β̂ - μ_k)),
689 // so each Jacobian column is one linear solve with H.
690
691 // Use the same objective-consistent inner Hessian surface used by REML:
692 // - non-Firth: H = X'W_HX + S (+ stabilization if present)
693 // - Firth logit: H_total = H - d²Phi/dβ²
694 // Fallback to PIRLS stabilized Hessian only if bundle recovery fails.
695 //
696 // Conclusion:
697 // J[:,k] = dβ̂/dρ_k must use the Jacobian of the actual stationarity
698 // system G*(β,ρ)=0, i.e. H_total for Firth-adjusted fits. Using only
699 // X'W_HX+S here would be inconsistent with the fitted objective and would
700 // misstate smoothing-parameter uncertainty propagation.
701 let h_trans = reml_state
702 .objective_innerhessian(final_rho)
703 .unwrap_or_else(|_| final_fit.stabilizedhessian_transformed.to_dense());
704
705 // The IFT solve below feeds length-`n_coeffs_trans` right-hand sides into
706 // the Cholesky factor of `h_trans`, and faer asserts `rhs.len() == factor.n()`.
707 // A Hessian that does not match the coefficient dimension (e.g. a degenerate
708 // 0×0 placeholder from a geometry backend that failed to materialize a real
709 // dense inner Hessian) would otherwise abort the whole fit inside the solve.
710 // Bail to the no-correction branch exactly like the Cholesky-`Err` guard
711 // below, so the post-fit smoothing correction is simply skipped.
712 if h_trans.nrows() != n_coeffs_trans || h_trans.ncols() != n_coeffs_trans {
713 log::warn!(
714 "smoothing-correction inner Hessian shape {}x{} does not match coefficient dimension {}; skipping.",
715 h_trans.nrows(),
716 h_trans.ncols(),
717 n_coeffs_trans
718 );
719 return SmoothingCorrectionComputation {
720 correction: None,
721 hessian_rho: None,
722 rho_covariance: None,
723 active_rank: None,
724 };
725 }
726
727 // Factor the Hessian for solving
728 let h_chol = match h_trans.cholesky(faer::Side::Lower) {
729 Ok(c) => c,
730 Err(_) => {
731 log::warn!("Cholesky decomposition failed for smoothing correction; skipping.");
732 return SmoothingCorrectionComputation {
733 correction: None,
734 hessian_rho: None,
735 rho_covariance: None,
736 active_rank: None,
737 };
738 }
739 };
740
741 let beta_trans = final_fit.beta_transformed.as_ref();
742 let ct = &final_fit.reparam_result.canonical_transformed;
743
744 // Build the stationarity-gradient derivative matrix G_ρ where column k is
745 // ∂g(β,ρ)/∂ρ_k = λ_k S_k(β - μ_k), then delegate the IFT solve
746 // dβ/dρ = -H⁻¹G_ρ to the canonical evidence helper. This keeps the
747 // coefficient-space prediction correction and the joint-evidence
748 // Arrow-Schur path on the same hand-derived IFT identity.
749 let mut dg_drho_trans = Array2::<f64>::zeros((n_coeffs_trans, n_rho));
750 // Per-ρ_k support: the coefficient range its stationarity-gradient
751 // derivative ∂g/∂ρ_k is nonzero on. Each column is block-local (only the
752 // k-th penalty block), so this is exactly cp.col_range; structurally
753 // inactive columns keep an empty support and the cone-of-influence solve
754 // skips them entirely (their sensitivity is identically zero). See #779.
755 let mut col_supports: Vec<std::ops::Range<usize>> = vec![0..0; n_rho];
756 for k in 0..n_rho {
757 if k >= ct.len() {
758 continue;
759 }
760 let cp = &ct[k];
761 if cp.rank() == 0 {
762 continue;
763 }
764 // S_k(β - μ) — block-local: R^T (R (β[block] - μ)), embedded into p-vector.
765 let r = &cp.col_range;
766 col_supports[k] = r.start..r.end;
767 let beta_block = beta_trans.slice(s![r.start..r.end]);
768 let centered = &beta_block - &cp.prior_mean;
769 let r_beta = cp.root.dot(¢ered);
770 for a in 0..cp.block_dim() {
771 dg_drho_trans[[r.start + a, k]] = lambdas[k]
772 * (0..cp.rank())
773 .map(|row| cp.root[[row, a]] * r_beta[row])
774 .sum::<f64>();
775 }
776 }
777 // Lazy/local cone-of-influence propagation (#779): confine each column's
778 // sensitivity to the coupling component of `h_trans` containing the moved
779 // penalty block, and skip structurally inactive columns. Exact on a
780 // block-decoupled Hessian (entries outside the cone are identically zero)
781 // and identical to the full joint solve on a fully coupled Hessian.
782 let jacobian_trans = match crate::sensitivity::FitSensitivity::from_faer_cholesky(
783 &h_chol,
784 n_coeffs_trans,
785 )
786 .mode_response_coned(h_trans.view(), dg_drho_trans.view(), &col_supports)
787 {
788 Some(jacobian) => jacobian,
789 None => {
790 log::warn!("IFT beta-rho sensitivity solve failed for smoothing correction; skipping.");
791 return SmoothingCorrectionComputation {
792 correction: None,
793 hessian_rho: None,
794 rho_covariance: None,
795 active_rank: None,
796 };
797 }
798 };
799
800 // Step 2: Build V_rho by inverting the LAML Hessian in rho-space.
801 // The authoritative inner-strategy path chooses the rho-space Hessian
802 // evaluation policy here. Unified may still perform local numerical
803 // salvage inside the exact branch, but the branch choice itself no longer
804 // lives inline at the call site.
805 let mut hessian_rho = match reml_state.compute_lamlhessian_consistent(final_rho) {
806 Ok(h) => h,
807 Err(err) => {
808 log::warn!(
809 "LAML Hessian unavailable ({}); skipping smoothing correction.",
810 err
811 );
812 return SmoothingCorrectionComputation {
813 correction: None,
814 hessian_rho: None,
815 rho_covariance: None,
816 active_rank: None,
817 };
818 }
819 };
820
821 // Symmetrize the Hessian
822 gam_linalg::matrix::symmetrize_in_place(&mut hessian_rho);
823
824 // Step 3: Invert Hessian to get V_rho.
825 // Add a small ridge before factorization to regularize weakly identified ρ directions.
826 add_relative_diag_ridge(&mut hessian_rho, LAML_RIDGE, LAML_RIDGE);
827
828 let inverted = match invert_regularized_rho_hessian(&hessian_rho) {
829 Some(inv) => inv,
830 None => {
831 log::warn!(
832 "Eigendecomposition of LAML rho Hessian failed for smoothing correction; skipping."
833 );
834 dump_indefinite_rho_hessian_diagnostic(
835 &hessian_rho,
836 final_rho,
837 &final_fit.reparam_result.canonical_transformed,
838 None,
839 );
840 return SmoothingCorrectionComputation {
841 correction: None,
842 hessian_rho: Some(hessian_rho),
843 rho_covariance: None,
844 active_rank: None,
845 };
846 }
847 };
848
849 let n_rho_total = hessian_rho.nrows();
850 if inverted.active_rank == 0 {
851 // All directions non-identified. Pseudo-inverse is zero, so J·V_ρ·J^T
852 // adds nothing; report no correction (consistent with the prior behavior
853 // for fully indefinite Hessians, but now logged with full context).
854 log::info!(
855 "LAML rho Hessian has no identified directions (active_rank=0/{}, dropped_negative={}, dropped_small_positive={}, dropped_numerical_zero={}, min_eig={:.3e}); smoothing correction is zero, skipping.",
856 n_rho_total,
857 inverted.dropped_negative,
858 inverted.dropped_small_positive,
859 inverted.dropped_numerical_zero,
860 inverted.min_eigenvalue,
861 );
862 dump_indefinite_rho_hessian_diagnostic(
863 &hessian_rho,
864 final_rho,
865 &final_fit.reparam_result.canonical_transformed,
866 Some(&inverted),
867 );
868 return SmoothingCorrectionComputation {
869 correction: None,
870 hessian_rho: Some(hessian_rho),
871 rho_covariance: Some(inverted.inverse),
872 active_rank: Some(0),
873 };
874 }
875
876 if inverted.active_rank < n_rho_total {
877 log::info!(
878 "LAML rho Hessian is rank-deficient on the identified subspace (active_rank={}/{}, dropped_negative={}, dropped_small_positive={}, dropped_numerical_zero={}, min_eig={:.3e}); proceeding with pseudo-inverse V_ρ.",
879 inverted.active_rank,
880 n_rho_total,
881 inverted.dropped_negative,
882 inverted.dropped_small_positive,
883 inverted.dropped_numerical_zero,
884 inverted.min_eigenvalue,
885 );
886 dump_indefinite_rho_hessian_diagnostic(
887 &hessian_rho,
888 final_rho,
889 &final_fit.reparam_result.canonical_transformed,
890 Some(&inverted),
891 );
892 }
893
894 let repaired_hessian = inverted.repaired_hessian;
895 let active_rank_used = inverted.active_rank;
896 let v_rho = inverted.inverse;
897 let rho_covariance = v_rho.clone();
898 if repaired_hessian {
899 log::debug!(
900 "Applied rank-deficient pseudo-inverse on identified rho-Hessian subspace before smoothing correction."
901 );
902 }
903
904 // Step 4: Compute V_corr = J * V_rho * J^T in transformed space.
905 //
906 // This is the first-order smoothing-parameter uncertainty inflation:
907 // Var(β̂) ≈ Var(β̂|ρ̂) + (dβ̂/dρ) Var(ρ̂) (dβ̂/dρ)ᵀ.
908 //
909 // Here:
910 // J = dβ̂/dρ, J[:,k] = -H^{-1}(A_k β̂),
911 // V_ρ = (∇²_{ρρ}V)^{-1} evaluated at the final ρ.
912 let jv_rho = jacobian_trans.dot(&v_rho); // (n_coeffs_trans x n_rho)
913 let v_corr_trans = jv_rho.dot(&jacobian_trans.t()); // (n_coeffs_trans x n_coeffs_trans)
914
915 // Step 5: Transform back to original coefficient basis:
916 // V_corr_orig = Qs * V_corr_trans * Qs^T
917 let qs = &final_fit.reparam_result.qs;
918 let qsv = qs.dot(&v_corr_trans);
919 let mut v_corr_orig = qsv.dot(&qs.t());
920 // The congruence Qs·M·Qsᵀ is symmetric in exact arithmetic, but the two
921 // matrix products fill v[i,j] and v[j,i] via independent dot-products,
922 // leaving O(ε) asymmetry. This is a genuine covariance (added to the base
923 // Vb to form Vp, and consumed by model_comparison's corrected-EDF trace,
924 // which documents a "both symmetric" invariant it then relies on), so we
925 // symmetrize it like every other covariance-assembly site — unlike the
926 // influence matrix F = H⁻¹X'WX, which is deliberately left asymmetric.
927 gam_linalg::matrix::symmetrize_in_place(&mut v_corr_orig);
928
929 // Validate the result
930 if !v_corr_orig.iter().all(|v| v.is_finite()) {
931 log::warn!("Non-finite values in smoothing correction matrix; skipping.");
932 return SmoothingCorrectionComputation {
933 correction: None,
934 hessian_rho: Some(hessian_rho),
935 rho_covariance: Some(rho_covariance),
936 active_rank: Some(active_rank_used),
937 };
938 }
939
940 // Ensure positive semi-definiteness without hiding a failed rho-space
941 // optimum. The smoothing correction V_corr = J·V_ρ·Jᵀ is an SE *inflation*
942 // (Var(β̂|ρ̂) + uncertainty-in-ρ̂). It is PSD *by construction*: V_ρ is the
943 // (active-subspace) inverse of the SPD ρ-Hessian, so the congruence
944 // J·V_ρ·Jᵀ cannot have genuine negative curvature. Crucially it is also
945 // rank-deficient — its rank is at most `n_rho`, while it lives in the
946 // `n_coeffs_orig`-dimensional coefficient space — so for every model with
947 // fewer smoothing parameters than coefficients (i.e. essentially all of
948 // them) it has exact-zero eigenvalues. A symmetric eigensolver renders
949 // those exact zeros as ±O(ε·‖V_corr‖), so the smallest eigenvalue is
950 // routinely a *sub-tolerance negative* that is pure floating-point
951 // roundoff, not curvature. Rejecting the whole correction on such a
952 // roundoff zero silently dropped Vp for the entire #smooth < #coef regime
953 // (every predict() interval lost its smoothing-parameter inflation).
954 //
955 // So we only treat a *material* negative (below the eigensolver roundoff
956 // floor `neg_tol`) as a real failure — that signals a corrupted V_ρ
957 // (near-saddle / pinv-imputed direction) and we skip loudly, letting the
958 // caller fall back to the honest base covariance. Sub-tolerance negatives
959 // are accepted as-is: the matrix is PSD to roundoff, and it is added to the
960 // (dominant) base covariance Vb, which keeps Vp PSD without any relabelling
961 // or clamping.
962 match v_corr_orig.eigh(faer::Side::Lower) {
963 // Eigenvectors are unused: only the smallest eigenvalue's magnitude
964 // distinguishes roundoff from genuine indefiniteness.
965 Ok((eigenvalues, _)) => {
966 let min_eig = eigenvalues.iter().fold(f64::INFINITY, |a, &b| a.min(b));
967 let spectral_scale = eigenvalues
968 .iter()
969 .copied()
970 .map(f64::abs)
971 .fold(0.0_f64, f64::max)
972 .max(1.0);
973 let neg_tol = 64.0 * f64::EPSILON * (n_coeffs_orig.max(1) as f64) * spectral_scale;
974 if min_eig < -neg_tol {
975 log::warn!(
976 "Smoothing correction has material negative eigenvalue {:.3e} \
977 below tolerance {:.3e}; skipping correction.",
978 min_eig,
979 neg_tol
980 );
981 return SmoothingCorrectionComputation {
982 correction: None,
983 hessian_rho: Some(hessian_rho),
984 rho_covariance: Some(rho_covariance),
985 active_rank: Some(active_rank_used),
986 };
987 }
988 }
989 Err(_) => {
990 log::warn!("Eigendecomposition failed for smoothing correction validation.");
991 }
992 }
993 SmoothingCorrectionComputation {
994 correction: Some(v_corr_orig),
995 hessian_rho: Some(hessian_rho),
996 rho_covariance: Some(rho_covariance),
997 active_rank: Some(active_rank_used),
998 }
999}