gam_solve/estimate/
smoothing_correction.rs

1use super::*;
2
3/// Default inner P-IRLS tolerance floor.
4///
5/// The inner Newton iteration certifies the coefficient mode against this
6/// (scale-aware) tolerance independently of the outer REML tolerance. Coupling
7/// the two collapses two unrelated convergence concepts: when a user dials the
8/// outer tolerance up to e.g. 1e-3 to make the smoothing-parameter search
9/// coarser, the inner solve becomes coarse too, returning betas whose
10/// stationarity residual is ~1e-3·scale rather than the floating-point noise
11/// floor. Outer derivatives then read those imprecise betas as if they were
12/// the true mode and accumulate error. Keeping the inner floor at 1e-6 lets
13/// the outer loop relax without contaminating the coefficient certificate.
14pub(crate) const PIRLS_INNER_TOLERANCE_FLOOR: f64 = 1e-6;
15
16#[derive(Clone)]
17pub(crate) struct RemlConfig {
18    pub(crate) likelihood: GlmLikelihoodSpec,
19    pub(crate) link_kind: InverseLink,
20    pub(crate) pirls_convergence_tolerance: f64,
21    pub(crate) max_iterations: usize,
22    pub(crate) reml_convergence_tolerance: f64,
23    pub(crate) firth_bias_reduction: bool,
24    /// Forwarded to `pirls::PirlsConfig::geodesic_acceleration`. Off by default.
25    pub(crate) geodesic_acceleration: bool,
26}
27
28impl RemlConfig {
29    pub(crate) fn external(
30        likelihood: GlmLikelihoodSpec,
31        reml_tol: f64,
32        firth_bias_reduction: bool,
33    ) -> Self {
34        // Inner P-IRLS certifies the coefficient mode against
35        // `pirls_convergence_tolerance`; the outer REML iteration certifies
36        // the smoothing-parameter optimum against `reml_convergence_tolerance`.
37        // These are different concepts and must not be coupled. The inner
38        // tolerance is at most the outer tolerance (so a user who *tightens*
39        // the outer also tightens the inner), but never coarser than the
40        // floor — a coarse outer must not silently pollute the inner mode.
41        let pirls_tol = reml_tol.min(PIRLS_INNER_TOLERANCE_FLOOR);
42        let link_kind = likelihood.spec.link.clone();
43        Self {
44            likelihood,
45            link_kind,
46            pirls_convergence_tolerance: pirls_tol,
47            max_iterations: 0,
48            reml_convergence_tolerance: reml_tol,
49            firth_bias_reduction,
50            geodesic_acceleration: false,
51        }
52        .with_max_iterations(300)
53    }
54
55    pub(crate) fn with_max_iterations(mut self, max_iterations: usize) -> Self {
56        self.max_iterations = max_iterations;
57        self
58    }
59
60    pub(crate) fn link_function(&self) -> LinkFunction {
61        self.link_kind.link_function()
62    }
63
64    pub(crate) fn as_pirls_config(&self) -> pirls::PirlsConfig {
65        pirls::PirlsConfig {
66            likelihood: self.likelihood.clone(),
67            link_kind: self.link_kind.clone(),
68            max_iterations: self.max_iterations,
69            convergence_tolerance: self.pirls_convergence_tolerance,
70            firth_bias_reduction: self.firth_bias_reduction,
71            // Caller (the REML runtime) populates this hint just before
72            // each `execute_pirls_if_needed` call from the cached final
73            // λ of the previous successful PIRLS solve.
74            initial_lm_lambda: None,
75            geodesic_acceleration: self.geodesic_acceleration,
76            // Arrow-Schur structured-inner-solve descriptor. Not used by
77            // the standard REML→PIRLS path (β-only); set by the latent
78            // driver (`crate::latent_inner::LatentInnerSolver`)
79            // which assembles the per-row (t, β) bordered system
80            // externally. Default `None` preserves back-compat.
81            arrow_schur: None,
82        }
83    }
84}
85pub(crate) const MAX_FACTORIZATION_ATTEMPTS: usize = 4;
86
87/// Small ridge added to the rho-space LAML Hessian before inversion, for
88/// numerical stability when smoothing parameters are weakly identified.
89///
90/// **Stabilization semantics:** this ridge is a
91/// [`gam_problem::StabilizationKind::NumericalPerturbation`] (not an
92/// `ExplicitPrior`). It enters only the inverse used to build `V_rho` for
93/// the smoothing-correction propagation step. It does NOT enter the LAML
94/// objective, its gradient, the saved coefficients, or any user-visible
95/// summary — the rho-Hessian itself is recomputed from first principles
96/// in every place that consults it. Classified as
97/// [`gam_problem::StabilizationKind::NumericalPerturbation`]; no ledger
98/// record is emitted at this site because the perturbation never escapes the
99/// local `V_rho` inverse (it touches no saved coefficient, objective, or
100/// user-visible summary).
101const LAML_RIDGE: f64 = 1e-8;
102/// Minimum penalized-deviance floor, expressed as a fraction of the
103/// problem's own deviance scale (the weighted null deviance `D₀`, see
104/// [`smooth_floor_dp`]). The floor exists only to keep the profiled
105/// dispersion `φ̂ = D_p/(n−M_p)` strictly positive when a smooth fits the
106/// data essentially perfectly (`D_p ↓ 0`), so it must trigger on the
107/// *relative* smallness `D_p/D₀`, never on an absolute magnitude — an
108/// absolute floor silently breaks the exact scale-equivariance of the
109/// Gaussian REML fit under a response rescale `y → a·y` (#1127).
110pub(crate) const DP_FLOOR: f64 = 1e-12;
111/// Width of the smooth transition region for the deviance floor, also as a
112/// fraction of the deviance scale `D₀`.
113const DP_FLOOR_SMOOTH_WIDTH: f64 = 1e-8;
114
115// Unified rho bound corresponding to lambda in [exp(-RHO_BOUND), exp(RHO_BOUND)].
116// Additional headroom reduces frequent contact with the hard box constraints.
117pub const RHO_BOUND: f64 = 30.0;
118// Soft interior prior on rho near the box boundaries.
119pub(crate) const RHO_SOFT_PRIOR_WEIGHT: f64 = 1e-6;
120pub(crate) const RHO_SOFT_PRIOR_SHARPNESS: f64 = 4.0;
121// Adaptive cubature guardrails for bounded correction latency.
122pub(crate) const AUTO_CUBATURE_MAX_RHO_DIM: usize = 12;
123pub(crate) const AUTO_CUBATURE_MAX_EIGENVECTORS: usize = 4;
124pub(crate) const AUTO_CUBATURE_TARGET_VAR_FRAC: f64 = 0.95;
125pub(crate) const AUTO_CUBATURE_MAX_BETA_DIM: usize = 1600;
126pub(crate) const AUTO_CUBATURE_BOUNDARY_MARGIN: f64 = 2.0;
127
128/// Smooth, differentiable approximation of `max(dp, floor)` where the floor
129/// and the width of the smoothing band are taken **relative to the supplied
130/// deviance `scale`** (the weighted null deviance `D₀` of the response).
131///
132/// Returns the smoothed value, first derivative, and second derivative with
133/// respect to `dp`.
134///
135/// # Why the floor must be relative (issue #1127)
136///
137/// The penalized deviance `D_p = Σ wᵢ(yᵢ−μ̂ᵢ)² + β̂ᵀSβ̂` is exactly quadratic
138/// in the response, so under a multiplicative rescale `y → a·y` it scales as
139/// `D_p → a²·D_p`. The profiled Gaussian REML criterion depends on `D_p` only
140/// through `log D_p` (the `(ν/2)·log(2πφ̂)` term, `φ̂ = D_p/ν`), so the rescale
141/// shifts the cost by the *additive constant* `ν·log a` and leaves the
142/// ρ-gradient — hence the selected `λ̂`, the EDF, and `ŝ(x)/a` — exactly
143/// invariant. An **absolute** floor destroys this: when `a` is small enough
144/// that `D_p` enters the fixed band (e.g. `D_p ≈ 3.6e-11` at `a = 1e-6` with
145/// a band of width `1e-8`), `dp_c` is spuriously inflated toward the absolute
146/// floor, `log dp_c` stops tracking `2·log a + const`, and the optimizer
147/// converges at an over-smoothed `λ̂` — reshaping, not merely rescaling, the
148/// smooth. Scaling both the floor and its width by `D₀ ∝ a²` makes the band a
149/// fixed *fraction* of the deviance, so `smooth_floor_dp(a²·dp, a²·D₀) =
150/// a²·smooth_floor_dp(dp, D₀)` exactly and equivariance is restored.
151///
152/// `scale = 1.0` recovers the historical absolute floor byte-for-byte, which
153/// is the correct default for callers without a Gaussian response scale in
154/// hand (the floor is consumed only on the profiled-Gaussian path).
155pub(crate) fn smooth_floor_dp(dp: f64, scale: f64) -> (f64, f64, f64) {
156    let scale = if scale.is_finite() && scale > 0.0 {
157        scale
158    } else {
159        1.0
160    };
161    let floor = DP_FLOOR * scale;
162    let tau = (DP_FLOOR_SMOOTH_WIDTH * scale).max(f64::MIN_POSITIVE);
163    let scaled = (dp - floor) / tau;
164
165    let softplus = if scaled > 20.0 {
166        scaled + (-scaled).exp()
167    } else if scaled < -20.0 {
168        scaled.exp()
169    } else {
170        (1.0 + scaled.exp()).ln()
171    };
172
173    let sigma = if scaled >= 0.0 {
174        let exp_neg = (-scaled).exp();
175        1.0 / (1.0 + exp_neg)
176    } else {
177        let exp_pos = scaled.exp();
178        exp_pos / (1.0 + exp_pos)
179    };
180
181    let dp_c = floor + tau * softplus;
182    let dp_cgrad2 = sigma * (1.0 - sigma) / tau;
183    (dp_c, sigma, dp_cgrad2)
184}
185
186/// Compute the smoothing parameter uncertainty correction matrix `V_corr = J * V_rho * J^T`.
187///
188/// This implements the Wood et al. (2016) correction for smoothing parameter uncertainty.
189/// The corrected covariance for `beta` is: `V*_beta = V_beta + J * V_rho * J^T`.
190/// where:
191/// - `V_beta = H^{-1}` (conditional covariance treating `lambda` as fixed)
192/// - `J = d(beta)/d(rho)` (Jacobian wrt log-smoothing parameters)
193/// - `V_rho = (d^2 LAML / d rho^2)^{-1}` (outer covariance)
194///
195/// Returns the correction matrix in the ORIGINAL coefficient basis.
196///
197/// Full correction reference.
198/// Let `rho ~ N(mu, Sigma)` with `mu = rho_hat`, `Sigma = V_rho`,
199/// and define:
200/// - `A(rho) = H_rho^{-1}`
201/// - `b(rho) = beta_hat_rho`
202///
203/// The exact Gaussian-mixture identity is:
204///   `Var(beta) = E[A(rho)] + Var(b(rho))`.
205///
206/// Around `mu`, this routine keeps the first-order terms:
207///
208///   `E[A(rho)]      ~= A(mu) = Hmu^{-1}`
209///   `Var(b(rho))    ~= J Sigma J^T`
210///   `Var(beta)      ~= Hmu^{-1} + J V_rho J^T`.
211///
212/// Equivalent first-order propagation around the outer optimum `rho*`:
213///
214///   `Var(beta_hat) ~= Var(beta_hat | rho_hat) + (d beta_hat / d rho) Var(rho_hat) (d beta_hat / d rho)^T`
215///                  `= V_beta + J V_rho J^T`.
216///
217/// Components:
218///   `J[:,k] = d(beta_hat)/d(rho_k) = -H^{-1}(A_k beta_hat),  A_k = exp(rho_k) S_k`
219///   `V_rho  = (d^2 V / d rho^2 at rho*)^{-1}`
220///
221/// Exact non-Gaussian V_ρ^{-1} requires the full Hessian with:
222///   - tr(H^{-1}H_{kℓ})
223///   - tr(H^{-1}H_k H^{-1}H_ℓ)
224///   - pseudo-det second derivatives in S
225///   - and H_{kℓ} terms containing fourth-likelihood derivatives.
226///
227/// This routine obtains V_ρ^{-1} from the analytic rho-space Hessian selected
228/// by `compute_lamlhessian_consistent`, then regularizes before inversion.
229/// If that analytic Hessian is unavailable, the correction is skipped rather
230/// than synthesized numerically.
231///
232/// Notes on omitted higher-order terms:
233/// - The exact `E[A(rho)]` and `Var(b(rho))` can be written with the Gaussian
234///   smoothing/heat operator `exp(0.5 * Delta_Sigma)` (equivalently Wick/Isserlis
235///   contractions of high-order derivatives).
236/// - Those infinite-series corrections are not expanded in this routine.
237pub(crate) struct SmoothingCorrectionComputation {
238    pub correction: Option<Array2<f64>>,
239    pub hessian_rho: Option<Array2<f64>>,
240    /// Regularized inverse outer Hessian `Cov(rho_hat)` in the same rho ordering
241    /// as the fitted smoothing-parameter vector. This exposes the #740 quantity
242    /// to LR Bartlett inference without changing the production algebra that
243    /// computes it.
244    pub rho_covariance: Option<Array2<f64>>,
245    /// Identified-subspace rank of the rho-Hessian inverse used to build
246    /// `correction`. `Some(n)` if the matrix was SPD and fully inverted;
247    /// `Some(r)` with `r < n` if the pseudo-inverse dropped non-identified
248    /// directions; `None` when no inversion was attempted or it failed before
249    /// producing a usable V_ρ. Downstream consumers (e.g. auto-cubature)
250    /// use this to decide whether higher-order corrections are even
251    /// meaningful — they aren't when V_ρ is rank-deficient.
252    pub active_rank: Option<usize>,
253}
254
255/// Result of pseudo-inverting the rho-space LAML Hessian on the identified subspace.
256///
257/// When the outer rho-Hessian has negative or near-zero eigenvalues at convergence
258/// (genuine non-convexity, Z₂-saddles from redundant penalty blocks, or weakly
259/// identified ρ directions on no-signal data), inverting it naively would either
260/// fail or yield an arbitrarily large V_ρ along those directions. Instead we
261/// split the spectrum into an identified subspace (eigenvalues strictly above an
262/// identifiability floor) and a non-identified subspace (negative, numerical zero,
263/// or marginally positive but below the floor). The returned `inverse` is the
264/// rank-deficient pseudo-inverse: 1/σ on the identified directions, 0 on the rest.
265/// `J · V_ρ · J^T` is then a valid rank-deficient inflation along well-identified
266/// rho directions, with zero contribution from non-identified directions.
267pub(crate) struct InvertedRhoHessian {
268    /// Pseudo-inverse projected onto the identified subspace.
269    pub inverse: Array2<f64>,
270    /// Number of eigenpairs retained (σ_i > tau).
271    pub active_rank: usize,
272    /// Eigenpairs dropped for σ_i < −neg_tol (genuine negative curvature).
273    pub dropped_negative: usize,
274    /// Eigenpairs dropped for marginally positive σ_i in (neg_tol, tau].
275    pub dropped_small_positive: usize,
276    /// Eigenpairs dropped for |σ_i| ≤ neg_tol (numerical zero).
277    pub dropped_numerical_zero: usize,
278    /// Smallest eigenvalue (signed) of the input Hessian.
279    pub min_eigenvalue: f64,
280    /// True whenever active_rank < n (i.e. anything was dropped). Cholesky fast
281    /// path always returns false.
282    pub repaired_hessian: bool,
283    /// Per-eigenvalue classification (length n), aligned with the input matrix's
284    /// eigendecomposition order from `eigh`. Used by the [INDEF-HESS] diagnostic.
285    /// Empty on the Cholesky fast path (matrix was SPD, no classification needed).
286    pub eigenvalues: Array1<f64>,
287    /// Eigenvectors as columns, aligned with `eigenvalues`. Empty on the Cholesky
288    /// fast path. Carrying these here eliminates a second `eigh` call in the
289    /// `[INDEF-HESS]` diagnostic — the slow path computes them once and shares.
290    pub eigenvectors: Array2<f64>,
291    /// Per-eigenvalue classification labels parallel to `eigenvalues`.
292    pub classifications: Vec<EigenClassification>,
293}
294
295#[derive(Debug, Clone, Copy, PartialEq, Eq)]
296pub(crate) enum EigenClassification {
297    Active,
298    DroppedNegative,
299    DroppedSmallPositive,
300    DroppedNumericalZero,
301}
302
303/// Invert the rho-space LAML Hessian onto the identified subspace.
304///
305/// Fast path: when the matrix is strictly positive-definite, returns the full
306/// Cholesky inverse with `active_rank = n` and `repaired_hessian = false`.
307///
308/// Slow path: eigendecompose, classify each eigenpair, and assemble the
309/// rank-deficient pseudo-inverse. Returns `None` only when the eigendecomposition
310/// itself fails (non-finite eigenvalues or eigenvectors). An all-bad spectrum
311/// (active_rank == 0) still returns `Some`; the caller is responsible for
312/// deciding whether to use a zero-rank covariance.
313pub(crate) fn invert_regularized_rho_hessian(
314    hessian_rho: &Array2<f64>,
315) -> Option<InvertedRhoHessian> {
316    let n = hessian_rho.nrows();
317    if let Ok(chol) = hessian_rho.cholesky(faer::Side::Lower) {
318        let mut inverse = Array2::<f64>::eye(n);
319        for col in 0..n {
320            let colvec = inverse.column(col).to_owned();
321            let solved = chol.solvevec(&colvec);
322            inverse.column_mut(col).assign(&solved);
323        }
324        // Spectral scale / min eigenvalue are not needed when Cholesky succeeds,
325        // but we surface coherent placeholders so downstream consumers can rely
326        // on the struct fields unconditionally.
327        return Some(InvertedRhoHessian {
328            inverse,
329            active_rank: n,
330            dropped_negative: 0,
331            dropped_small_positive: 0,
332            dropped_numerical_zero: 0,
333            min_eigenvalue: f64::NAN,
334            repaired_hessian: false,
335            eigenvalues: Array1::<f64>::zeros(0),
336            eigenvectors: Array2::<f64>::zeros((0, 0)),
337            classifications: Vec::new(),
338        });
339    }
340
341    let (eigenvalues, eigenvectors) = hessian_rho.eigh(faer::Side::Lower).ok()?;
342    if eigenvalues.iter().any(|v| !v.is_finite()) || !eigenvectors.iter().all(|v| v.is_finite()) {
343        return None;
344    }
345
346    let spectral_scale = eigenvalues
347        .iter()
348        .copied()
349        .map(f64::abs)
350        .fold(0.0_f64, f64::max)
351        .max(1.0);
352    let min_eigenvalue = eigenvalues.iter().copied().fold(f64::INFINITY, f64::min);
353    let neg_tol = 64.0 * f64::EPSILON * (n.max(1) as f64) * spectral_scale;
354    let tau = (spectral_scale * 1e-10).max(LAML_RIDGE);
355
356    let mut inverse = Array2::<f64>::zeros((n, n));
357    let mut classifications = Vec::with_capacity(n);
358    let mut active_rank = 0usize;
359    let mut dropped_negative = 0usize;
360    let mut dropped_small_positive = 0usize;
361    let mut dropped_numerical_zero = 0usize;
362
363    for i in 0..n {
364        let sigma = eigenvalues[i];
365        let class = if sigma > tau {
366            EigenClassification::Active
367        } else if sigma.abs() <= neg_tol {
368            EigenClassification::DroppedNumericalZero
369        } else if sigma > 0.0 {
370            // 0 < sigma <= tau and |sigma| > neg_tol: marginally positive but
371            // below the identifiability floor; 1/sigma would explode.
372            EigenClassification::DroppedSmallPositive
373        } else {
374            // sigma < -neg_tol: genuine negative curvature.
375            EigenClassification::DroppedNegative
376        };
377        classifications.push(class);
378        match class {
379            EigenClassification::Active => {
380                active_rank += 1;
381                let inv_lambda = 1.0 / sigma;
382                let v = eigenvectors.column(i);
383                for row in 0..n {
384                    for col in 0..n {
385                        inverse[[row, col]] += inv_lambda * v[row] * v[col];
386                    }
387                }
388            }
389            EigenClassification::DroppedNegative => dropped_negative += 1,
390            EigenClassification::DroppedSmallPositive => dropped_small_positive += 1,
391            EigenClassification::DroppedNumericalZero => dropped_numerical_zero += 1,
392        }
393    }
394
395    Some(InvertedRhoHessian {
396        inverse,
397        active_rank,
398        dropped_negative,
399        dropped_small_positive,
400        dropped_numerical_zero,
401        min_eigenvalue,
402        repaired_hessian: active_rank < n,
403        eigenvalues,
404        eigenvectors,
405        classifications,
406    })
407}
408
409/// Cosine threshold above which two penalty matrices are treated as the
410/// structural-redundancy signature in [INDEF-HESS] diagnostics. Pairs with
411/// cosine above this AND a dominant-negative eigenvector concentrated on
412/// the pair's antisymmetric direction trigger the headline
413/// `structural_redundancy_detected` line.
414const INDEF_HESS_STRUCTURAL_REDUNDANCY_COS: f64 = 0.999;
415
416/// Penalty-count crossover at which the [INDEF-HESS] pair dump switches from
417/// the full O(k²) grid to top-3 pairs only. Bounds log volume on large-scale
418/// rho_dim while keeping the per-pair detail useful for small models.
419const INDEF_HESS_PAIR_DUMP_GRID_MAX_K: usize = 16;
420
421/// Number of top-cosine pairs to dump when `n_pen > INDEF_HESS_PAIR_DUMP_GRID_MAX_K`.
422const INDEF_HESS_PAIR_DUMP_TOP_N: usize = 3;
423
424/// Diagnostic emitted whenever the post-fit rho-Hessian has at least one
425/// non-identified direction (active_rank < n). Reports the eigendecomposition,
426/// the dominant-negative eigenvector, per-eigenpair classification, and pairwise
427/// penalty cosines tr(SᵢSⱼ)/√(tr(Sᵢ²)·tr(Sⱼ²)). A pair cosine ≈ 1.0 combined
428/// with the negative eigenvector concentrated on that pair's antisymmetric
429/// direction is the structural Z₂-saddle signature.
430///
431/// Output is capped: when the penalty count exceeds 16, only the top-3
432/// highest-cosine pairs are dumped instead of the full O(k²) grid. When the
433/// structural-redundancy signature is detected, a single headline line is
434/// emitted with the offending pair, cosine, and antisymmetric projection.
435fn dump_indefinite_rho_hessian_diagnostic(
436    hessian_rho: &Array2<f64>,
437    final_rho: &Array1<f64>,
438    canonical: &[gam_terms::construction::CanonicalPenalty],
439    inverted: Option<&InvertedRhoHessian>,
440) {
441    let k = hessian_rho.nrows();
442    if k == 0 {
443        return;
444    }
445
446    // Reuse the eigendecomposition already computed by the inverter when present
447    // (the slow path always populates it). Only recompute on the rare paths
448    // where the diagnostic is called without an `InvertedRhoHessian` (e.g. the
449    // eigendecomposition-failed bail in `compute_smoothing_correction`).
450    let (eigenvalues_owned, eigenvectors_owned);
451    let (eigenvalues_ref, eigenvectors_ref) = match inverted {
452        Some(inv) if !inv.eigenvalues.is_empty() && !inv.eigenvectors.is_empty() => {
453            (&inv.eigenvalues, &inv.eigenvectors)
454        }
455        _ => match hessian_rho.eigh(faer::Side::Lower) {
456            Ok((evals, evecs)) => {
457                eigenvalues_owned = evals;
458                eigenvectors_owned = evecs;
459                (&eigenvalues_owned, &eigenvectors_owned)
460            }
461            Err(err) => {
462                log::warn!("[INDEF-HESS] eigendecomposition failed: {err}");
463                return;
464            }
465        },
466    };
467
468    log::warn!("[INDEF-HESS] rho={:?}", final_rho.as_slice().unwrap_or(&[]),);
469    log::warn!(
470        "[INDEF-HESS] eigenvalues={:?}",
471        eigenvalues_ref.as_slice().unwrap_or(&[]),
472    );
473    if let Some(inv) = inverted {
474        log::warn!(
475            "[INDEF-HESS] active_rank={}/{} dropped_negative={} dropped_small_positive={} dropped_numerical_zero={}",
476            inv.active_rank,
477            k,
478            inv.dropped_negative,
479            inv.dropped_small_positive,
480            inv.dropped_numerical_zero,
481        );
482        if !inv.classifications.is_empty() {
483            let labels: Vec<&'static str> = inv
484                .classifications
485                .iter()
486                .map(|c| match c {
487                    EigenClassification::Active => "A",
488                    EigenClassification::DroppedNegative => "N",
489                    EigenClassification::DroppedSmallPositive => "P",
490                    EigenClassification::DroppedNumericalZero => "Z",
491                })
492                .collect();
493            log::warn!(
494                "[INDEF-HESS] classifications={:?} (A=active N=neg P=small_pos Z=numerical_zero)",
495                labels,
496            );
497        }
498    }
499
500    let mut neg_idx = 0usize;
501    let mut min_eig = f64::INFINITY;
502    for (i, &v) in eigenvalues_ref.iter().enumerate() {
503        if v < min_eig {
504            min_eig = v;
505            neg_idx = i;
506        }
507    }
508    let v_neg = eigenvectors_ref.column(neg_idx);
509    log::warn!(
510        "[INDEF-HESS] negative_eigenvalue={:.4e} eigenvector={:?}",
511        min_eig,
512        v_neg.as_slice().unwrap_or(&[]),
513    );
514
515    let n_pen = canonical.len();
516    let mut tr_aa = vec![0.0_f64; n_pen];
517    for i in 0..n_pen {
518        let local = &canonical[i].local;
519        let mut s = 0.0;
520        for r in 0..local.nrows() {
521            for c in 0..local.ncols() {
522                s += local[[r, c]] * local[[r, c]];
523            }
524        }
525        tr_aa[i] = s;
526    }
527    log::warn!(
528        "[INDEF-HESS] penalty_count={} ranges={:?} ranks={:?}",
529        n_pen,
530        (0..n_pen)
531            .map(|i| (canonical[i].col_range.start, canonical[i].col_range.end))
532            .collect::<Vec<_>>(),
533        (0..n_pen).map(|i| canonical[i].rank()).collect::<Vec<_>>(),
534    );
535
536    // Collect compatible pairs with their cosines.
537    struct PairCos {
538        i: usize,
539        j: usize,
540        cos: f64,
541        antisym_proj: f64,
542    }
543    let mut pairs: Vec<PairCos> = Vec::new();
544    for i in 0..n_pen {
545        for j in (i + 1)..n_pen {
546            let ci = &canonical[i];
547            let cj = &canonical[j];
548            if ci.col_range != cj.col_range {
549                continue;
550            }
551            let local_i = &ci.local;
552            let local_j = &cj.local;
553            let mut dot = 0.0;
554            for r in 0..local_i.nrows() {
555                for c in 0..local_i.ncols() {
556                    dot += local_i[[r, c]] * local_j[[r, c]];
557                }
558            }
559            let cos = if tr_aa[i] > 0.0 && tr_aa[j] > 0.0 {
560                dot / (tr_aa[i].sqrt() * tr_aa[j].sqrt())
561            } else {
562                f64::NAN
563            };
564            let antisym_proj = if v_neg.len() == n_pen {
565                (v_neg[i] - v_neg[j]) / std::f64::consts::SQRT_2
566            } else {
567                f64::NAN
568            };
569            pairs.push(PairCos {
570                i,
571                j,
572                cos,
573                antisym_proj,
574            });
575        }
576    }
577
578    // Headline: structural redundancy detection. Pair cosine above the
579    // structural-redundancy threshold AND the dominant-negative eigenvector's
580    // top-2 absolute components on indices (i, j) with opposite signs.
581    if min_eig < 0.0 && v_neg.len() == n_pen {
582        for p in &pairs {
583            if !(p.cos > INDEF_HESS_STRUCTURAL_REDUNDANCY_COS) {
584                continue;
585            }
586            let mut indexed: Vec<(usize, f64)> = v_neg
587                .iter()
588                .enumerate()
589                .map(|(idx, &val)| (idx, val))
590                .collect();
591            indexed.sort_by(|a, b| {
592                b.1.abs()
593                    .partial_cmp(&a.1.abs())
594                    .unwrap_or(std::cmp::Ordering::Equal)
595            });
596            if indexed.len() < 2 {
597                continue;
598            }
599            let top0 = indexed[0].0;
600            let top1 = indexed[1].0;
601            let (a, b) = if top0 == p.i && top1 == p.j {
602                (indexed[0].1, indexed[1].1)
603            } else if top0 == p.j && top1 == p.i {
604                (indexed[1].1, indexed[0].1)
605            } else {
606                continue;
607            };
608            if a * b >= 0.0 {
609                continue;
610            }
611            log::warn!(
612                "[INDEF-HESS] structural_redundancy_detected pair=({},{}) cos={:.6} antisym_proj={:.4e}",
613                p.i,
614                p.j,
615                p.cos,
616                p.antisym_proj,
617            );
618            break;
619        }
620    }
621
622    // Cap output: dump the full grid when small, otherwise only the top-N
623    // highest-cosine pairs.
624    if n_pen <= INDEF_HESS_PAIR_DUMP_GRID_MAX_K {
625        for p in &pairs {
626            log::warn!(
627                "[INDEF-HESS] pair=({},{}) cos={:.6} tr_ii={:.4e} tr_jj={:.4e} v_neg[i]-v_neg[j]/sqrt2={:.4e}",
628                p.i,
629                p.j,
630                p.cos,
631                tr_aa[p.i],
632                tr_aa[p.j],
633                p.antisym_proj,
634            );
635        }
636        // Note: we no longer log a "ranges_differ" line per skipped pair to
637        // keep the diagnostic O(k). The headline pair already captures intent.
638    } else {
639        let mut top: Vec<&PairCos> = pairs.iter().filter(|p| p.cos.is_finite()).collect();
640        top.sort_by(|a, b| {
641            b.cos
642                .abs()
643                .partial_cmp(&a.cos.abs())
644                .unwrap_or(std::cmp::Ordering::Equal)
645        });
646        for p in top.iter().take(INDEF_HESS_PAIR_DUMP_TOP_N) {
647            log::warn!(
648                "[INDEF-HESS] top_pair=({},{}) cos={:.6} tr_ii={:.4e} tr_jj={:.4e} v_neg[i]-v_neg[j]/sqrt2={:.4e}",
649                p.i,
650                p.j,
651                p.cos,
652                tr_aa[p.i],
653                tr_aa[p.j],
654                p.antisym_proj,
655            );
656        }
657    }
658}
659
660pub(crate) fn compute_smoothing_correction(
661    reml_state: &RemlState<'_>,
662    final_rho: &Array1<f64>,
663    final_fit: &pirls::PirlsResult,
664) -> SmoothingCorrectionComputation {
665    use gam_linalg::faer_ndarray::{FaerCholesky, FaerEigh};
666
667    let n_rho = final_rho.len();
668    if n_rho == 0 {
669        return SmoothingCorrectionComputation {
670            correction: None,
671            hessian_rho: None,
672            rho_covariance: None,
673            active_rank: None,
674        };
675    }
676
677    let n_coeffs_trans = final_fit.beta_transformed.len();
678    let n_coeffs_orig = final_fit.reparam_result.qs.nrows();
679    let lambdas: Array1<f64> = final_rho.mapv(f64::exp);
680
681    // Step 1: Compute the Jacobian J = d(beta)/d(rho) in transformed space.
682    //
683    // Exact implicit-function identity at the inner optimum:
684    //   dβ̂/dρ_k = -H^{-1}(S_k^ρ (β̂ - μ_k)),   S_k^ρ = λ_k S_k,
685    //   λ_k = exp(ρ_k).
686    //
687    // In transformed coordinates with root penalties S_k = R_kᵀR_k:
688    //   S_k (β̂ - μ_k) = R_kᵀ(R_k (β̂ - μ_k)),
689    // so each Jacobian column is one linear solve with H.
690
691    // Use the same objective-consistent inner Hessian surface used by REML:
692    // - non-Firth: H = X'W_HX + S (+ stabilization if present)
693    // - Firth logit: H_total = H - d²Phi/dβ²
694    // Fallback to PIRLS stabilized Hessian only if bundle recovery fails.
695    //
696    // Conclusion:
697    //   J[:,k] = dβ̂/dρ_k must use the Jacobian of the actual stationarity
698    //   system G*(β,ρ)=0, i.e. H_total for Firth-adjusted fits. Using only
699    //   X'W_HX+S here would be inconsistent with the fitted objective and would
700    //   misstate smoothing-parameter uncertainty propagation.
701    let h_trans = reml_state
702        .objective_innerhessian(final_rho)
703        .unwrap_or_else(|_| final_fit.stabilizedhessian_transformed.to_dense());
704
705    // The IFT solve below feeds length-`n_coeffs_trans` right-hand sides into
706    // the Cholesky factor of `h_trans`, and faer asserts `rhs.len() == factor.n()`.
707    // A Hessian that does not match the coefficient dimension (e.g. a degenerate
708    // 0×0 placeholder from a geometry backend that failed to materialize a real
709    // dense inner Hessian) would otherwise abort the whole fit inside the solve.
710    // Bail to the no-correction branch exactly like the Cholesky-`Err` guard
711    // below, so the post-fit smoothing correction is simply skipped.
712    if h_trans.nrows() != n_coeffs_trans || h_trans.ncols() != n_coeffs_trans {
713        log::warn!(
714            "smoothing-correction inner Hessian shape {}x{} does not match coefficient dimension {}; skipping.",
715            h_trans.nrows(),
716            h_trans.ncols(),
717            n_coeffs_trans
718        );
719        return SmoothingCorrectionComputation {
720            correction: None,
721            hessian_rho: None,
722            rho_covariance: None,
723            active_rank: None,
724        };
725    }
726
727    // Factor the Hessian for solving
728    let h_chol = match h_trans.cholesky(faer::Side::Lower) {
729        Ok(c) => c,
730        Err(_) => {
731            log::warn!("Cholesky decomposition failed for smoothing correction; skipping.");
732            return SmoothingCorrectionComputation {
733                correction: None,
734                hessian_rho: None,
735                rho_covariance: None,
736                active_rank: None,
737            };
738        }
739    };
740
741    let beta_trans = final_fit.beta_transformed.as_ref();
742    let ct = &final_fit.reparam_result.canonical_transformed;
743
744    // Build the stationarity-gradient derivative matrix G_ρ where column k is
745    // ∂g(β,ρ)/∂ρ_k = λ_k S_k(β - μ_k), then delegate the IFT solve
746    // dβ/dρ = -H⁻¹G_ρ to the canonical evidence helper. This keeps the
747    // coefficient-space prediction correction and the joint-evidence
748    // Arrow-Schur path on the same hand-derived IFT identity.
749    let mut dg_drho_trans = Array2::<f64>::zeros((n_coeffs_trans, n_rho));
750    // Per-ρ_k support: the coefficient range its stationarity-gradient
751    // derivative ∂g/∂ρ_k is nonzero on. Each column is block-local (only the
752    // k-th penalty block), so this is exactly cp.col_range; structurally
753    // inactive columns keep an empty support and the cone-of-influence solve
754    // skips them entirely (their sensitivity is identically zero). See #779.
755    let mut col_supports: Vec<std::ops::Range<usize>> = vec![0..0; n_rho];
756    for k in 0..n_rho {
757        if k >= ct.len() {
758            continue;
759        }
760        let cp = &ct[k];
761        if cp.rank() == 0 {
762            continue;
763        }
764        // S_k(β - μ) — block-local: R^T (R (β[block] - μ)), embedded into p-vector.
765        let r = &cp.col_range;
766        col_supports[k] = r.start..r.end;
767        let beta_block = beta_trans.slice(s![r.start..r.end]);
768        let centered = &beta_block - &cp.prior_mean;
769        let r_beta = cp.root.dot(&centered);
770        for a in 0..cp.block_dim() {
771            dg_drho_trans[[r.start + a, k]] = lambdas[k]
772                * (0..cp.rank())
773                    .map(|row| cp.root[[row, a]] * r_beta[row])
774                    .sum::<f64>();
775        }
776    }
777    // Lazy/local cone-of-influence propagation (#779): confine each column's
778    // sensitivity to the coupling component of `h_trans` containing the moved
779    // penalty block, and skip structurally inactive columns. Exact on a
780    // block-decoupled Hessian (entries outside the cone are identically zero)
781    // and identical to the full joint solve on a fully coupled Hessian.
782    let jacobian_trans = match crate::sensitivity::FitSensitivity::from_faer_cholesky(
783        &h_chol,
784        n_coeffs_trans,
785    )
786    .mode_response_coned(h_trans.view(), dg_drho_trans.view(), &col_supports)
787    {
788        Some(jacobian) => jacobian,
789        None => {
790            log::warn!("IFT beta-rho sensitivity solve failed for smoothing correction; skipping.");
791            return SmoothingCorrectionComputation {
792                correction: None,
793                hessian_rho: None,
794                rho_covariance: None,
795                active_rank: None,
796            };
797        }
798    };
799
800    // Step 2: Build V_rho by inverting the LAML Hessian in rho-space.
801    // The authoritative inner-strategy path chooses the rho-space Hessian
802    // evaluation policy here. Unified may still perform local numerical
803    // salvage inside the exact branch, but the branch choice itself no longer
804    // lives inline at the call site.
805    let mut hessian_rho = match reml_state.compute_lamlhessian_consistent(final_rho) {
806        Ok(h) => h,
807        Err(err) => {
808            log::warn!(
809                "LAML Hessian unavailable ({}); skipping smoothing correction.",
810                err
811            );
812            return SmoothingCorrectionComputation {
813                correction: None,
814                hessian_rho: None,
815                rho_covariance: None,
816                active_rank: None,
817            };
818        }
819    };
820
821    // Symmetrize the Hessian
822    gam_linalg::matrix::symmetrize_in_place(&mut hessian_rho);
823
824    // Step 3: Invert Hessian to get V_rho.
825    // Add a small ridge before factorization to regularize weakly identified ρ directions.
826    add_relative_diag_ridge(&mut hessian_rho, LAML_RIDGE, LAML_RIDGE);
827
828    let inverted = match invert_regularized_rho_hessian(&hessian_rho) {
829        Some(inv) => inv,
830        None => {
831            log::warn!(
832                "Eigendecomposition of LAML rho Hessian failed for smoothing correction; skipping."
833            );
834            dump_indefinite_rho_hessian_diagnostic(
835                &hessian_rho,
836                final_rho,
837                &final_fit.reparam_result.canonical_transformed,
838                None,
839            );
840            return SmoothingCorrectionComputation {
841                correction: None,
842                hessian_rho: Some(hessian_rho),
843                rho_covariance: None,
844                active_rank: None,
845            };
846        }
847    };
848
849    let n_rho_total = hessian_rho.nrows();
850    if inverted.active_rank == 0 {
851        // All directions non-identified. Pseudo-inverse is zero, so J·V_ρ·J^T
852        // adds nothing; report no correction (consistent with the prior behavior
853        // for fully indefinite Hessians, but now logged with full context).
854        log::info!(
855            "LAML rho Hessian has no identified directions (active_rank=0/{}, dropped_negative={}, dropped_small_positive={}, dropped_numerical_zero={}, min_eig={:.3e}); smoothing correction is zero, skipping.",
856            n_rho_total,
857            inverted.dropped_negative,
858            inverted.dropped_small_positive,
859            inverted.dropped_numerical_zero,
860            inverted.min_eigenvalue,
861        );
862        dump_indefinite_rho_hessian_diagnostic(
863            &hessian_rho,
864            final_rho,
865            &final_fit.reparam_result.canonical_transformed,
866            Some(&inverted),
867        );
868        return SmoothingCorrectionComputation {
869            correction: None,
870            hessian_rho: Some(hessian_rho),
871            rho_covariance: Some(inverted.inverse),
872            active_rank: Some(0),
873        };
874    }
875
876    if inverted.active_rank < n_rho_total {
877        log::info!(
878            "LAML rho Hessian is rank-deficient on the identified subspace (active_rank={}/{}, dropped_negative={}, dropped_small_positive={}, dropped_numerical_zero={}, min_eig={:.3e}); proceeding with pseudo-inverse V_ρ.",
879            inverted.active_rank,
880            n_rho_total,
881            inverted.dropped_negative,
882            inverted.dropped_small_positive,
883            inverted.dropped_numerical_zero,
884            inverted.min_eigenvalue,
885        );
886        dump_indefinite_rho_hessian_diagnostic(
887            &hessian_rho,
888            final_rho,
889            &final_fit.reparam_result.canonical_transformed,
890            Some(&inverted),
891        );
892    }
893
894    let repaired_hessian = inverted.repaired_hessian;
895    let active_rank_used = inverted.active_rank;
896    let v_rho = inverted.inverse;
897    let rho_covariance = v_rho.clone();
898    if repaired_hessian {
899        log::debug!(
900            "Applied rank-deficient pseudo-inverse on identified rho-Hessian subspace before smoothing correction."
901        );
902    }
903
904    // Step 4: Compute V_corr = J * V_rho * J^T in transformed space.
905    //
906    // This is the first-order smoothing-parameter uncertainty inflation:
907    //   Var(β̂) ≈ Var(β̂|ρ̂) + (dβ̂/dρ) Var(ρ̂) (dβ̂/dρ)ᵀ.
908    //
909    // Here:
910    //   J = dβ̂/dρ,  J[:,k] = -H^{-1}(A_k β̂),
911    //   V_ρ = (∇²_{ρρ}V)^{-1} evaluated at the final ρ.
912    let jv_rho = jacobian_trans.dot(&v_rho); // (n_coeffs_trans x n_rho)
913    let v_corr_trans = jv_rho.dot(&jacobian_trans.t()); // (n_coeffs_trans x n_coeffs_trans)
914
915    // Step 5: Transform back to original coefficient basis:
916    // V_corr_orig = Qs * V_corr_trans * Qs^T
917    let qs = &final_fit.reparam_result.qs;
918    let qsv = qs.dot(&v_corr_trans);
919    let mut v_corr_orig = qsv.dot(&qs.t());
920    // The congruence Qs·M·Qsᵀ is symmetric in exact arithmetic, but the two
921    // matrix products fill v[i,j] and v[j,i] via independent dot-products,
922    // leaving O(ε) asymmetry. This is a genuine covariance (added to the base
923    // Vb to form Vp, and consumed by model_comparison's corrected-EDF trace,
924    // which documents a "both symmetric" invariant it then relies on), so we
925    // symmetrize it like every other covariance-assembly site — unlike the
926    // influence matrix F = H⁻¹X'WX, which is deliberately left asymmetric.
927    gam_linalg::matrix::symmetrize_in_place(&mut v_corr_orig);
928
929    // Validate the result
930    if !v_corr_orig.iter().all(|v| v.is_finite()) {
931        log::warn!("Non-finite values in smoothing correction matrix; skipping.");
932        return SmoothingCorrectionComputation {
933            correction: None,
934            hessian_rho: Some(hessian_rho),
935            rho_covariance: Some(rho_covariance),
936            active_rank: Some(active_rank_used),
937        };
938    }
939
940    // Ensure positive semi-definiteness without hiding a failed rho-space
941    // optimum. The smoothing correction V_corr = J·V_ρ·Jᵀ is an SE *inflation*
942    // (Var(β̂|ρ̂) + uncertainty-in-ρ̂). It is PSD *by construction*: V_ρ is the
943    // (active-subspace) inverse of the SPD ρ-Hessian, so the congruence
944    // J·V_ρ·Jᵀ cannot have genuine negative curvature. Crucially it is also
945    // rank-deficient — its rank is at most `n_rho`, while it lives in the
946    // `n_coeffs_orig`-dimensional coefficient space — so for every model with
947    // fewer smoothing parameters than coefficients (i.e. essentially all of
948    // them) it has exact-zero eigenvalues. A symmetric eigensolver renders
949    // those exact zeros as ±O(ε·‖V_corr‖), so the smallest eigenvalue is
950    // routinely a *sub-tolerance negative* that is pure floating-point
951    // roundoff, not curvature. Rejecting the whole correction on such a
952    // roundoff zero silently dropped Vp for the entire #smooth < #coef regime
953    // (every predict() interval lost its smoothing-parameter inflation).
954    //
955    // So we only treat a *material* negative (below the eigensolver roundoff
956    // floor `neg_tol`) as a real failure — that signals a corrupted V_ρ
957    // (near-saddle / pinv-imputed direction) and we skip loudly, letting the
958    // caller fall back to the honest base covariance. Sub-tolerance negatives
959    // are accepted as-is: the matrix is PSD to roundoff, and it is added to the
960    // (dominant) base covariance Vb, which keeps Vp PSD without any relabelling
961    // or clamping.
962    match v_corr_orig.eigh(faer::Side::Lower) {
963        // Eigenvectors are unused: only the smallest eigenvalue's magnitude
964        // distinguishes roundoff from genuine indefiniteness.
965        Ok((eigenvalues, _)) => {
966            let min_eig = eigenvalues.iter().fold(f64::INFINITY, |a, &b| a.min(b));
967            let spectral_scale = eigenvalues
968                .iter()
969                .copied()
970                .map(f64::abs)
971                .fold(0.0_f64, f64::max)
972                .max(1.0);
973            let neg_tol = 64.0 * f64::EPSILON * (n_coeffs_orig.max(1) as f64) * spectral_scale;
974            if min_eig < -neg_tol {
975                log::warn!(
976                    "Smoothing correction has material negative eigenvalue {:.3e} \
977                     below tolerance {:.3e}; skipping correction.",
978                    min_eig,
979                    neg_tol
980                );
981                return SmoothingCorrectionComputation {
982                    correction: None,
983                    hessian_rho: Some(hessian_rho),
984                    rho_covariance: Some(rho_covariance),
985                    active_rank: Some(active_rank_used),
986                };
987            }
988        }
989        Err(_) => {
990            log::warn!("Eigendecomposition failed for smoothing correction validation.");
991        }
992    }
993    SmoothingCorrectionComputation {
994        correction: Some(v_corr_orig),
995        hessian_rho: Some(hessian_rho),
996        rho_covariance: Some(rho_covariance),
997        active_rank: Some(active_rank_used),
998    }
999}
gam_solve/estimate/smoothing_correction.rs

gam_solve/estimate/
smoothing_correction.rs