gam_solve/estimate/optimizer.rs
1use super::*;
2use gam_problem::dispersion_cov::se_from_covariance;
3use std::sync::atomic::{AtomicUsize, Ordering};
4
5/// Optimize smoothing parameters for an external design using the same REML/LAML machinery.
6pub fn optimize_external_design<X>(
7 y: ArrayView1<'_, f64>,
8 w: ArrayView1<'_, f64>,
9 x: X,
10 offset: ArrayView1<'_, f64>,
11 s_list: Vec<BlockwisePenalty>,
12 opts: &ExternalOptimOptions,
13) -> Result<ExternalOptimResult, EstimationError>
14where
15 X: Into<DesignMatrix>,
16{
17 optimize_external_designwith_heuristic_lambdas(y, w, x, offset, s_list, None, opts)
18}
19
20/// Same as `optimize_external_design`, but allows heuristic λ warm-start seeds
21/// for the outer smoothing search.
22pub fn optimize_external_designwith_heuristic_lambdas<X>(
23 y: ArrayView1<'_, f64>,
24 w: ArrayView1<'_, f64>,
25 x: X,
26 offset: ArrayView1<'_, f64>,
27 s_list: Vec<BlockwisePenalty>,
28 heuristic_lambdas: Option<&[f64]>,
29 opts: &ExternalOptimOptions,
30) -> Result<ExternalOptimResult, EstimationError>
31where
32 X: Into<DesignMatrix>,
33{
34 let specs: Vec<PenaltySpec> = s_list
35 .into_iter()
36 .map(PenaltySpec::from_blockwise)
37 .collect();
38 optimize_external_designwith_heuristic_lambdas_andwarm_start(
39 y,
40 w,
41 x,
42 offset,
43 specs,
44 heuristic_lambdas,
45 None,
46 opts,
47 )
48}
49
50pub(crate) fn external_reml_seed_config(k: usize, link: LinkFunction) -> SeedConfig {
51 let gaussian = matches!(link, LinkFunction::Identity);
52 if k >= REML_SEED_SCREENING_RHO_CAP {
53 let seed_budget = if gaussian { 1 } else { 2 };
54 return SeedConfig {
55 bounds: (-12.0, 12.0),
56 max_seeds: seed_budget,
57 seed_budget,
58 risk_profile: if gaussian {
59 SeedRiskProfile::Gaussian
60 } else {
61 SeedRiskProfile::GeneralizedLinear
62 },
63 screen_max_inner_iterations: SeedConfig::default().screen_max_inner_iterations,
64 num_auxiliary_trailing: 0,
65 over_smoothing_probe_rho: None,
66 };
67 }
68 SeedConfig {
69 bounds: (-12.0, 12.0),
70 max_seeds: if gaussian && k <= 4 {
71 2
72 } else if gaussian && k <= 12 {
73 4
74 } else if gaussian {
75 6
76 } else if k <= 4 {
77 6
78 } else if k <= 12 {
79 8
80 } else {
81 10
82 },
83 seed_budget: if gaussian && k <= 6 { 1 } else { 2 },
84 risk_profile: if gaussian {
85 SeedRiskProfile::Gaussian
86 } else {
87 SeedRiskProfile::GeneralizedLinear
88 },
89 screen_max_inner_iterations: SeedConfig::default().screen_max_inner_iterations,
90 num_auxiliary_trailing: 0,
91 over_smoothing_probe_rho: None,
92 }
93}
94
95fn reml_inner_progress_feedback(
96 state: &crate::estimate::reml::RemlState<'_>,
97) -> crate::rho_optimizer::InnerProgressFeedback {
98 crate::rho_optimizer::InnerProgressFeedback {
99 cap: Arc::clone(&state.outer_inner_cap),
100 accepted_iter: Arc::new(AtomicUsize::new(0)),
101 last_iters: Arc::clone(&state.last_inner_iters),
102 last_converged: Arc::clone(&state.last_inner_converged),
103 ift_residual: Arc::clone(&state.last_ift_prediction_residual),
104 accept_rho: Arc::clone(&state.last_pirls_accept_rho),
105 }
106}
107
108fn with_reml_beta_seed_hook<'state, 'data>() -> impl FnMut(
109 &mut &'state mut crate::estimate::reml::RemlState<'data>,
110 &Array1<f64>,
111) -> Result<
112 crate::rho_optimizer::SeedOutcome,
113 EstimationError,
114> {
115 |state, beta| {
116 // The REML state stores β as a starting-iterate HINT and validates
117 // its width against the design (`self.p`) at store time, silently
118 // dropping a mismatched or non-finite hint rather than faulting
119 // (see `setwarm_start_original_beta`). A wrong-length seed is
120 // therefore never an error: a row-relaxed cross-fold prefix seed
121 // degrades to a ρ-only resume, exactly the desired warm-start
122 // behaviour. The slot's post-call state (the supplied β if it fit,
123 // else the prior state) is what the next eval warm-starts from, so
124 // `Installed` is the correct contract reply.
125 state.setwarm_start_original_beta(Some(beta.view()));
126 Ok(crate::rho_optimizer::SeedOutcome::Installed)
127 }
128}
129
130enum RemlInnerCapGuardArm {
131 Standard,
132 MixtureSas,
133}
134
135/// Re-run one full-inner-tolerance `compute_cost` at the converged operating
136/// point so the cached warm-start β is no longer pinned to whatever coarse cap
137/// the outer-aware inner-PIRLS schedule last set (path #3; see the call-site
138/// comments).
139///
140/// `rho` MUST be the smoothing-only penalty-block log-λ — its length equals the
141/// number of penalty blocks, because `compute_cost` exponentiates the whole
142/// vector into the penalty λ vector. For the parameterized-link arms
143/// (`MixtureSas`: SAS / Beta-Logistic / mixture / blended) the outer optimizer
144/// works in an augmented θ that trails the link parameters after the smoothing
145/// block; the caller must slice that block out (and install the link state on
146/// `state`) via `apply_link_theta` BEFORE calling this guard. Passing the raw
147/// augmented θ here over-counts the lambdas and faults the reparameterizer
148/// ("Lambda count mismatch", #1571). The `arm` only selects the log label.
149fn run_outer_inner_cap_guard(
150 state: &mut crate::estimate::reml::RemlState<'_>,
151 rho: &Array1<f64>,
152 arm: RemlInnerCapGuardArm,
153) -> Result<(), EstimationError> {
154 let prev_cap = state.outer_inner_cap.swap(0, Ordering::Relaxed);
155 if prev_cap != 0 {
156 let guard_start = std::time::Instant::now();
157 state.compute_cost(rho)?;
158 match arm {
159 RemlInnerCapGuardArm::Standard => log::info!(
160 "[OUTER guard] convergence-guard re-eval at converged ρ done (prev_cap={prev_cap}, elapsed={:.3}s)",
161 guard_start.elapsed().as_secs_f64()
162 ),
163 RemlInnerCapGuardArm::MixtureSas => log::info!(
164 "[OUTER guard] convergence-guard re-eval at converged ρ done (mixture/SAS arm; prev_cap={prev_cap}, elapsed={:.3}s)",
165 guard_start.elapsed().as_secs_f64()
166 ),
167 }
168 } else if matches!(arm, RemlInnerCapGuardArm::Standard) {
169 log::debug!("[OUTER guard] schedule never lifted (prev_cap=0); skipping refit");
170 }
171 Ok(())
172}
173
174/// The weighted-mean response level an unpenalized intercept would absorb, used
175/// to center the response during outer REML λ-selection (issue #1000).
176///
177/// For an identity-link Gaussian fit, adding a constant to the response only
178/// shifts the intercept, so λ̂ and the smooth shape must be invariant to the
179/// response mean. The outer score/gradient nonetheless accumulate
180/// `yᵀy`-magnitude sufficient statistics, so a large response mean costs
181/// precision and drifts λ̂. Returns `Some(m)` with
182/// `m = Σ wᵢ (yᵢ − offsetᵢ) / Σ wᵢ` — the constant a pure offset relabeling
183/// moves into the intercept — so the caller can subtract it and keep the working
184/// response `O(σ)` regardless of the mean.
185///
186/// Returns `None` (do not center, exact previous behaviour) unless the fit is
187/// identity-link Gaussian and carries an unpenalized intercept column to absorb
188/// the shift, and has no linear constraints that could pin the intercept. A zero
189/// or non-finite mean also returns `None` — there is nothing to gain.
190fn gaussian_identity_response_center(
191 cfg: &RemlConfig,
192 conditioning: &ParametricColumnConditioning,
193 has_linear_constraints: bool,
194 y: ArrayView1<'_, f64>,
195 w: ArrayView1<'_, f64>,
196 offset: ArrayView1<'_, f64>,
197) -> Option<f64> {
198 if has_linear_constraints
199 || conditioning.intercept_idx.is_none()
200 || !matches!(cfg.likelihood.spec.response, ResponseFamily::Gaussian)
201 || !matches!(cfg.link_function(), LinkFunction::Identity)
202 {
203 return None;
204 }
205 let mut weight_sum = 0.0_f64;
206 let mut weighted = KahanSum::default();
207 for ((&yi, &wi), &oi) in y.iter().zip(w.iter()).zip(offset.iter()) {
208 if wi > 0.0 {
209 weight_sum += wi;
210 weighted.add(wi * (yi - oi));
211 }
212 }
213 if weight_sum <= 0.0 {
214 return None;
215 }
216 let m = weighted.sum() / weight_sum;
217 (m.is_finite() && m != 0.0).then_some(m)
218}
219
220/// The multiplicative scale an identity-link Gaussian outer REML λ-search should
221/// divide the (already centered) response by so its magnitude is `O(1)` for the
222/// duration of the search (issue #1127).
223///
224/// Replacing the response `y` by `a·y` (`a > 0`) for an identity-link Gaussian
225/// fit must rescale the entire fit by `a` and leave `λ̂` / EDF unchanged: the
226/// penalized normal equations are exactly linear in `y`, so `β̂(a·y)=a·β̂(y)`
227/// at any fixed `λ`, and the profiled REML criterion is `a`-invariant up to the
228/// additive constant `−(n−p)·ln a` (the dispersion `σ̂²` absorbs the `a²`).
229/// Numerically, though, the outer λ-selection's convergence band is keyed to an
230/// *absolute* objective scale (the inner-solve `objective_scale.max(1.0)` floor
231/// and the outer `1e-6` gradient floor): when the whole Gaussian objective is
232/// `O(a²) ≪ 1` those floors swamp the real signal and the optimizer declares
233/// premature convergence at an over-smoothed `λ` — silently over-smoothing
234/// small-magnitude responses (strains, volts, mole fractions, returns;
235/// `a ≈ 1e-6`). Normalizing the working response to `O(1)` makes the absolute
236/// floors track the true signal, restoring scale equivariance.
237///
238/// Returns `Some(s)` with `s = √(Σ wᵢ (yᵢ − mean)² / Σ wᵢ)` — the weighted RMS
239/// of the centered response — so the caller can divide by it and keep the outer
240/// working response `O(1)` regardless of magnitude. The same gate as
241/// [`gaussian_identity_response_center`] applies (identity-link Gaussian with an
242/// unpenalized intercept and no linear constraints); a non-finite, zero, or
243/// already-`O(1)` RMS returns `None` (do not scale, exact previous behaviour) —
244/// scaling near unity buys nothing and only risks a needless allocation.
245fn gaussian_identity_response_scale(
246 cfg: &RemlConfig,
247 conditioning: &ParametricColumnConditioning,
248 has_linear_constraints: bool,
249 center: f64,
250 y: ArrayView1<'_, f64>,
251 w: ArrayView1<'_, f64>,
252 offset: ArrayView1<'_, f64>,
253) -> Option<f64> {
254 if has_linear_constraints
255 || conditioning.intercept_idx.is_none()
256 || !matches!(cfg.likelihood.spec.response, ResponseFamily::Gaussian)
257 || !matches!(cfg.link_function(), LinkFunction::Identity)
258 {
259 return None;
260 }
261 // A multiplicative response rescale `y → y/s` must be matched by `η → η/s`
262 // for the residual to scale cleanly. The intercept and smooth coefficients
263 // scale freely, but a *fixed* offset column does not — scaling the working
264 // response while leaving the offset on its original scale would change the
265 // residual geometry, not just its magnitude. The offset is shared verbatim
266 // into the outer state and reused by the accept-fit, so rather than thread a
267 // separately scaled copy everywhere, restrict the (rare) offset case to the
268 // exact previous path: only normalize when there is no nonzero offset.
269 if offset.iter().any(|&o| o != 0.0) {
270 return None;
271 }
272 let mut weight_sum = 0.0_f64;
273 let mut weighted_sq = KahanSum::default();
274 for ((&yi, &wi), &oi) in y.iter().zip(w.iter()).zip(offset.iter()) {
275 if wi > 0.0 {
276 weight_sum += wi;
277 let centered = (yi - oi) - center;
278 weighted_sq.add(wi * centered * centered);
279 }
280 }
281 if weight_sum <= 0.0 {
282 return None;
283 }
284 let rms = (weighted_sq.sum() / weight_sum).sqrt();
285 // Only normalize when the magnitude is far enough from `O(1)` to matter; a
286 // factor within ~one order of magnitude of unity cannot push the objective
287 // through the absolute floors, so leave the exact previous path untouched.
288 (rms.is_finite() && rms > 0.0 && !(0.1..=10.0).contains(&rms)).then_some(rms)
289}
290
291pub(crate) fn optimize_external_designwith_heuristic_lambdas_andwarm_start<X>(
292 y: ArrayView1<'_, f64>,
293 w: ArrayView1<'_, f64>,
294 x: X,
295 offset: ArrayView1<'_, f64>,
296 s_list: Vec<PenaltySpec>,
297 heuristic_lambdas: Option<&[f64]>,
298 warm_start_beta: Option<ArrayView1<'_, f64>>,
299 opts: &ExternalOptimOptions,
300) -> Result<ExternalOptimResult, EstimationError>
301where
302 X: Into<DesignMatrix>,
303{
304 if opts.family.is_binomial_mixture() && opts.mixture_link.is_none() {
305 crate::bail_invalid_estim!("BinomialMixture requires mixture_link specification");
306 }
307 let x = x.into();
308 if let Some(message) = row_mismatch_message(y.len(), w.len(), x.nrows(), offset.len()) {
309 crate::bail_invalid_estim!("{}", message);
310 }
311
312 let p = x.ncols();
313 // Raw design row count, captured before `x` is moved (line ~339); used by the
314 // #1266 null-space shrink-out escape's `n ≥ 2·p` determinacy gate, which must
315 // match `relax_smoothing_rho_prior`'s well-determined gate exactly.
316 let n_design_rows = x.nrows();
317 validate_penalty_specs(&s_list, p, "optimize_external_design")?;
318 let (canonical, active_nullspace_dims) = gam_terms::construction::canonicalize_penalty_specs(
319 &s_list,
320 &opts.nullspace_dims,
321 p,
322 "optimize_external_design",
323 )?;
324 let conditioning = ParametricColumnConditioning::infer_from_penalty_specs(&x, &s_list);
325 let x_fit = conditioning.apply_to_design(&x);
326 let fit_linear_constraints =
327 conditioning.transform_linear_constraints_to_internal(opts.linear_constraints.clone());
328 let k = canonical.len();
329 if active_nullspace_dims.len() != k {
330 crate::bail_invalid_estim!(
331 "nullspace_dims length mismatch: expected {k} entries for active penalties, got {}",
332 active_nullspace_dims.len()
333 );
334 }
335 let (cfg, effective_sas_link) = resolved_external_config(opts)?;
336 reject_prefit_unpenalized_rank_deficiency(w, &x_fit, &canonical)?;
337 reject_prefit_binomial_separation(&cfg, y, w, &x_fit, &canonical)?;
338
339 let design_kind = match &x {
340 DesignMatrix::Dense(_) => "dense",
341 DesignMatrix::Sparse(_) => "sparse",
342 };
343 log::info!(
344 "[GAM fit] n={} p={} k={} fam={:?} link={:?} X={} reml_iter={} firth={}",
345 y.len(),
346 p,
347 k,
348 opts.family,
349 cfg.link_function(),
350 design_kind,
351 opts.max_iter,
352 cfg.firth_bias_reduction
353 );
354
355 // Own the external arrays once; the conditioned design is shared through `reml_state`.
356 let y_o = y.to_owned();
357 let w_o = w.to_owned();
358 let x_o = x;
359 let offset_o = offset.to_owned();
360 let canonical_shared = Arc::new(canonical);
361 let cfg_shared = Arc::new(cfg.clone());
362
363 // Issue #1000: for an identity-link Gaussian fit with an unpenalized
364 // intercept, adding a constant `c` to the response is a *pure relabeling of
365 // the intercept* — the hat matrix annihilates the constant column, so the
366 // residuals, the profiled REML criterion, λ̂, and the smooth shape are all
367 // invariant to `c`. Numerically, though, the outer REML score/gradient
368 // accumulate `yᵀy`-magnitude sufficient statistics (e.g. the cached
369 // `XᵀW(y−offset)`), so an uncentered large-mean response injects a `c²`
370 // term that loses precision and drifts λ̂ — silently over-smoothing
371 // large-mean responses (Kelvin temperatures, financial levels, calendar
372 // years). Center the response by the (weighted) mean the intercept would
373 // absorb for the duration of the outer λ-search only: the constant lands in
374 // the intercept, which the final accept-fit below recovers *exactly* by
375 // re-fitting the original (uncentered) response at the REML-selected λ̂.
376 // This mirrors the existing column conditioning, which centers the design
377 // columns into the intercept for the same numerical reason.
378 let response_center = gaussian_identity_response_center(
379 &cfg,
380 &conditioning,
381 opts.linear_constraints.is_some(),
382 y_o.view(),
383 w_o.view(),
384 offset_o.view(),
385 );
386 // Issue #1127 (down-scale sibling of #1000): replacing the response `y` by
387 // `a·y` must rescale the whole fit by `a` and leave `λ̂`/EDF unchanged (the
388 // normal equations are exactly linear in `y`; the profiled REML criterion is
389 // `a`-invariant up to the additive `−(n−p)·ln a` the dispersion absorbs).
390 // But the outer λ-selection's convergence band is keyed to an *absolute*
391 // objective scale (an inner `objective_scale.max(1.0)` floor and a `1e-6`
392 // outer gradient floor); when the Gaussian objective is `O(a²) ≪ 1` those
393 // floors swamp the signal and the optimizer stops early at an over-smoothed
394 // `λ`. Normalize the (centered) working response to `O(1)` for the outer
395 // λ-search only, mirroring the #1000 centering: the final accept-fit below
396 // re-fits the *original* response at the REML-selected λ̂, so β, μ̂, σ̂² and
397 // every reported quantity stay exactly on the user's scale. `center` here is
398 // the constant the intercept already absorbs (so the scale is measured on the
399 // residual signal, not on the offset).
400 let response_scale = gaussian_identity_response_scale(
401 &cfg,
402 &conditioning,
403 opts.linear_constraints.is_some(),
404 response_center.unwrap_or(0.0),
405 y_o.view(),
406 w_o.view(),
407 offset_o.view(),
408 );
409 // The outer loop borrows the response for the lifetime of `reml_state`;
410 // the conditioned copy (when any) is owned at function scope so the borrow
411 // outlives the state. Off the Gaussian-identity path both `response_center`
412 // and `response_scale` are `None` and the outer loop borrows the original
413 // response verbatim — no allocation, no behavioural change. When only one is
414 // active we still apply just that transform. Both are exactly invertible by
415 // the accept-fit, which re-fits the original `y_o` at the selected λ̂.
416 let reml_y_conditioned: Option<Array1<f64>> = match (response_center, response_scale) {
417 (None, None) => None,
418 (center, scale) => {
419 let c = center.unwrap_or(0.0);
420 let s = scale.unwrap_or(1.0);
421 Some((&y_o - c) / s)
422 }
423 };
424 let reml_y_view = reml_y_conditioned
425 .as_ref()
426 .map_or_else(|| y_o.view(), |conditioned| conditioned.view());
427
428 let mut reml_state = RemlState::newwith_offset_shared(
429 reml_y_view,
430 x_fit,
431 w_o.view(),
432 offset_o.view(),
433 Arc::clone(&canonical_shared),
434 p,
435 Arc::clone(&cfg_shared),
436 Some(active_nullspace_dims.clone()),
437 None,
438 fit_linear_constraints.clone(),
439 )?;
440 reml_state.set_penalty_shrinkage_floor(opts.penalty_shrinkage_floor);
441 reml_state.set_rho_prior(opts.rho_prior.clone());
442 if let Some(kron) = opts.kronecker_penalty_system.clone() {
443 reml_state.set_kronecker_penalty_system(kron);
444 }
445 if let Some(kf) = opts.kronecker_factored.clone() {
446 reml_state.set_kronecker_factored(kf);
447 }
448 if opts.persist_warm_start_disk {
449 // Caller opted into cross-process resume (#1082): engage the on-disk
450 // warm-start layer. Default-false keeps replicate/CI loops disk-silent.
451 reml_state.enable_persistent_warm_start_disk();
452 }
453 reml_state.setwarm_start_original_beta(warm_start_beta);
454
455 // Term/margin-order invariance (#1538/#1539). The per-ρ-coordinate canonical
456 // keys label each coordinate by its placement-independent (penalty + data)
457 // content; the induced canonical permutation lets BOTH the objective-grid
458 // seed prepass AND the outer optimizer operate in an identical canonical
459 // coordinate layout for every term order. `None` when the coordinate count
460 // does not match the ρ-dimension (legacy native-order path, unchanged).
461 let canon_keys = reml_state.canonical_rho_keys(k);
462 let canon_perm: Option<Vec<usize>> = canon_keys
463 .as_ref()
464 .and_then(|keys| crate::rho_optimizer::canonical_permutation(keys));
465
466 let reml_seed_config = external_reml_seed_config(k, cfg.link_function());
467 let reml_tol = cfg.reml_convergence_tolerance;
468 let reml_max_iter = opts.max_iter;
469 let outer_eval_idx = AtomicUsize::new(0usize);
470 let mixture_optspec = if opts.optimize_mixture {
471 opts.mixture_link.clone()
472 } else {
473 None
474 };
475 let sas_optspec = if opts.optimize_sas {
476 effective_sas_link
477 } else {
478 None
479 };
480 let mixture_dim = mixture_optspec
481 .as_ref()
482 .map(|s| s.initial_rho.len())
483 .unwrap_or(0);
484 let sas_dim = if sas_optspec.is_some() { 2 } else { 0 };
485 let sasridgeweight = if sas_dim > 0 {
486 sas_log_deltaridgeweight()
487 } else {
488 0.0
489 };
490 // Negative-Binomial outer θ↔λ alternation (#1448). With θ estimated, the
491 // λ-search freezes θ (see `frozen_negbin_theta`, #1082) so the REML criterion
492 // `F(ρ) = REML(ρ, θ_frozen)` is stationary in ρ; the final accept-fit then
493 // ML-refreshes θ at the converged η. A *single* freeze→refresh leaves the
494 // selected ρ optimal only for `θ_frozen`, not for the refreshed `θ_final`.
495 // mgcv `nb()` instead alternates θ-estimation and λ-selection to a joint
496 // fixed point. Wrap the ρ-search + accept-fit in a bounded loop: after each
497 // refit, if the NB θ drifted beyond tolerance, re-freeze the search θ at the
498 // refreshed value, reset the surface caches that depend on it, and re-run the
499 // outer ρ search. The cap bounds the work; for every non-NB / user-fixed-θ
500 // fit the loop runs exactly once (the break condition is met immediately), so
501 // those fits are byte-identical to the pre-#1448 single-pass behaviour.
502 //
503 // 5% relative θ drift is the same band the diagnostic (#1082) flagged as the
504 // point beyond which the ρ-optimum for `θ_frozen` and `θ_final` can differ
505 // enough to matter; below it the one-refresh approximation is already joint-
506 // stationary to the criterion's tolerance.
507 const NEGBIN_THETA_JOINT_DRIFT_TOL: f64 = 5.0e-2;
508 const NEGBIN_OUTER_ALTERNATION_MAX_ROUNDS: usize = 8;
509 let mut final_rho;
510 let mut final_mixture_state;
511 let mut final_sas_state;
512 let mut final_mixture_param_covariance;
513 let mut final_sas_param_covariance;
514 let mut outer_result;
515 let mut pirls_res;
516 let mut negbin_alternation_round: usize = 0;
517 loop {
518 (
519 final_rho,
520 final_mixture_state,
521 final_sas_state,
522 final_mixture_param_covariance,
523 final_sas_param_covariance,
524 outer_result,
525 ) = if mixture_dim > 0 && sas_dim > 0 {
526 crate::bail_invalid_estim!(
527 "simultaneous mixture and SAS optimization is not supported"
528 );
529 } else if mixture_dim == 0 && sas_dim == 0 {
530 use crate::rho_optimizer::{OuterEvalOrder, OuterProblem};
531 use gam_problem::{DeclaredHessianForm, Derivative};
532
533 let analytic_outer_hessian_available = reml_state.analytic_outer_hessian_enabled();
534 // Standard-GAM dense problem dimensions configure both cost models
535 // the planner uses to decide whether ARC+Hessian or BFGS+gradient
536 // is faster end-to-end at large scale:
537 //
538 // - per-inner-solve cost (n · p²) gates the single-Hessian-
539 // assembly downgrade,
540 // - per-outer-eval cost (k² · n · p²) gates the LAML-Hessian
541 // pairwise-assembly downgrade — independent of (1) and
542 // necessary because the LAML outer Hessian's k² pairwise
543 // inner-derived terms can dominate per-outer work even when
544 // each individual inner solve is moderate.
545 //
546 // Sparse designs short-circuit the policy because the n · p²
547 // model does not apply to sparse linear algebra; ARC stays in
548 // place and the sparse path's iteration-count advantage holds.
549 // Gaussian-identity REML has two well-conditioned features that
550 // the outer optimizer can exploit:
551 //
552 // 1. The REML cost is dominated by an O(n) likelihood constant,
553 // so ∂/∂logλ inherits the same scale. A unit-magnitude
554 // `abs` gradient floor (1e-6) becomes binding at large-scale n
555 // even after the relative-from-seed component declared
556 // convergence iters earlier. `with_objective_scale(n)`
557 // lifts the floor to ~n·1e-9 so the loop terminates once
558 // the relative reduction is met.
559 //
560 // 2. The Gaussian profile likelihood is quadratic-like in
561 // log-λ near the optimum, so the analytic Hessian is
562 // trustworthy and the cubic regularization can start
563 // smaller than opt's default sigma=1.0. Setting
564 // sigma=0.25 allows the first ARC step to be ~4× the
565 // default — matching the 2–4 unit log-λ moves typical of
566 // Gaussian-identity REML cold starts on tensor smooths.
567 //
568 // Other families (logit, log, survival) keep the conservative
569 // defaults because their objective is non-quadratic in log-λ
570 // and their gradient is not on an O(n) scale.
571 let gaussian_identity = matches!(cfg.link_function(), LinkFunction::Identity);
572 let n_obs = y_o.len();
573 let prefer_gradient_only = k >= REML_SECOND_ORDER_RHO_CAP;
574 let continuation_prewarm = k < REML_CONTINUATION_PREWARM_RHO_CAP;
575 if prefer_gradient_only {
576 log::info!(
577 "[OUTER] rho_dim {k} reaches exact REML Hessian budget \
578 ({REML_SECOND_ORDER_RHO_CAP}); routing analytic-gradient quasi-Newton"
579 );
580 }
581 if !continuation_prewarm {
582 log::info!(
583 "[OUTER] rho_dim {k} reaches continuation-prewarm budget \
584 ({REML_CONTINUATION_PREWARM_RHO_CAP}); starting optimizer directly from seeds"
585 );
586 }
587 let problem = OuterProblem::new(k)
588 .with_gradient(Derivative::Analytic)
589 .with_hessian(if analytic_outer_hessian_available {
590 DeclaredHessianForm::Either
591 } else {
592 DeclaredHessianForm::Unavailable
593 })
594 .with_prefer_gradient_only(prefer_gradient_only)
595 .with_continuation_prewarm(continuation_prewarm)
596 .with_barrier(
597 crate::estimate::reml::reml_outer_engine::BarrierConfig::from_constraints(
598 fit_linear_constraints.as_ref(),
599 ),
600 )
601 .with_tolerance(reml_tol)
602 .with_max_iter(reml_max_iter)
603 .with_seed_config(reml_seed_config)
604 .with_screening_cap(Arc::clone(&reml_state.screening_max_inner_iterations))
605 .with_outer_inner_cap(reml_inner_progress_feedback(&reml_state))
606 // n-scaled absolute gradient floor for EVERY family (#1082).
607 //
608 // The REML/LAML profiled criterion is a sum over n rows
609 // (deviance / −2·loglik + the penalty/logdet terms), so it and its
610 // ∂/∂logλ gradient inherit an O(n) scale for Poisson, NB, binomial,
611 // Tweedie, beta — exactly as for Gaussian-identity. The previous gate
612 // restricted `with_objective_scale` to the Gaussian-identity arm on
613 // the (incorrect) premise that only that criterion is O(n). For a
614 // non-Gaussian tensor/cyclic/CI/badhealth fit at n≈1.5k–5k the fixed
615 // `abs = tol ≈ 1e-6` gradient floor is then orders of magnitude below
616 // the n-scaled gradient's converged residual: the relative-from-seed
617 // test declares convergence iters earlier, but the binding abs floor
618 // keeps the outer optimizer chasing sub-floor log-λ changes, paying a
619 // full k²·n·p² LAML-Hessian assembly per phantom iteration until it
620 // exhausts the iteration budget — the #1082 outer-loop "cycling"
621 // timeout. Lifting the floor to ~n·1e-9 (the same calibration the
622 // spatial/custom-family outer already uses via `with_problem_size`,
623 // #1053/#1066/#1069) lets the loop terminate as soon as the relative
624 // reduction is met, for every family, while the relative-to-cost
625 // component still owns the actual convergence decision. ARC σ and the
626 // initial trust radius stay Gaussian-gated: those exploit the
627 // Gaussian profile being quadratic-in-log-λ, which is family-specific.
628 .with_objective_scale(Some(n_obs as f64))
629 .with_problem_size(n_obs, x_o.ncols())
630 .with_arc_initial_regularization(if gaussian_identity { Some(0.25) } else { None })
631 .with_operator_initial_trust_radius(if gaussian_identity { Some(4.0) } else { None })
632 .with_rho_bound(crate::estimate::RHO_BOUND)
633 // Make the outer smoothing-parameter search invariant to the order
634 // the smooth terms / tensor margins were written (#1538/#1539). The
635 // structural keys label each ρ-coordinate by its placement-
636 // independent penalty content, so the optimizer canonicalizes the
637 // coordinate layout and resolves the flat double-penalty REML valley
638 // identically for `s(x)+s(z)` vs `s(z)+s(x)` and `te(x,z)` vs
639 // `te(z,x)`. `None` (coordinate count not matching ρ-dim) leaves the
640 // native-order path unchanged.
641 .with_rho_canonical_keys(canon_keys.clone());
642 let problem = if let Some(h) = heuristic_lambdas {
643 problem.with_heuristic_lambdas(h.to_vec())
644 } else {
645 problem
646 };
647 let problem = if let Some(h) = heuristic_lambdas.filter(|h| h.len() == k) {
648 problem.with_initial_rho(Array1::from_iter(h.iter().copied()))
649 } else {
650 problem
651 };
652
653 // Geometric-mean log prior-weight anchor `log g(w) = (1/n₊)·Σ log wᵢ`
654 // over the positive-weight rows. The pure-REML optimum for a *profiled*
655 // (Gaussian-identity) fit drifts by `ρ̂ → ρ̂ + log c` under a global
656 // prior-weight rescale `w → c·w` (`H = XᵀWX + λS`, so λ → c·λ keeps the
657 // penalised curvature proportional to the data curvature, β̂ / EDF /
658 // predictions fixed). The outer ρ-search seed and the relative-from-seed
659 // convergence test would otherwise be referenced to a weight-independent
660 // origin (0), so a heavily up-weighted fit starts `log c` further from
661 // its (shifted) optimum and the optimiser stops short — exactly the
662 // weight-scale non-invariance of λ̂ reported in issue #877. Anchoring the
663 // seed at `log g(w)` makes the search start the SAME relative distance
664 // from the optimum regardless of the weight magnitude.
665 //
666 // This is the SAME gated anchor the outer ρ-prior uses
667 // ([`RemlState::rho_weight_anchor`]): it is the geometric-mean
668 // log-weight for a profiled-dispersion family and *exactly 0* for a
669 // fixed-dispersion family (Poisson, binomial, …). For fixed dispersion
670 // `w = c` is exact `c`-fold replication: the two encodings share an
671 // identical LAML objective and optimum, so anchoring the seed by their
672 // (differing) per-row log-weight mean would seed the weighted encoding
673 // `log c` above its true optimum and the relative-convergence test would
674 // stop it short — over-smoothing vs replication (issue #893). With all
675 // weights 1 (or any fixed-dispersion family) the anchor is exactly 0, so
676 // those fits stay byte-identical.
677 let weight_log_geom_mean: f64 = reml_state.rho_weight_anchor();
678 let gaussian_risk = matches!(
679 reml_seed_config.risk_profile,
680 SeedRiskProfile::Gaussian | SeedRiskProfile::GaussianLocationScale
681 );
682 // The prepass evaluates the *actual* REML/LAML objective on a tiny,
683 // deterministic log-λ grid and only changes startup when that same
684 // criterion improves. It is therefore part of initialization, not a
685 // compatibility fallback. Gaussian fits used to skip this when the
686 // weights were on the unit scale, leaving single-start BFGS/ARC tied to
687 // the arbitrary λ=1 origin; flat or multi-penalty REML surfaces could
688 // then spend the finite outer budget getting into the right basin rather
689 // than resolving the optimum that controls EDF and truth recovery. Run
690 // the same criterion-ranked startup for Gaussian as for GLM/survival,
691 // while retaining the weight-scale anchor from issue #877.
692 let run_gaussian_anchored_prepass = gaussian_risk && weight_log_geom_mean.abs() > 1e-12;
693 // A caller-supplied rho seed (`init_rhos`/`heuristic_lambdas`, now in
694 // rho-space) is an explicit warm-start installed via `with_initial_rho`
695 // above. It still ANCHORS the objective-grid prepass below rather than
696 // short-circuiting it: the grid is criterion-ranked and only adopts a
697 // candidate that STRICTLY lowers the true REML/LAML cost, so a healthy
698 // warm seed is returned unchanged (the grid never beats it → byte-
699 // identical behaviour). What the anchor-and-rank rescues is a warm seed
700 // TRAPPED in a shallow under-smoothing local basin: when the design's
701 // kernel collapses (e.g. the constant-curvature `curv()` smooth fitted
702 // at a trial κ on the +chart side — the geodesic-exponential kernel's
703 // off-diagonals → 1, so its global REML optimum is a LARGE λ that the
704 // local outer optimizer, warm-started from the previous-κ λ̂, slides away
705 // from into the spurious low-λ optimum). The shallow optimum's
706 // spuriously-low deviance made the κ outer objective monotone toward the
707 // +chart bound for any curved data (gam#1464 — hyperbolic truth recovered
708 // as spherical); anchoring the global grid at the warm seed lets the
709 // prepass jump into the correct high-λ basin so the per-κ REML cost
710 // matches the textbook profiled-REML and the curvature SIGN is
711 // identifiable. Same machinery as the gam#1266 double-penalty rescue.
712 let caller_seeded_rho = heuristic_lambdas.is_some_and(|h| h.len() == k);
713 // The grid prepass's lowest-cost sample, kept for the #1371
714 // release-and-rerank guard even when it is not adopted as the initial
715 // seed (i.e. the grid did not strictly move). It is a known-good lower
716 // bound on the achievable REML cost, scored with the SAME functional.
717 // Unconditionally assigned inside the prepass block below (before its
718 // first read by the #1371 guard), so it carries no dead initializer.
719 let release_rerank_seed: Option<Array1<f64>>;
720 // #1548: the well-determined Marra-Wood double-penalty null-space
721 // selection coordinates, recognised exactly as the #1266 shrink-out
722 // escape recognises them (the relaxed `Normal(0, sd=15)` degeneracy
723 // prior, gated by `n ≥ 2·p`). A SUPPORTED such coordinate has a deep
724 // low-λ_null "keep" basin AND a flat high-λ_null annihilation shelf; the
725 // objective-grid prepass below probes the keep corner for exactly these
726 // coordinates so it can seed the well-conditioned keep basin directly
727 // rather than the shelf — see the keep-saturation probe in
728 // `select_objective_seed_on_log_lambda_grid`.
729 let nullspace_seed_coords: Vec<usize> = if n_design_rows >= 2 * p {
730 match reml_state.effective_rho_prior().as_ref() {
731 gam_problem::RhoPrior::Independent(per_coord) => (0..k)
732 .filter(|&i| {
733 per_coord
734 .get(i)
735 .is_some_and(gam_terms::smooth::is_nullspace_degeneracy_prior)
736 })
737 .collect(),
738 _ => Vec::new(),
739 }
740 } else {
741 Vec::new()
742 };
743 let prepass_seed: Option<Array1<f64>> = {
744 let bnds = reml_seed_config.bounds;
745 let (lo, hi_seed) = if bnds.0 <= bnds.1 {
746 bnds
747 } else {
748 (bnds.1, bnds.0)
749 };
750 // The criterion-ranked prepass evaluates the TRUE REML/LAML cost, so
751 // it is safe — and necessary — to let it explore the full
752 // over-smoothing range the outer optimizer itself can reach
753 // (`RHO_BOUND`), not just the narrower default seed-placement band.
754 // A double-penalty (null-space-shrinkage) smooth on data living in
755 // one penalty's null space has its global REML optimum at a LARGE
756 // wiggliness λ (range block fully smoothed), often beyond the seed
757 // band; the cost surface also has a shallower local optimum at a
758 // moderate λ that leaves wiggle under-penalized (EDF inflated,
759 // gam#1266). If the prepass cannot seed past that local optimum, the
760 // outer EFS — which only takes cost-improving steps — relaxes back
761 // into it. The collapsing-kernel spatial smooth (gam#1464) has the
762 // same shape: the high-λ basin sits beyond a shallow low-λ trap.
763 // Widening only the upper (over-smoothing) bound lets the prepass
764 // place the seed in the correct high-λ basin; the lower
765 // (under-smoothing) bound stays at the default so we never seed an
766 // overfit origin. The seed is still only adopted when it strictly
767 // lowers the REML cost, so well-balanced and single-penalty fits are
768 // unaffected.
769 let hi = hi_seed.max(crate::estimate::RHO_BOUND);
770 // risk_shift is the default seed bias when no caller warm-start is given;
771 // it is NOT applied on top of a caller-supplied rho seed.
772 let risk_shift: f64 = match reml_seed_config.risk_profile {
773 SeedRiskProfile::Gaussian | SeedRiskProfile::GaussianLocationScale => 0.0,
774 SeedRiskProfile::GeneralizedLinear => 1.0,
775 SeedRiskProfile::Survival => 2.0,
776 };
777 // Anchor the grid at the caller-supplied `heuristic_lambdas` when one
778 // is present (it is already in rho-space, used as-is) — the grid then
779 // searches relative to the warm start and keeps it unless a candidate
780 // is strictly better. Otherwise anchor the default risk-shift origin
781 // to the weight scale (issue #877).
782 let base = if let Some(h) = heuristic_lambdas.as_ref().filter(|h| h.len() == k) {
783 Array1::from_iter(h.iter().map(|&v| v.clamp(lo, hi)))
784 } else {
785 Array1::from_elem(k, (risk_shift + weight_log_geom_mean).clamp(lo, hi))
786 };
787 // Run the objective-grid seed search in CANONICAL coordinate order
788 // (#1538/#1539) so its greedy per-axis / pairwise-saturation
789 // refinement — which is order-dependent in native layout — explores
790 // the SAME axes for every term order. The grid builds canonical
791 // candidates; the eval closure maps each back to native order before
792 // scoring with the true `compute_cost`, and the refined seed is
793 // mapped native again for `with_initial_rho`. Without a permutation
794 // this is the identity, so the native-order path is byte-for-byte
795 // unchanged.
796 let refined = if let Some(perm) = canon_perm.as_ref() {
797 let to_native = |canon: &Array1<f64>| -> Array1<f64> {
798 let mut out = Array1::zeros(canon.len());
799 for (c, &i) in perm.iter().enumerate() {
800 out[i] = canon[c];
801 }
802 out
803 };
804 let base_canon = Array1::from_iter(perm.iter().map(|&i| base[i]));
805 // Canonical slot `c` carries native coordinate `perm[c]`; a
806 // null-space coordinate must be probed in whichever slot it
807 // occupies in the canonical layout the grid refines.
808 let nullspace_canon: Vec<usize> = (0..k)
809 .filter(|&c| nullspace_seed_coords.contains(&perm[c]))
810 .collect();
811 let refined_canon = crate::seeding::select_objective_seed_on_log_lambda_grid(
812 &base_canon,
813 (lo, hi),
814 k,
815 &nullspace_canon,
816 |canon_rho| {
817 let native = to_native(canon_rho);
818 reml_state
819 .compute_cost(&native)
820 .ok()
821 .filter(|c| c.is_finite())
822 },
823 );
824 to_native(&refined_canon)
825 } else {
826 crate::seeding::select_objective_seed_on_log_lambda_grid(
827 &base,
828 (lo, hi),
829 k,
830 &nullspace_seed_coords,
831 |rho| reml_state.compute_cost(rho).ok().filter(|c| c.is_finite()),
832 )
833 };
834 // Emit the seed when the grid moved it, or — on the Gaussian
835 // weight-anchored path — whenever the anchored `base` is itself
836 // offset from the unanchored origin (so the shifted optimum is
837 // actually seeded even if the coarse grid leaves `base` unchanged).
838 // Record the grid's best sample for the release-and-rerank guard
839 // unconditionally — whether or not it is strong enough to override
840 // the optimizer's own cold start, it is still a scored lower bound
841 // the certified optimum must not be worse than (#1371).
842 release_rerank_seed = Some(refined.clone());
843 let grid_moved = refined
844 .iter()
845 .zip(base.iter())
846 .any(|(&a, &b)| (a - b).abs() > 1e-12);
847 // For a caller-seeded fit, adopt the grid result only when it
848 // STRICTLY moved the warm seed (i.e. found a strictly-cheaper basin);
849 // an unmoved grid leaves the warm start exactly as installed above, so
850 // healthy warm-started fits stay byte-identical. The Gaussian
851 // weight-anchored emit only applies on the non-caller-seeded origin.
852 if grid_moved || (run_gaussian_anchored_prepass && !caller_seeded_rho) {
853 log::info!(
854 "[OUTER] standard REML objective-grid selected seed: {:?} -> {:?}",
855 base.as_slice().unwrap_or(&[]),
856 refined.as_slice().unwrap_or(&[])
857 );
858 Some(refined)
859 } else {
860 None
861 }
862 };
863 let problem = if let Some(seed) = prepass_seed {
864 problem.with_initial_rho(seed)
865 } else {
866 problem
867 };
868 // #1074 DIAGNOSTIC (log-gated, no behavior change unless the crate
869 // logger is installed): sweep each outer log-λ coordinate over a grid
870 // while holding the others at the baseline, logging the REML cost. Used
871 // to decide whether the spatial range railing is an interior optimum the
872 // optimizer misses (optimizer bug) or a genuine criterion preference for
873 // λ→∞ (criterion). Placed BEFORE the objective takes its `&mut
874 // reml_state` borrow so the immutable `compute_cost` reads are valid.
875 // Emitted at warn level so the default-installed crate logger (Info)
876 // prints it without a level change (the ban-scanner forbids direct
877 // stderr printing and process-env reads).
878 if log::log_enabled!(log::Level::Warn) {
879 let grid = [
880 -5.0_f64, -2.0, 0.0, 2.0, 5.0, 8.0, 10.0, 12.0, 16.0, 20.0, 25.0, 30.0,
881 ];
882 let mut baselines: Vec<(&str, Array1<f64>)> =
883 vec![("zeros", Array1::<f64>::zeros(k))];
884 if k == 4 {
885 baselines.push(("conv", Array1::from(vec![9.0_f64, 30.0, 12.0, 30.0])));
886 }
887 if k == 2 {
888 baselines.push(("conv6", Array1::from(vec![6.0_f64, 6.0])));
889 }
890 for (label, baseline) in &baselines {
891 log::warn!("[#1074-sweep] k={k} baseline={label}={baseline:?}");
892 for coord in 0..k {
893 let mut line = format!("[#1074-sweep:{label}] coord={coord}:");
894 for &rho in &grid {
895 let mut p = baseline.clone();
896 p[coord] = rho;
897 let cg = reml_state.compute_cost_and_gradient(&p).ok();
898 let cell = match cg {
899 Some((c, g)) => {
900 format!(
901 "{c:.4}(g{}={:.3e})",
902 coord,
903 g.get(coord).copied().unwrap_or(f64::NAN)
904 )
905 }
906 None => "ERR".to_string(),
907 };
908 line.push_str(&format!(" {rho:.0}->{cell}"));
909 }
910 log::warn!("{line}");
911 }
912 }
913 }
914
915 // Attach the outer-loop cache session. The session shares its
916 // realized-fit-context key with the inner beta record (different
917 // payload namespace), so a SIGKILL mid-outer-iter leaves both the
918 // last accepted β (inner record) and the best rho seen so far
919 // (outer iterate) on disk for the next run.
920 let problem = match reml_state.outer_cache_session() {
921 Some(session) => problem.with_cache_session(session),
922 None => problem,
923 };
924
925 let obj = problem.build_objective_with_screening_proxy(
926 &mut reml_state,
927 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
928 rho: &Array1<f64>| { state.compute_cost(rho) },
929 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
930 rho: &Array1<f64>| {
931 outer_eval_idx.fetch_add(1, Ordering::Relaxed);
932 state.compute_outer_eval_with_order(
933 rho,
934 if analytic_outer_hessian_available {
935 OuterEvalOrder::ValueGradientHessian
936 } else {
937 OuterEvalOrder::ValueAndGradient
938 },
939 )
940 },
941 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
942 rho: &Array1<f64>,
943 order: OuterEvalOrder| {
944 outer_eval_idx.fetch_add(1, Ordering::Relaxed);
945 state.compute_outer_eval_with_order(rho, order)
946 },
947 Some(
948 |state: &mut &mut crate::estimate::reml::RemlState<'_>| {
949 state.reset_outer_seed_state()
950 },
951 ),
952 Some(
953 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
954 rho: &Array1<f64>| { state.compute_efs_steps(rho) },
955 ),
956 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
957 rho: &Array1<f64>| { state.compute_screening_proxy(rho) },
958 );
959 // Standard REML's eval closure publishes
960 // `inner_beta_hint = state.current_original_basis_beta()` on
961 // every accepted eval. The continuation pre-warm carries that
962 // hint forward and calls `seed_inner_state(beta)` before the
963 // next eval — see src/solver/reml/continuation.rs:209-212,
964 // 434-438. Without a hook here, `ClosureObjective::seed_inner_state`
965 // (src/solver/rho_optimizer.rs:2097-2107) rejected any
966 // non-empty β fatally, dropping every seed before the inner
967 // solver started (issue #236). Wire the symmetric consumer:
968 // when the pre-warm forwards the cached β, install it into the
969 // same `warm_start_beta` slot the publisher reads from.
970 let mut obj = obj.with_seed_inner_state(with_reml_beta_seed_hook());
971
972 let mut strategy_result = problem.run(&mut obj, "standard REML")?;
973 drop(obj);
974 // #1371 release-and-rerank guard. The continuation oversmoothing
975 // warm-start can deliver the inner β on the high-λ null-space
976 // "annihilation" shelf of a double-penalty smooth: there the
977 // null-space coefficients are already shrunk to ~0, so the deviance
978 // ρ-gradient vanishes (∂dev/∂ρ_null → 0) AND the Occam terms
979 // (½ tr(H⁻¹ ∂H/∂ρ) − ½ λ tr(S⁺ S_k)) cancel, leaving the analytic
980 // outer gradient ≈ 0. ARC then certifies that point as a stationary
981 // optimum even though its REML cost is FAR ABOVE a point the seed
982 // prepass already evaluated — driving a genuinely-supported null-space
983 // direction (a real linear trend, gam#1371) to EDF → 0. The seed
984 // prepass's grid-refined seed is a known-good lower bound on the cost
985 // (it was scored with the SAME `compute_cost`), so if the certified
986 // optimum is strictly worse than it, re-rank to the seed: re-running
987 // the inner solve there installs the correct β̂. This cannot regress a
988 // fit whose optimum genuinely IS the high-λ corner (gam#1266: an
989 // unsupported term shrinking out) — there the corner is the
990 // lowest-cost point, no cheaper seed exists, and the guard is a no-op.
991 if let Some(seed) = release_rerank_seed.as_ref() {
992 // The certified cost is the optimizer's OWN authoritative
993 // `final_value`, NOT a fresh `compute_cost(strategy_result.rho)`
994 // re-evaluation (load-bearing, #1426/#1477). The REML/LAML objective
995 // for a non-Gaussian family is NOT a pure function of ρ: it carries a
996 // profiled dispersion / nuisance that is established by the inner
997 // solve at the operating ρ, and `compute_cost` warm-starts the inner
998 // PIRLS from whatever β̂/φ the previous eval left behind. Probing the
999 // under-penalized prepass seed FIRST (necessary so the no-op path can
1000 // leave β̂ at the seed for a clean re-install) pollutes that nuisance
1001 // state, so a subsequent re-eval of the cleanly-converged ρ comes back
1002 // a few REML units ABOVE its true certified cost — e.g. a Gamma/log
1003 // optimum certified at 829.857 re-evaluates at 834.90 right after the
1004 // seed probe, which is then (wrongly) above the seed's own 833.47 and
1005 // the guard "escapes" to the under-penalized seed, shipping a
1006 // near-full-basis overfit (EDF ≈ k, falsely tagged converged) that the
1007 // seed loop's keep-best had already rejected (#1426 silent overfit;
1008 // #1477 Tweedie boundary/EDF blow-up). `final_value` was scored at the
1009 // converged ρ with ITS own inner solve, so it is immune to that probe
1010 // pollution and is the honest cost to compare the seed against.
1011 let cost_converged = strategy_result.final_value;
1012 // The seed probe is a non-fatal measurement; a seed that fails to
1013 // evaluate simply skips the comparison. It leaves β̂ at the seed, so
1014 // the no-op branch below relies on the unified β̂ re-install after the
1015 // guard to restore it at `strategy_result.rho`.
1016 //
1017 // Probe the seed WITH its outer gradient (not cost alone): the grid
1018 // prepass scored the seed by `compute_cost`, which runs the inner
1019 // P-IRLS — at an under-penalized (λ→0) ρ the inner solve hits its
1020 // iteration cap and reports a spuriously LOW cost (an invalid REML
1021 // value the line search could not improve) while the analytic outer
1022 // gradient still points strongly toward more penalization. The #1371
1023 // false high-λ shelf this guard exists to escape is, by contrast, a
1024 // GENUINE cheaper optimum: its seed is stationary. Even with the
1025 // pollution-free `final_value` comparison above, a stuck stall can
1026 // still under-cut it on raw cost, so only a seed whose cost is
1027 // trustworthy (small residual gradient) may override the certified ρ.
1028 let seed_eval = reml_state.compute_cost_and_gradient(seed).ok();
1029 // Strict relative improvement so a numerically-equal seed (the common
1030 // case where the optimizer reached the seed's basin) is left untouched
1031 // and the fit stays byte-identical.
1032 let floor = 1e-6 * (1.0 + cost_converged.abs());
1033 if let Some((cost_seed, grad_seed)) = seed_eval.filter(|(c, _)| c.is_finite())
1034 && cost_converged.is_finite()
1035 && cost_seed < cost_converged - floor
1036 {
1037 // Bound-projected residual gradient at the seed (same criterion
1038 // `nonconverged_cost_is_trustworthy` / the flat-valley stall guard
1039 // use): a component pinned at a bound by a gradient pushing past
1040 // it is feasible-stationary and drops out of the norm.
1041 let (blo, bhi) = {
1042 let (a, b) = reml_seed_config.bounds;
1043 let (lo, hi) = if a <= b { (a, b) } else { (b, a) };
1044 (lo, hi.max(crate::estimate::RHO_BOUND))
1045 };
1046 let seed_grad_norm = {
1047 let mut sumsq = 0.0;
1048 for (i, &g) in grad_seed.iter().enumerate() {
1049 let s = seed.get(i).copied().unwrap_or(0.0);
1050 let pinned_lo = s <= blo + 1e-9 && g > 0.0;
1051 let pinned_hi = s >= bhi - 1e-9 && g < 0.0;
1052 if !(pinned_lo || pinned_hi) {
1053 sumsq += g * g;
1054 }
1055 }
1056 sumsq.sqrt()
1057 };
1058 let seed_cost_trustworthy = seed_grad_norm.is_finite()
1059 && seed_grad_norm
1060 <= crate::rho_optimizer::FLAT_VALLEY_STALL_GRAD_CEILING;
1061 if seed_cost_trustworthy {
1062 log::info!(
1063 "[OUTER] #1371 release-and-rerank: certified ρ cost {cost_converged:.6e} \
1064 exceeds the prepass seed cost {cost_seed:.6e} (seed |g|={seed_grad_norm:.3e} \
1065 ≤ ceiling); adopting the seed (false high-λ stationary shelf escaped)"
1066 );
1067 strategy_result.rho = seed.clone();
1068 strategy_result.converged = true;
1069 } else {
1070 // #1426 leak: the cheaper seed is a stuck under-penalized
1071 // (λ→0) stall, not a genuine optimum — its low cost is an
1072 // inner-cap artifact. Adopting it would ship the near-full-
1073 // basis overfit (EDF ≈ k) and, worse, certify it converged.
1074 // Keep the honest certified ρ. β̂ is restored by the unified
1075 // re-install after the guard.
1076 log::info!(
1077 "[OUTER] #1371 release-and-rerank: prepass seed cost {cost_seed:.6e} is \
1078 cheaper than certified ρ {cost_converged:.6e} but UNTRUSTWORTHY \
1079 (seed |g|={seed_grad_norm:.3e} > ceiling — stuck under-penalized stall, \
1080 #1426); keeping the certified ρ"
1081 );
1082 }
1083 }
1084 // Re-install β̂ at the (possibly newly-adopted) reported ρ so the
1085 // cached inner state matches `strategy_result.rho` for the downstream
1086 // cap-guard / final assembly — whether the guard fired (β̂ → seed) or
1087 // was a no-op (β̂ → the certified ρ, undoing the seed probe).
1088 reml_state.compute_cost(&strategy_result.rho)?;
1089 }
1090 // #1074 UPPER-BOUND INWARD-DESCENT ESCAPE. The outer cost-stall /
1091 // convergence check projects out a coordinate sitting on the ρ upper
1092 // bound, so a coordinate that was driven to the over-smoothing rail can
1093 // be certified "converged" even when the REML criterion is strictly
1094 // LOWER at an interior ρ (a feasible inward descent the projection
1095 // masks). On `s(long,lat,bs="tp") + s(depth)` the spatial/depth
1096 // NULL-SPACE (affine-trend) coordinates rail to ρ=30 while
1097 // `compute_cost` is ~23/~5 units lower at ρ≈2, annihilating a SUPPORTED
1098 // spatial trend (#1074). This guard runs a bounded, keep-best
1099 // coordinate-descent polish on EXACTLY the coordinates pinned at the
1100 // upper bound: for each, it line-searches `compute_cost` over a coarse
1101 // inward grid and adopts the strictly-best ρ. It uses the same
1102 // authoritative `compute_cost` the optimizer minimizes, so it can only
1103 // LOWER the certified cost — it never raises it and is a no-op when the
1104 // rail genuinely is the optimum (an unsupported term shrinking out,
1105 // #1266/#1271: no interior point is cheaper, so nothing is adopted).
1106 {
1107 let rho_upper = crate::estimate::RHO_BOUND;
1108 // Coordinates eligible for the inward (less-smoothing) descent. Two
1109 // kinds qualify:
1110 // (1) any coordinate pinned at the ρ upper rail — the original
1111 // #1074 case: the outer convergence check projects out a
1112 // rail-pinned coordinate, masking a feasible interior descent;
1113 // (2) the well-determined double-penalty NULL-SPACE selection
1114 // coordinates (`is_nullspace_degeneracy_prior`). These sit on a
1115 // near-FLAT REML ridge in λ_null, so the outer optimizer can
1116 // certify convergence at ANY high-but-not-railed ρ_null
1117 // depending on its (floating-point) iterate path. Reflecting the
1118 // covariate `x → −x` reverses the basis column order and flips
1119 // that landing shoulder: a SUPPORTED affine trend is kept at
1120 // ρ_null ≈ 0 in one orientation but over-penalized to e.g.
1121 // ρ_null ≈ 25 in the mirror (#1548), even though neither is at
1122 // the exact rail. Descending these to the cheaper interior
1123 // optimum lands BOTH orientations on the same shoulder.
1124 // The descent below probes ONLY strictly-lower ρ, so it never
1125 // over-smooths (raising λ_null is the #1266 escape's job, with its
1126 // EDF parsimony guard against the #1476 concurvity transfer). It is
1127 // keep-best + cold-confirmed against the authoritative penalized cost,
1128 // so it is an exact no-op wherever the current ρ already is the
1129 // optimum — e.g. an unsupported trend correctly shrunk out at the rail
1130 // (#1266/#1271), where no interior point is cheaper.
1131 let mut descent_coords: Vec<usize> = (0..strategy_result.rho.len())
1132 .filter(|&i| strategy_result.rho[i] >= rho_upper - 1e-9)
1133 .collect();
1134 if n_design_rows >= 2 * p
1135 && let gam_problem::RhoPrior::Independent(per_coord) =
1136 reml_state.effective_rho_prior().as_ref()
1137 {
1138 for i in 0..strategy_result.rho.len() {
1139 if strategy_result.rho[i] < rho_upper - 1e-9
1140 && per_coord
1141 .get(i)
1142 .is_some_and(gam_terms::smooth::is_nullspace_degeneracy_prior)
1143 {
1144 descent_coords.push(i);
1145 }
1146 }
1147 }
1148 // Baseline to beat = the optimizer's OWN authoritative converged
1149 // cost (`final_value`), which was scored at the converged ρ with its
1150 // own inner solve and is immune to warm-start pollution from the
1151 // probes below (the #1371 lesson). A probe only wins if it is
1152 // strictly cheaper than this honest cost.
1153 let base_cost = strategy_result.final_value;
1154 if !descent_coords.is_empty() && base_cost.is_finite() {
1155 // Inward probe grid (descending from the rail). Bounded and
1156 // cheap: at most 2 · |railed| · 8 inner solves, and only when a
1157 // coord is actually pinned at the upper rail. Two coordinate-
1158 // descent passes pick up cross-coordinate coupling between the
1159 // railed axes.
1160 const INWARD_GRID: [f64; 8] = [25.0, 20.0, 15.0, 10.0, 5.0, 2.0, 0.0, -2.0];
1161 let mut best_rho = strategy_result.rho.clone();
1162 let mut best_cost = base_cost;
1163 let mut improved = false;
1164 for _pass in 0..2 {
1165 let mut pass_improved = false;
1166 for &coord in &descent_coords {
1167 let mut local_best = best_rho.clone();
1168 let mut local_cost = best_cost;
1169 for &cand in &INWARD_GRID {
1170 // Inward escape only ever DESCENDS (less smoothing):
1171 // skip any grid point at or above this coordinate's
1172 // current ρ. Over-smoothing a null-space coordinate is
1173 // the #1266 escape's job (it carries the EDF parsimony
1174 // guard that prevents the #1476 concurvity transfer);
1175 // this guard must never raise λ without it.
1176 if cand >= best_rho[coord] - 1e-9 {
1177 continue;
1178 }
1179 let mut probe = best_rho.clone();
1180 probe[coord] = cand;
1181 if let Ok(c) = reml_state.compute_cost(&probe)
1182 && c.is_finite()
1183 && c < local_cost - 1e-6 * (1.0 + local_cost.abs())
1184 {
1185 local_cost = c;
1186 local_best = probe;
1187 }
1188 }
1189 if local_cost < best_cost - 1e-6 * (1.0 + best_cost.abs()) {
1190 best_rho = local_best;
1191 best_cost = local_cost;
1192 improved = true;
1193 pass_improved = true;
1194 }
1195 }
1196 if !pass_improved {
1197 break;
1198 }
1199 }
1200 // CONTINUOUS REFINEMENT of each descended coordinate. The coarse
1201 // INWARD_GRID snaps λ to a grid node (e.g. ρ_null = 0), but in the
1202 // OTHER covariate orientation the outer optimizer reports the
1203 // continuous interior minimizer (e.g. ρ_null = −0.37). Leaving one
1204 // orientation on the grid node while the other keeps the continuous
1205 // optimum leaves a small residual reflection asymmetry (#1548:
1206 // ~1.7e-3 mirror drift survives the grid descent alone). Golden-
1207 // section the SAME authoritative penalized cost on each moved
1208 // coordinate so both orientations converge to the identical
1209 // continuous minimum. It can only lower the cost from the grid node
1210 // (the bracket straddles it), and the cold confirmation below still
1211 // gates adoption, so this never raises the certified cost.
1212 if improved {
1213 const GS_R: f64 = 0.618_033_988_749_894_8; // (√5 − 1) / 2
1214 for coord in descent_coords.clone() {
1215 if (best_rho[coord] - strategy_result.rho[coord]).abs() <= 1e-9 {
1216 continue; // coordinate did not descend
1217 }
1218 // Bracket straddling the adopted grid node, never re-entering
1219 // the over-smoothing region above the coordinate's start ρ.
1220 let node = best_rho[coord];
1221 let mut a = node - 3.0;
1222 let mut b = (node + 3.0).min(strategy_result.rho[coord]);
1223 if b <= a + 1e-6 {
1224 continue;
1225 }
1226 let cost_at =
1227 |st: &mut RemlState, base: &Array1<f64>, x: f64| -> Option<f64> {
1228 let mut p = base.clone();
1229 p[coord] = x;
1230 st.compute_cost(&p).ok().filter(|c| c.is_finite())
1231 };
1232 let mut c = b - GS_R * (b - a);
1233 let mut d = a + GS_R * (b - a);
1234 let mut fc = cost_at(&mut reml_state, &best_rho, c);
1235 let mut fd = cost_at(&mut reml_state, &best_rho, d);
1236 let mut refine_ok = fc.is_some() && fd.is_some();
1237 for _ in 0..40 {
1238 if (b - a).abs() < 1e-4 {
1239 break;
1240 }
1241 match (fc, fd) {
1242 (Some(vc), Some(vd)) if vc <= vd => {
1243 b = d;
1244 d = c;
1245 fd = fc;
1246 c = b - GS_R * (b - a);
1247 fc = cost_at(&mut reml_state, &best_rho, c);
1248 }
1249 (Some(_), Some(_)) => {
1250 a = c;
1251 c = d;
1252 fc = fd;
1253 d = a + GS_R * (b - a);
1254 fd = cost_at(&mut reml_state, &best_rho, d);
1255 }
1256 _ => {
1257 refine_ok = false;
1258 break;
1259 }
1260 }
1261 }
1262 if refine_ok {
1263 let xm = 0.5 * (a + b);
1264 if let Some(fm) = cost_at(&mut reml_state, &best_rho, xm)
1265 && fm < best_cost
1266 {
1267 best_rho[coord] = xm;
1268 best_cost = fm;
1269 }
1270 }
1271 }
1272 }
1273 if improved {
1274 // COLD CONFIRMATION (guards against adopting a warm-start /
1275 // inner-cap artifact, the #1426/#1371 trap). The grid probes
1276 // ran warm-started off each other; a λ→0-ish interior point
1277 // can report a spuriously low cost from a capped inner solve.
1278 // Clear the inner cache and re-score the candidate cold; only
1279 // adopt if it STILL strictly beats the authoritative
1280 // `final_value`.
1281 reml_state.reset_outer_seed_state();
1282 let cold = reml_state.compute_cost(&best_rho);
1283 let cold_ok = matches!(cold, Ok(c)
1284 if c.is_finite() && c < base_cost - 1e-6 * (1.0 + base_cost.abs()));
1285 if cold_ok {
1286 let cold_cost = cold.unwrap_or(best_cost);
1287 log::info!(
1288 "[OUTER] #1074/#1548 upper-bound escape: certified ρ cost \
1289 {base_cost:.6e} lowered to {cold_cost:.6e} (cold-confirmed) by \
1290 descending {} over-smoothed coord(s) inward; adopting the cheaper \
1291 interior ρ",
1292 descent_coords.len()
1293 );
1294 strategy_result.rho = best_rho;
1295 strategy_result.final_value = cold_cost;
1296 // β̂ already installed at `best_rho` by the cold eval above.
1297 } else {
1298 // The improvement did not survive a cold re-score — it was
1299 // a warm-start artifact. Keep the certified ρ and restore
1300 // its inner state for downstream assembly.
1301 reml_state.reset_outer_seed_state();
1302 reml_state.compute_cost(&strategy_result.rho)?;
1303 }
1304 }
1305 }
1306 }
1307 // #1266 NULL-SPACE SHRINK-OUT ESCAPE (pure-REML; the OUTWARD-direction
1308 // dual of the #1074 inward escape above).
1309 //
1310 // A default double-penalty smooth (mgcv `select = TRUE`) carries a
1311 // `DoublePenaltyNullspace` shrinkage ridge on the term's penalty null
1312 // space ({1, x} for a 1-D bend) whose only job is SELECTION: drive its
1313 // λ_null UP to shrink an UNSUPPORTED term's constant+linear component out
1314 // (EDF → 0). On a well-determined Gaussian fit the relaxed ρ-prior places
1315 // a WIDE, symmetric `Normal(0, sd=15)` on that coordinate — NOT as a
1316 // selection criterion but purely as a degeneracy-breaker: the #1476
1317 // concurvity flat-ridge needs strictly-positive outer curvature to
1318 // certify an interior allocation. That symmetric prior's `ρ/sd²` gradient
1319 // also OPPOSES the (genuinely shallow) REML shrink-out tail, so the outer
1320 // optimizer certifies a stationary point at a MODERATE λ_null
1321 // (ρ_null ≈ 3.5, EDF ≈ 1.6) instead of following pure REML to the
1322 // shrink-out corner — the residual #1266 "Half B" contract violation. The
1323 // prior cannot be made one-sided: its high-ρ curvature is exactly what
1324 // stops a SUPPORTED concurvity null space from railing out (#1476), so a
1325 // data-INDEPENDENT prior cannot separate "shrink the unsupported term"
1326 // (#1266) from "keep the supported one" (#1476/#1371) — they overlap in ρ.
1327 //
1328 // The data-DEPENDENT discriminator is pure data-REML PLUS a parsimony
1329 // check. For each well-determined null-space selection coordinate,
1330 // line-search the OVER-SMOOTHING (high-ρ) direction on the PURE REML cost
1331 // (`compute_cost − configured_ρ_prior`; the prior is a conditioning
1332 // device, not a selection criterion), then adopt the strictly-best
1333 // COLD-confirmed point ONLY if it also does not increase the model's total
1334 // EDF:
1335 // * UNSUPPORTED, uncorrelated null space (#1266 `s(z)`): pure REML
1336 // descends toward shrink-out AND total EDF drops (z carries no signal,
1337 // so nothing absorbs it) → the escape fires, EDF → 0.
1338 // * SUPPORTED null space (#1371 genuine slope): pure REML strictly RISES
1339 // under over-smoothing (killing a real linear trend dumps its variance
1340 // into σ̂²) → no strict improvement → exact no-op.
1341 // * CONCURVITY null space (#1476 `s(x1)+s(x2)`, corr ≈ 0.9): pure REML
1342 // *marginally* prefers over-smoothing one coordinate because the inner
1343 // β re-solve lets the CORRELATED partner absorb the shared signal — the
1344 // "signal transfer" the degeneracy prior exists to forbid. That
1345 // transfer keeps the deviance flat but INFLATES total EDF (the partner
1346 // spends extra basis), so the EDF-non-increase guard vetoes it →
1347 // no-op, the interior allocation is kept. (Pure REML alone cannot see
1348 // this: the concurvity ridge is flat, so the transfer reads as a tiny
1349 // improvement; the parsimony guard is what distinguishes a genuine
1350 // simplification from a lateral reallocation.)
1351 //
1352 // Unlike #1074 (where the OPTIMIZER's bound projection masks the descent),
1353 // here it is the PRIOR that masks it, so the search runs on the pure
1354 // (prior-stripped) criterion. SCOPE: eligible coordinates are exactly the
1355 // well-determined relaxed null-space degeneracy coordinates
1356 // (`is_nullspace_degeneracy_prior`, gated by `n ≥ 2·p`). This deliberately
1357 // EXCLUDES the under-determined regime (`n < 2·p`, #1392 wine `p > n`),
1358 // where the null-space prior is the AGGRESSIVE PC select-out — a
1359 // deliberate, load-bearing selection push onto a genuinely-flat REML
1360 // score that stripping would undo.
1361 {
1362 let well_determined = n_design_rows >= 2 * p;
1363 let select_coords: Vec<usize> = if well_determined {
1364 match reml_state.effective_rho_prior().as_ref() {
1365 gam_problem::RhoPrior::Independent(per_coord) => {
1366 (0..strategy_result.rho.len())
1367 .filter(|&i| {
1368 per_coord.get(i).is_some_and(
1369 gam_terms::smooth::is_nullspace_degeneracy_prior,
1370 )
1371 })
1372 .collect()
1373 }
1374 _ => Vec::new(),
1375 }
1376 } else {
1377 Vec::new()
1378 };
1379 // Authoritative pure-REML baseline at the converged ρ: the optimizer's
1380 // own `final_value` (immune to warm-start pollution, the #1371 lesson)
1381 // minus the configured ρ-prior + soft λ→0 guard it carried. A probe
1382 // wins only if it strictly beats THIS pure cost.
1383 let conv_prior = reml_state
1384 .configured_rho_prior_atom(&strategy_result.rho)
1385 .cost()
1386 + reml_state
1387 .soft_rho_guard_prior_atom(&strategy_result.rho)
1388 .cost();
1389 let base_pure = strategy_result.final_value - conv_prior;
1390 if !select_coords.is_empty() && base_pure.is_finite() && conv_prior.is_finite() {
1391 // Converged-point total inner EDF, for the PARSIMONY guard below.
1392 // The inner P-IRLS solve at the converged ρ is cached, so this is
1393 // free. A genuine #1266 shrink-out (an UNSUPPORTED, uncorrelated
1394 // term selected out) strictly LOWERS the model's total EDF; a
1395 // concurvity TRANSFER (#1476: one null-space shrinks but its
1396 // correlated partner absorbs the signal via the inner β re-solve)
1397 // INFLATES it. Pure REML alone marginally prefers the transfer on
1398 // a flat concurvity ridge — exactly the allocation the degeneracy
1399 // prior exists to forbid — so the escape must additionally refuse
1400 // any adoption that does not reduce total EDF.
1401 let edf_conv = reml_state
1402 .obtain_eval_bundle(&strategy_result.rho)
1403 .ok()
1404 .map(|b| b.pirls_result.edf);
1405 // Pure data-REML at ρ: penalized `compute_cost` minus the configured
1406 // ρ-prior and the soft λ→0 guard (both `O(K)` functions of ρ alone).
1407 // Subtracting them recovers the mgcv-parity criterion selection
1408 // must follow; the prior bias on λ_null is removed exactly.
1409 let pure_reml = |rho: &Array1<f64>| -> Option<f64> {
1410 let c = reml_state.compute_cost(rho).ok()?;
1411 if !c.is_finite() {
1412 return None;
1413 }
1414 let prior = reml_state.configured_rho_prior_atom(rho).cost()
1415 + reml_state.soft_rho_guard_prior_atom(rho).cost();
1416 if !prior.is_finite() {
1417 return None;
1418 }
1419 Some(c - prior)
1420 };
1421 // Ascending over-smoothing grid in ABSOLUTE ρ (toward the
1422 // shrink-out rail at `RHO_BOUND`); only values strictly above a
1423 // coordinate's current ρ are over-smoothing candidates. Bounded:
1424 // at most 2 · |select| · 6 inner solves, and only fires when a
1425 // null-space coordinate is actually held below the rail.
1426 let rho_upper = crate::estimate::RHO_BOUND;
1427 const OUTWARD_GRID: [f64; 6] = [6.0, 9.0, 12.0, 18.0, 24.0, 30.0];
1428 let mut best_rho = strategy_result.rho.clone();
1429 let mut best_pure = base_pure;
1430 let mut improved = false;
1431 for _pass in 0..2 {
1432 let mut pass_improved = false;
1433 for &coord in &select_coords {
1434 let mut local_best = best_rho.clone();
1435 let mut local_pure = best_pure;
1436 for &cand in &OUTWARD_GRID {
1437 let target = cand.min(rho_upper);
1438 if target <= best_rho[coord] + 1e-9 {
1439 continue;
1440 }
1441 let mut probe = best_rho.clone();
1442 probe[coord] = target;
1443 if let Some(c) = pure_reml(&probe)
1444 && c < local_pure - 1e-6 * (1.0 + local_pure.abs())
1445 {
1446 local_pure = c;
1447 local_best = probe;
1448 }
1449 }
1450 if local_pure < best_pure - 1e-6 * (1.0 + best_pure.abs()) {
1451 best_rho = local_best;
1452 best_pure = local_pure;
1453 improved = true;
1454 pass_improved = true;
1455 }
1456 }
1457 if !pass_improved {
1458 break;
1459 }
1460 }
1461 if improved {
1462 // COLD confirmation (mirror of #1074): the warm grid probes
1463 // ran off each other's inner warm starts and can report a
1464 // spuriously-low cost. Clear the inner cache and re-score the
1465 // candidate cold; adopt only if its PURE REML STILL strictly
1466 // beats the authoritative converged baseline.
1467 reml_state.reset_outer_seed_state();
1468 let cold_penalized = reml_state.compute_cost(&best_rho);
1469 let cold_pure = cold_penalized.as_ref().ok().and_then(|&c| {
1470 c.is_finite().then(|| {
1471 c - reml_state.configured_rho_prior_atom(&best_rho).cost()
1472 - reml_state.soft_rho_guard_prior_atom(&best_rho).cost()
1473 })
1474 });
1475 // Total inner EDF at the candidate (cached from the cold eval).
1476 // The PARSIMONY guard: a genuine shrink-out must not INCREASE
1477 // the model's effective dimension (see `edf_conv`). When either
1478 // EDF is unavailable, refuse the adoption — a shrink that can't
1479 // be certified parsimonious is not worth the #1476 risk.
1480 let edf_best = reml_state
1481 .obtain_eval_bundle(&best_rho)
1482 .ok()
1483 .map(|b| b.pirls_result.edf);
1484 let edf_non_increasing = match (edf_best, edf_conv) {
1485 (Some(eb), Some(ec)) => eb <= ec + 1e-6,
1486 _ => false,
1487 };
1488 if let (Ok(penalized), Some(cold_pure)) = (cold_penalized, cold_pure)
1489 && cold_pure.is_finite()
1490 && cold_pure < base_pure - 1e-6 * (1.0 + base_pure.abs())
1491 && edf_non_increasing
1492 {
1493 // β̂ already installed at `best_rho` by the cold eval above.
1494 // Report the PENALIZED cost there as the objective so the
1495 // cached inner state and `final_value` agree with the
1496 // adopted ρ for the downstream cap-guard / assembly.
1497 log::info!(
1498 "[OUTER] #1266 null-space shrink-out escape: pure REML \
1499 {base_pure:.6e} → {cold_pure:.6e} (cold-confirmed), total \
1500 EDF {edf_conv:?} → {edf_best:?} (parsimonious) by \
1501 over-smoothing {} selection coord(s); adopting the \
1502 shrink-out ρ (penalized cost {penalized:.6e})",
1503 select_coords.len()
1504 );
1505 strategy_result.rho = best_rho;
1506 strategy_result.final_value = penalized;
1507 } else {
1508 // The improvement did not survive a cold re-score (or the
1509 // re-score failed) — a warm-start artifact. Keep the
1510 // certified ρ and restore its inner state.
1511 reml_state.reset_outer_seed_state();
1512 reml_state.compute_cost(&strategy_result.rho)?;
1513 }
1514 }
1515 }
1516 }
1517 // Convergence guard for the outer-aware inner-PIRLS schedule
1518 // (path #3): the BFGS bridge stores a coarsen-then-tighten cap
1519 // into `reml_state.outer_inner_cap` on every accepted gradient
1520 // eval. After the outer optimizer returns, the cached warm-start
1521 // β was computed at whatever cap the schedule last set — which
1522 // for fast-converging fits (≤5 BFGS iters) is a coarse cap of
1523 // 5/10/20 rather than the full inner budget. Reset the cap to 0
1524 // and run one final cost eval at the converged ρ so the cached
1525 // β is at full inner tolerance.
1526 run_outer_inner_cap_guard(
1527 &mut reml_state,
1528 &strategy_result.rho,
1529 RemlInnerCapGuardArm::Standard,
1530 )?;
1531 // Honour an explicit caller rho seed as the accepted log-λ: when the
1532 // caller pins `init_rhos`, the outer search is warm-started there and
1533 // the seed is the requested operating point, so report it verbatim
1534 // rather than the optimizer's (possibly clamped) returned rho.
1535 //
1536 // EXCEPTION (gam#1464): a caller seed that arrives as a warm-start hint
1537 // (the spatial-κ sweep reuses the previous-κ λ̂ as `heuristic_lambdas`)
1538 // must NOT pin the fit at a seed the optimizer has just been able to
1539 // strictly improve on. At a collapsing kernel (the constant-curvature
1540 // `curv()` smooth on the +κ side) the warm seed sits in a shallow
1541 // under-smoothing basin whose spuriously-low deviance, if reported
1542 // verbatim, makes the κ outer objective rail to the +chart bound for any
1543 // curved data. The objective-grid prepass and the #1371 release-and-
1544 // rerank guard above redirect `strategy_result.rho` into the correct
1545 // high-λ basin; defer to that converged ρ whenever it is STRICTLY cheaper
1546 // than the caller seed under the same REML cost. A genuine user pin (or a
1547 // healthy warm start) converges at the seed, so the seed stays cheapest
1548 // and is honoured verbatim, byte-for-byte as before.
1549 let accepted_rho = match heuristic_lambdas.filter(|h| h.len() == k) {
1550 Some(h) => {
1551 let seed = Array1::from_iter(h.iter().copied());
1552 let prefer_converged = {
1553 let cost_seed = reml_state.compute_cost(&seed).ok();
1554 let cost_converged = reml_state.compute_cost(&strategy_result.rho).ok();
1555 // Restore the cached β̂ to the converged operating point after
1556 // the seed probe (the no-op path below expects β̂ at
1557 // `strategy_result.rho`). Propagate any failure rather than
1558 // swallowing it: proceeding with β̂ at the wrong operating
1559 // point would silently corrupt the reported fit.
1560 reml_state.compute_cost(&strategy_result.rho)?;
1561 match (cost_seed, cost_converged) {
1562 (Some(cs), Some(cc)) if cs.is_finite() && cc.is_finite() => {
1563 cc < cs - 1e-6 * (1.0 + cs.abs())
1564 }
1565 _ => false,
1566 }
1567 };
1568 if prefer_converged {
1569 log::info!(
1570 "[OUTER] #1464 warm-seed override: converged ρ is strictly cheaper than \
1571 the caller warm seed; reporting the optimizer's ρ instead of the seed"
1572 );
1573 strategy_result.rho.clone()
1574 } else {
1575 seed
1576 }
1577 }
1578 None => strategy_result.rho.clone(),
1579 };
1580 (
1581 accepted_rho,
1582 cfg.link_kind.mixture_state().cloned(),
1583 cfg.link_kind.sas_state().copied(),
1584 None,
1585 None,
1586 strategy_result,
1587 )
1588 } else {
1589 let use_mixture = mixture_dim > 0;
1590 let use_sas = sas_dim > 0;
1591 let use_beta_logistic =
1592 use_sas && matches!(cfg.link_function(), LinkFunction::BetaLogistic);
1593 let theta_dim = k + mixture_dim + sas_dim;
1594 let sasspec = sas_optspec;
1595 let mixspec = mixture_optspec
1596 .clone()
1597 .or_else(|| {
1598 if use_mixture {
1599 None
1600 } else {
1601 Some(MixtureLinkSpec {
1602 components: Vec::new(),
1603 initial_rho: Array1::zeros(0),
1604 })
1605 }
1606 })
1607 .ok_or_else(|| EstimationError::InvalidInput("missing mixture spec".to_string()))?;
1608 let mut heuristic_theta = Vec::new();
1609 if let Some(hvals) = heuristic_lambdas
1610 && hvals.len() == k
1611 {
1612 heuristic_theta.extend_from_slice(hvals);
1613 if use_mixture {
1614 heuristic_theta
1615 .extend_from_slice(mixspec.initial_rho.as_slice().unwrap_or(&[]));
1616 }
1617 if let Some(spec) = sasspec {
1618 heuristic_theta.push(spec.initial_epsilon);
1619 heuristic_theta.push(spec.initial_log_delta);
1620 }
1621 }
1622 let heuristic_theta_ref = if heuristic_theta.len() == theta_dim {
1623 Some(heuristic_theta.as_slice())
1624 } else {
1625 None
1626 };
1627 let aux_dim_outer = if use_mixture { mixture_dim } else { sas_dim };
1628 let mut reml_seed_config_mix = reml_seed_config;
1629 reml_seed_config_mix.num_auxiliary_trailing = aux_dim_outer;
1630 if theta_dim >= REML_SEED_SCREENING_RHO_CAP {
1631 reml_seed_config_mix.max_seeds = 1;
1632 reml_seed_config_mix.seed_budget = 1;
1633 }
1634 use crate::rho_optimizer::OuterProblem;
1635 use gam_problem::{DeclaredHessianForm, Derivative, HessianResult, OuterEval};
1636 let initial_link_kind = cfg.link_kind.clone();
1637 let prefer_gradient_only = theta_dim >= REML_SECOND_ORDER_RHO_CAP;
1638 let continuation_prewarm = theta_dim < REML_CONTINUATION_PREWARM_RHO_CAP;
1639 if prefer_gradient_only {
1640 log::info!(
1641 "[OUTER] theta_dim {theta_dim} reaches exact REML Hessian budget \
1642 ({REML_SECOND_ORDER_RHO_CAP}); routing analytic-gradient quasi-Newton"
1643 );
1644 }
1645 if !continuation_prewarm {
1646 log::info!(
1647 "[OUTER] theta_dim {theta_dim} reaches continuation-prewarm budget \
1648 ({REML_CONTINUATION_PREWARM_RHO_CAP}); starting optimizer directly from seeds"
1649 );
1650 }
1651 let problem = OuterProblem::new(theta_dim)
1652 .with_gradient(Derivative::Analytic)
1653 .with_hessian(DeclaredHessianForm::Either)
1654 .with_prefer_gradient_only(prefer_gradient_only)
1655 .with_continuation_prewarm(continuation_prewarm)
1656 .with_psi_dim(mixture_dim + sas_dim)
1657 .with_barrier(
1658 crate::estimate::reml::reml_outer_engine::BarrierConfig::from_constraints(
1659 fit_linear_constraints.as_ref(),
1660 ),
1661 )
1662 .with_tolerance(reml_tol)
1663 .with_max_iter(reml_max_iter)
1664 .with_seed_config(reml_seed_config_mix)
1665 .with_screening_cap(Arc::clone(&reml_state.screening_max_inner_iterations))
1666 .with_outer_inner_cap(reml_inner_progress_feedback(&reml_state))
1667 .with_rho_bound(crate::estimate::RHO_BOUND);
1668 let problem = if let Some(h) = heuristic_theta_ref {
1669 problem.with_heuristic_lambdas(h.to_vec())
1670 } else {
1671 problem
1672 };
1673 let problem = if let Some(h) = heuristic_theta_ref {
1674 problem.with_initial_rho(Array1::from_iter(h.iter().copied()))
1675 } else {
1676 problem
1677 };
1678 let problem = match reml_state.outer_cache_session() {
1679 Some(session) => problem.with_cache_session(session),
1680 None => problem,
1681 };
1682 // Shared helper: parse theta into rho + link params, update link state.
1683 let apply_link_theta =
1684 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1685 theta: &Array1<f64>|
1686 -> Result<Array1<f64>, EstimationError> {
1687 let rho = theta.slice(s![..k]).to_owned();
1688 let mut cfg_eval = cfg.clone();
1689 if use_mixture {
1690 let mix_rho = theta.slice(s![k..(k + mixture_dim)]).to_owned();
1691 cfg_eval.link_kind = InverseLink::Mixture(
1692 state_fromspec(&MixtureLinkSpec {
1693 components: mixspec.components.clone(),
1694 initial_rho: mix_rho,
1695 })
1696 .map_err(|e| {
1697 EstimationError::InvalidInput(format!(
1698 "invalid blended inverse link: {e}"
1699 ))
1700 })?,
1701 );
1702 }
1703 if use_sas {
1704 let epsilon = if use_beta_logistic {
1705 theta[k]
1706 } else {
1707 let (v, _) = sas_effective_epsilon(theta[k]);
1708 v
1709 };
1710 let delta_like = theta[k + 1];
1711 cfg_eval.link_kind = if use_beta_logistic {
1712 InverseLink::BetaLogistic(
1713 state_from_beta_logisticspec(SasLinkSpec {
1714 initial_epsilon: epsilon,
1715 initial_log_delta: delta_like,
1716 })
1717 .map_err(|e| {
1718 EstimationError::InvalidInput(format!(
1719 "invalid Beta-Logistic link: {e}"
1720 ))
1721 })?,
1722 )
1723 } else {
1724 InverseLink::Sas(
1725 state_from_sasspec(SasLinkSpec {
1726 initial_epsilon: epsilon,
1727 initial_log_delta: delta_like,
1728 })
1729 .map_err(|e| {
1730 EstimationError::InvalidInput(format!("invalid SAS link: {e}"))
1731 })?,
1732 )
1733 };
1734 }
1735 state.set_link_states(
1736 cfg_eval.link_kind.mixture_state().cloned(),
1737 cfg_eval.link_kind.sas_state().copied(),
1738 );
1739 Ok(rho)
1740 };
1741
1742 // SAS ridge/barrier cost correction (shared between cost_fn, eval_fn, efs_fn).
1743 let sas_ridge_cost = |theta: &Array1<f64>| -> f64 {
1744 let sasridge = if use_sas && !use_beta_logistic {
1745 sasridgeweight
1746 } else {
1747 0.0
1748 };
1749 if use_sas && sasridge > 0.0 {
1750 let log_delta = theta[k + 1];
1751 let mut extra = 0.5 * sasridge * log_delta * log_delta;
1752 if !use_beta_logistic {
1753 let (barriercost, _) = sas_log_delta_edge_barriercostgrad(log_delta);
1754 extra += barriercost;
1755 }
1756 extra
1757 } else {
1758 0.0
1759 }
1760 };
1761
1762 let obj = problem.build_objective(
1763 &mut reml_state,
1764 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1765 theta: &Array1<f64>| {
1766 let rho = apply_link_theta(state, theta)?;
1767 let cost = state.compute_cost(&rho)? + sas_ridge_cost(theta);
1768 Ok(cost)
1769 },
1770 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1771 theta: &Array1<f64>| {
1772 let eval_idx = outer_eval_idx.fetch_add(1, Ordering::Relaxed) + 1;
1773 let rho = apply_link_theta(state, theta)?;
1774 let tcost = Instant::now();
1775
1776 // Use the unified REML evaluator with link ext_coords.
1777 // This computes ρ gradient AND link parameter gradient jointly
1778 // through the same HyperCoord infrastructure used for aniso ψ.
1779 let eval_mode =
1780 crate::estimate::reml::reml_outer_engine::EvalMode::ValueGradientHessian;
1781 let result = state.evaluate_unified_with_link_ext(&rho, eval_mode)?;
1782
1783 let cost = result.cost + sas_ridge_cost(theta);
1784 let mut grad = result.gradient.ok_or_else(|| {
1785 EstimationError::InvalidInput(
1786 "unified evaluator returned no gradient in ValueGradientHessian mode"
1787 .to_string(),
1788 )
1789 })?;
1790
1791 assert_eq!(
1792 grad.len(),
1793 theta_dim,
1794 "unified evaluator gradient length {} != theta_dim {}",
1795 grad.len(),
1796 theta_dim
1797 );
1798
1799 let grad_effective = grad.clone();
1800 let mut hessian = materialize_link_outer_hessian(result.hessian, theta_dim)?;
1801
1802 // SAS epsilon reparameterization chain rule.
1803 if use_sas && !use_beta_logistic {
1804 let (_, d_eps_d_raw, d2_eps_d_raw2) = sas_effective_epsilon_second(theta[k]);
1805 for j in 0..theta_dim {
1806 hessian[[k, j]] *= d_eps_d_raw;
1807 hessian[[j, k]] *= d_eps_d_raw;
1808 }
1809 hessian[[k, k]] += grad_effective[k] * d2_eps_d_raw2;
1810 grad[k] *= d_eps_d_raw;
1811 }
1812 // SAS log_delta ridge + barrier gradient/Hessian.
1813 if use_sas && !use_beta_logistic && sasridgeweight > 0.0 {
1814 let log_delta = theta[k + 1];
1815 grad[k + 1] += sasridgeweight * log_delta;
1816 hessian[[k + 1, k + 1]] += sasridgeweight;
1817 let (_, barriergrad, barrierhess) =
1818 sas_log_delta_edge_barriercostgradhess(log_delta);
1819 grad[k + 1] += barriergrad;
1820 hessian[[k + 1, k + 1]] += barrierhess;
1821 }
1822
1823 let cost_sec = tcost.elapsed().as_secs_f64();
1824 let aux_dim = if use_mixture { mixture_dim } else { sas_dim };
1825 log::debug!(
1826 "[outer-eval {eval_idx}] theta_dim={} aux_dim={} unified_link_ext time_sec={:.3}",
1827 theta_dim,
1828 aux_dim,
1829 cost_sec,
1830 );
1831 Ok(OuterEval {
1832 cost,
1833 gradient: grad,
1834 hessian: HessianResult::Analytic(hessian),
1835 inner_beta_hint: state.current_original_basis_beta(),
1836 })
1837 },
1838 Some(|state: &mut &mut crate::estimate::reml::RemlState<'_>| {
1839 state.reset_outer_seed_state();
1840 state.set_link_states(
1841 initial_link_kind.mixture_state().cloned(),
1842 initial_link_kind.sas_state().copied(),
1843 );
1844 }),
1845 Some(
1846 |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1847 theta: &Array1<f64>| {
1848 let rho = apply_link_theta(state, theta)?;
1849 let mut efs_eval = state.compute_efs_steps_with_link_ext(&rho)?;
1850
1851 // SAS reparameterization chain rule on ψ steps.
1852 if use_sas && !use_beta_logistic {
1853 let (_, d_eps_d_raw) = sas_effective_epsilon(theta[k]);
1854 if efs_eval.steps.len() > k {
1855 efs_eval.steps[k] *= d_eps_d_raw;
1856 }
1857 if let Some(ref mut pg) = efs_eval.psi_gradient
1858 && !pg.is_empty() {
1859 pg[0] *= d_eps_d_raw;
1860 }
1861 }
1862
1863 // SAS log-δ ridge + edge barrier: their gradients enter
1864 // `result.gradient` from the unified evaluator (estimate.rs
1865 // 2170+), and `compute_efs_steps_with_link_ext` runs the
1866 // universal-form EFS step `Δρ = log(1 − 2·g_full/q_eff)`
1867 // which absorbs them automatically. We only need to
1868 // mirror that contribution into the *cost* slot here so
1869 // the outer fixed-point bridge's line search compares
1870 // augmented-cost trial points consistently.
1871 efs_eval.cost += sas_ridge_cost(theta);
1872 Ok(efs_eval)
1873 },
1874 ),
1875 );
1876 // Same publish/consume symmetry as the standard REML arm above
1877 // (issue #236). The mixture/SAS eval closure also surfaces
1878 // `inner_beta_hint = state.current_original_basis_beta()` (see
1879 // src/solver/estimate.rs:3275), so continuation pre-warm needs
1880 // a real seed hook to install it.
1881 let mut obj = obj.with_seed_inner_state(with_reml_beta_seed_hook());
1882 let outer_result = problem.run(&mut obj, "mixture/SAS flexible link")?;
1883 drop(obj);
1884 // Convergence guard for the outer-aware inner-PIRLS schedule
1885 // (path #3) — see the matching comment in the standard REML arm
1886 // above. Reset the cap and run one final compute_cost at the
1887 // converged θ so the cached warm-start β is at full inner
1888 // tolerance regardless of where the BFGS schedule was when the
1889 // optimizer terminated.
1890 //
1891 // The outer vector here is the AUGMENTED θ = [ρ_smooth (k) | link
1892 // params (mixture_dim and/or sas_dim)], not a smoothing-only ρ.
1893 // `compute_cost` exponentiates its argument wholesale into the
1894 // penalty λ vector (loop_driver.rs `rho.mapv(exp)`), so the guard
1895 // must receive exactly the same smoothing-only ρ — and the same
1896 // installed link state — the outer evaluator operated on, never the
1897 // raw augmented θ. Feeding the full θ in made the guard hand `k +
1898 // mixture_dim + sas_dim` "lambdas" to a `k`-penalty reparameterizer,
1899 // which faults with "Lambda count mismatch" (#1571). Route θ through
1900 // the same `apply_link_theta` the eval closure (optimizer.rs:1759)
1901 // and the accept-fit slice (the `final_rho` line just below) use: it
1902 // installs the converged mixture/SAS link state onto `reml_state`
1903 // and returns the smoothing-only ρ block.
1904 let guard_rho = {
1905 let mut state_ref: &mut crate::estimate::reml::RemlState<'_> =
1906 &mut reml_state;
1907 apply_link_theta(&mut state_ref, &outer_result.rho)?
1908 };
1909 run_outer_inner_cap_guard(
1910 &mut reml_state,
1911 &guard_rho,
1912 RemlInnerCapGuardArm::MixtureSas,
1913 )?;
1914 let final_rho = outer_result.rho.slice(s![..k]).to_owned();
1915 let final_mix_state = if use_mixture {
1916 let final_mix_rho = outer_result.rho.slice(s![k..(k + mixture_dim)]).to_owned();
1917 Some(
1918 state_fromspec(&MixtureLinkSpec {
1919 components: mixspec.components.clone(),
1920 initial_rho: final_mix_rho,
1921 })
1922 .map_err(|e| {
1923 EstimationError::InvalidInput(format!("invalid blended inverse link: {e}"))
1924 })?,
1925 )
1926 } else {
1927 None
1928 };
1929 let final_sas_state = if use_sas {
1930 let epsilon_eff = if use_beta_logistic {
1931 outer_result.rho[k]
1932 } else {
1933 let (v, _) = sas_effective_epsilon(outer_result.rho[k]);
1934 v
1935 };
1936 Some(if use_beta_logistic {
1937 state_from_beta_logisticspec(SasLinkSpec {
1938 initial_epsilon: epsilon_eff,
1939 initial_log_delta: outer_result.rho[k + 1],
1940 })
1941 .map_err(|e| {
1942 EstimationError::InvalidInput(format!("invalid Beta-Logistic link: {e}"))
1943 })?
1944 } else {
1945 state_from_sasspec(SasLinkSpec {
1946 initial_epsilon: epsilon_eff,
1947 initial_log_delta: outer_result.rho[k + 1],
1948 })
1949 .map_err(|e| EstimationError::InvalidInput(format!("invalid SAS link: {e}")))?
1950 })
1951 } else {
1952 cfg.link_kind.sas_state().copied()
1953 };
1954 let aux_param_covariance = None;
1955 let (mix_cov, sas_cov) = if use_mixture {
1956 (aux_param_covariance, None)
1957 } else if use_sas {
1958 (None, aux_param_covariance)
1959 } else {
1960 (None, None)
1961 };
1962 (
1963 final_rho,
1964 final_mix_state,
1965 final_sas_state,
1966 mix_cov,
1967 sas_cov,
1968 outer_result,
1969 )
1970 };
1971 // Reuse the Gaussian-Identity XᵀWX cache the outer loop already populated,
1972 // so the final accept-fit skips the streaming GEMM as well.
1973 //
1974 // When the outer loop conditioned the response (centering for #1000, scaling
1975 // for #1127), that cache holds `XᵀW((y−center)/scale)`; the accept-fit runs
1976 // on the *original* response `y_o`, so reusing the conditioned `XᵀWy` would
1977 // solve on the shifted/rescaled scale and report every fitted value, residual
1978 // and dispersion off the user's scale. Rebuild the cross-product from the
1979 // original response in that case — the constant `XᵀWX` block is the only part
1980 // the cache would have saved, a one-off cost paid only on the rare
1981 // large-mean / small-magnitude responses that trigger conditioning.
1982 let final_cache_handle = if response_center.is_some() || response_scale.is_some() {
1983 None
1984 } else {
1985 reml_state.gaussian_fixed_cache_if_eligible()
1986 };
1987 let pirls_res_pair = pirls::fit_model_for_fixed_rho_with_adaptive_kkt(
1988 LogSmoothingParamsView::new(final_rho.view()),
1989 pirls::PirlsProblem {
1990 x: reml_state.x(),
1991 offset: offset_o.view(),
1992 y: y_o.view(),
1993 priorweights: w_o.view(),
1994 covariate_se: None,
1995 gaussian_fixed_cache: final_cache_handle.as_deref(),
1996 // The final reported fit must be exact at the converged ρ/ψ — never
1997 // serve the frozen-W first-step approximation here.
1998 glm_first_step_gram: None,
1999 },
2000 pirls::PenaltyConfig {
2001 canonical_penalties: reml_state.canonical_penalties(),
2002 balanced_penalty_root: Some(reml_state.balanced_penalty_root()),
2003 reparam_invariant: None,
2004 p,
2005 coefficient_lower_bounds: None,
2006 linear_constraints_original: fit_linear_constraints.as_ref(),
2007 penalty_shrinkage_floor: opts.penalty_shrinkage_floor,
2008 kronecker_factored: None,
2009 },
2010 &pirls::PirlsConfig {
2011 link_kind: if let Some(state) = final_mixture_state.clone() {
2012 InverseLink::Mixture(state)
2013 } else if let Some(state) = final_sas_state {
2014 if matches!(cfg.link_function(), LinkFunction::BetaLogistic) {
2015 InverseLink::BetaLogistic(state)
2016 } else {
2017 InverseLink::Sas(state)
2018 }
2019 } else {
2020 cfg.link_kind.clone()
2021 },
2022 ..cfg.as_pirls_config()
2023 },
2024 None,
2025 None,
2026 // Final, reported fit at the REML-selected λ: refine the family's
2027 // estimated dispersion nuisance at the converged η. For Gamma this
2028 // re-estimates the shape so `dispersion_phi()` and every SE / interval
2029 // reflect the conditional noise, not the spread of μ (#678); for Beta
2030 // it drives the precision φ and the mean β̂ to their joint fixed point,
2031 // undoing the slope attenuation from a φ frozen at the null predictor
2032 // (#769). λ is fixed here, so there is no scale↔λ feedback.
2033 true,
2034 )?;
2035 pirls_res = pirls_res_pair.0;
2036
2037 // Negative-Binomial outer θ↔λ alternation decision (#1448, supersedes the
2038 // #1082 drift diagnostic).
2039 //
2040 // θ was frozen at the λ-search value (`frozen_negbin_theta`) so `F(ρ)` is
2041 // stationary in ρ; the accept-fit above ML-refreshed θ at the converged η.
2042 // If that refreshed θ_final drifted from the search θ_frozen by more than the
2043 // joint-stationarity tolerance, the ρ we just selected was optimal for the
2044 // OLD θ, not θ_final: re-freeze the search at θ_final, reset the outer seed
2045 // state (eval bundle, PIRLS cache, warm-start signals, inner caps — all keyed
2046 // to the old θ), and run the ρ search again. Iterate to the (ρ, θ) joint
2047 // fixed point or until the round cap, after which we accept the last fit and
2048 // log the residual drift. For non-NB / user-fixed-θ fits the criterion below
2049 // is never met (θ is not estimated), so the loop breaks on round 0 and the
2050 // fit is byte-identical to the pre-#1448 single pass.
2051 let mut should_alternate = false;
2052 if pirls_res.likelihood.negbin_theta_is_estimated() {
2053 let frozen_bits = reml_state.frozen_negbin_theta.load(Ordering::Relaxed);
2054 if frozen_bits != 0
2055 && let Some(theta_final) = pirls_res.likelihood.negbin_theta()
2056 {
2057 let theta_frozen = f64::from_bits(frozen_bits);
2058 if theta_frozen.is_finite() && theta_frozen > 0.0 && theta_final.is_finite() {
2059 let rel_drift =
2060 (theta_final - theta_frozen).abs() / theta_frozen.max(f64::MIN_POSITIVE);
2061 let drift_pct = rel_drift * 100.0;
2062 if rel_drift > NEGBIN_THETA_JOINT_DRIFT_TOL {
2063 if negbin_alternation_round + 1 < NEGBIN_OUTER_ALTERNATION_MAX_ROUNDS {
2064 log::info!(
2065 "[OUTER] negative-binomial θ↔λ alternation round {}: θ drifted \
2066 {drift_pct:.1}% (θ_frozen={theta_frozen:.6e} → θ_final={theta_final:.6e}); \
2067 re-freezing at θ_final and re-running the ρ search (#1448).",
2068 negbin_alternation_round + 1
2069 );
2070 // Re-freeze the λ-search θ at the refreshed value. The
2071 // capture in `solve_for_unified_rho` only writes when the
2072 // frozen slot is 0, so a non-zero value here pins every
2073 // subsequent λ-search inner solve to θ_final rather than
2074 // re-deriving it from the seed η.
2075 reml_state
2076 .frozen_negbin_theta
2077 .store(theta_final.to_bits(), Ordering::Relaxed);
2078 // The cached criterion / factor bundle and warm-start
2079 // signals were all computed at θ_frozen; drop them so the
2080 // next round's ρ search recomputes `F(ρ) = REML(ρ, θ_final)`.
2081 reml_state.reset_outer_seed_state();
2082 should_alternate = true;
2083 } else {
2084 log::warn!(
2085 "[OUTER] negative-binomial θ↔λ alternation hit the round cap \
2086 ({NEGBIN_OUTER_ALTERNATION_MAX_ROUNDS}) with residual θ drift \
2087 {drift_pct:.1}% (θ_frozen={theta_frozen:.6e} → θ_final={theta_final:.6e}); \
2088 accepting the last fit (#1448)."
2089 );
2090 }
2091 } else {
2092 log::debug!(
2093 "[OUTER] negative-binomial (ρ, θ) jointly stationary after {} \
2094 alternation round(s): drift {drift_pct:.2}% \
2095 (θ_frozen={theta_frozen:.6e} → θ_final={theta_final:.6e}).",
2096 negbin_alternation_round + 1
2097 );
2098 }
2099 }
2100 }
2101 }
2102 if should_alternate {
2103 negbin_alternation_round += 1;
2104 continue;
2105 }
2106 break;
2107 } // negbin θ↔λ alternation loop (#1448)
2108 // Ensure we don't report 0 iterations to the caller; at least 1 is more meaningful.
2109 let iters = std::cmp::max(1, outer_result.iterations);
2110
2111 // Map beta back to original basis
2112 let beta_orig_internal = pirls_res
2113 .reparam_result
2114 .qs
2115 .dot(pirls_res.beta_transformed.as_ref());
2116 let beta_orig = conditioning.backtransform_beta(&beta_orig_internal);
2117
2118 // Effective sample size for dispersion/REML accounting.
2119 //
2120 // A prior weight of exactly 0 makes a row contribute nothing to any weighted
2121 // cross-product (XᵀWX, XᵀWy) or to the weighted RSS (w_i·r_i² = 0), so such a
2122 // row is statistically equivalent to an absent row. The *only* channel left by
2123 // which it could still perturb the fit is an explicit observation count. To
2124 // keep zero-weight rows exactly equivalent to absent rows (R's `n.ok =
2125 // nobs − Σ[w==0]`, mgcv's dropped zero-weight observations), the dispersion
2126 // sample size must be the count of positive-weight rows, not the raw row
2127 // count. Otherwise the Gaussian scale φ̂ = weighted_rss / (n − edf) puts a
2128 // numerator that already excludes zero-weight rows over a denominator that
2129 // counts them, biasing φ̂ low and shrinking every SE (#584). The REML
2130 // criterion's own observation count (which drives λ selection) lives in the
2131 // inner-solution assembly and must apply the same positive-weight count.
2132 let n = w_o.iter().filter(|&&wi| wi > 0.0).count() as f64;
2133 let weighted_rss = if matches!(cfg.link_function(), LinkFunction::Identity) {
2134 let fitted = {
2135 let mut eta = offset_o.clone();
2136 eta += &x_o.matrixvectormultiply(&beta_orig);
2137 eta
2138 };
2139 let resid = y_o.to_owned() - &fitted;
2140 w_o.iter()
2141 .zip(resid.iter())
2142 .map(|(&wi, &ri)| wi * ri * ri)
2143 .sum()
2144 } else {
2145 0.0
2146 };
2147
2148 // Default solver policy stays on the REML/Laplace path. Joint HMC remains
2149 // available through explicit sampling flows, but fitting does not
2150 // automatically densify the Hessian or escalate into NUTS.
2151 let (final_rho, pirls_res) = (final_rho, pirls_res);
2152
2153 // Recompute beta in the finalized basis/parameterization.
2154 let beta_orig_internal = pirls_res
2155 .reparam_result
2156 .qs
2157 .dot(pirls_res.beta_transformed.as_ref());
2158
2159 let lambdas = final_rho.mapv(f64::exp);
2160 let p_dim = pirls_res.beta_transformed.len();
2161 let penalty_rank_total = pirls_res.reparam_result.e_transformed.nrows();
2162 let mp = (p_dim as f64 - penalty_rank_total as f64).max(0.0);
2163 let mut edf_by_block = vec![0.0; k];
2164 // Raw per-block penalty trace tr_kk = λ_kk·tr(H⁻¹S_kk), retained so per-term
2165 // EDF can be assembled as |coeff_range| − Σ tr_kk (issue #1219).
2166 let mut penalty_block_trace = vec![0.0; k];
2167 let mut edf_total = 0.0;
2168 let mut smoothing_correction = None;
2169 let mut rho_covariance = None;
2170 let mut penalized_hessian = Array2::<f64>::zeros((0, 0));
2171 let mut beta_covariance = None;
2172 let mut beta_standard_errors = None;
2173 let mut beta_covariance_corrected = None;
2174 let mut beta_standard_errors_corrected = None;
2175 let mut beta_covariance_frequentist = None;
2176 let mut coefficient_influence = None;
2177 let mut weighted_gram = None;
2178 // Factorization of stabilized Hessian in transformed basis, reused for
2179 // SE computation via solve-on-demand after dispersion is determined.
2180 let mut edf_factor: Option<Box<dyn FactorizedSystem>> = None;
2181 let mut bias_correction_beta = None;
2182 let mut rho_posterior_certificate = None;
2183 let mut rho_posterior_escalation = None;
2184
2185 if opts.compute_inference {
2186 // EDF by block using stabilized H and penalty roots in transformed basis.
2187 let h = &pirls_res.stabilizedhessian_transformed;
2188 let p_dim = h.nrows();
2189 // Sparse-aware factorization with ridge retry — no densification.
2190 // Uses SymmetricMatrix::factorize() -> sparse Cholesky for sparse,
2191 // dense Cholesky for dense.
2192 let factor = {
2193 let scale = h.max_abs_diag();
2194 let min_step = scale * 1e-10;
2195 let mut ridge = 0.0_f64;
2196 let mut attempts = 0_usize;
2197 loop {
2198 let candidate = if ridge > 0.0 {
2199 match h.addridge(ridge) {
2200 Ok(c) => c,
2201 Err(_) => h.clone(),
2202 }
2203 } else {
2204 h.clone()
2205 };
2206 if let Ok(f) = candidate.factorize() {
2207 if ridge > 0.0 {
2208 // This ridged factor is reused for the reported standard
2209 // errors, covariance, and bias correction below, so those
2210 // quantities are stabilized approximations, not the exact
2211 // (unridged) Hessian-based values.
2212 log::warn!(
2213 "Inference Hessian was rank-deficient and required a stabilizing \
2214 ridge {:.3e}; reported standard errors, covariance, and bias \
2215 correction are computed from the ridge-stabilized factor and are \
2216 approximations, not exact unridged values",
2217 ridge,
2218 );
2219 }
2220 break f;
2221 }
2222 attempts += 1;
2223 if attempts >= MAX_FACTORIZATION_ATTEMPTS {
2224 return Err(EstimationError::ModelIsIllConditioned {
2225 condition_number: f64::INFINITY,
2226 });
2227 }
2228 ridge = if ridge <= 0.0 { min_step } else { ridge * 10.0 };
2229 }
2230 };
2231 let mut traces = vec![0.0f64; k];
2232 for (kk, cp) in pirls_res
2233 .reparam_result
2234 .canonical_transformed
2235 .iter()
2236 .enumerate()
2237 {
2238 // Build the p × rank RHS with nonzeros only in [start..end] rows.
2239 let r = &cp.col_range;
2240 let rank = cp.rank();
2241 let mut rhs = Array2::<f64>::zeros((p_dim, rank));
2242 for col in 0..rank {
2243 for row in 0..cp.block_dim() {
2244 rhs[[r.start + row, col]] = cp.root[[col, row]];
2245 }
2246 }
2247 let sol =
2248 factor
2249 .solvemulti(&rhs)
2250 .map_err(|_| EstimationError::ModelIsIllConditioned {
2251 condition_number: f64::INFINITY,
2252 })?;
2253 // Frobenius inner product: only the block rows of rhs are nonzero.
2254 let mut frob = 0.0f64;
2255 for col in 0..rank {
2256 for row in 0..cp.block_dim() {
2257 frob += sol[[r.start + row, col]] * rhs[[r.start + row, col]];
2258 }
2259 }
2260 // The per-block penalty trace `tr_kk = λ_kk·tr(H⁻¹ S_kk)` is the
2261 // penalized effective d.f. of block `kk`, mathematically confined to
2262 // `[0, rank_kk]` (a PSD penalty can absorb at most its own rank). When
2263 // the outer REML / spatial-κ optimizer drives a redundant block's
2264 // `λ_kk = exp(ρ_kk)` to the finite ceiling (gam#1379: the Matérn kernel
2265 // already controls the smoothness a redundant operator block also
2266 // penalizes, so REML wants `λ → ∞`), the raw product `λ_kk · frob`
2267 // can overflow to `+∞` on the ridge-stabilized inference Hessian even
2268 // though the true value is just `rank_kk` — poisoning
2269 // `penalty_block_trace[kk]` and tripping the fit-result finiteness
2270 // validator (`fit_result.penalty_block_trace[kk] must be finite, got
2271 // inf`). Clamp to the valid `[0, rank]` interval so a fully-penalized
2272 // direction reads its exact saturated trace `rank_kk` instead of `+∞`.
2273 // Ordinary finite traces are inside `[0, rank]` and pass through
2274 // unchanged, so non-degenerate fits and their recorded EDF accounting
2275 // are bit-identical (the `edf_by_block` channel already clamps the
2276 // complementary `rank − trace` to `[0, rank]`).
2277 // f64::clamp does NOT fix NaN (only ±inf): a NaN product (e.g.
2278 // inf*0 from an overflowed solve) would slip through and trip the
2279 // penalty_block_trace finiteness validator. Map any non-finite
2280 // product to the saturated `rank` bound, exactly as the inf case
2281 // already resolves (gam#1379).
2282 let trace_val = lambdas[kk] * frob;
2283 traces[kk] = if trace_val.is_finite() {
2284 trace_val.clamp(0.0, rank as f64)
2285 } else {
2286 rank as f64
2287 };
2288 }
2289 edf_total = (p_dim as f64 - kahan_sum(traces.iter().copied())).clamp(mp, p_dim as f64);
2290 penalty_block_trace.clone_from(&traces);
2291 for (kk, cp) in pirls_res
2292 .reparam_result
2293 .canonical_transformed
2294 .iter()
2295 .enumerate()
2296 {
2297 let p_k = cp.rank() as f64;
2298 let edf_k = (p_k - traces[kk]).clamp(0.0, p_k);
2299 edf_by_block[kk] = edf_k;
2300 }
2301
2302 // Reconcile the EDF accounting with the influence matrix F = H⁻¹X'WX.
2303 //
2304 // The block-trace channel above factorizes the TRANSFORMED stabilized
2305 // Hessian with a bespoke 10×-escalation ridge loop. On rank-deficient
2306 // spatial-smooth corners (degenerate-Hessian thin-plate fits) that loop
2307 // can take an enormous ridge, inflating Σ tr_kk toward `p` and collapsing
2308 // `edf_total = p − Σ tr_kk` onto its floor `mp` (e.g. 1.0 for a single
2309 // smooth) even though the fitted surface — and the influence matrix `F`
2310 // that the prediction, dispersion, and per-term EDF all consume — has
2311 // legitimately spent ~70 EDF (issue #1356). The authoritative model
2312 // definition of EDF is the influence-matrix trace; the per-term EDF
2313 // (`FitResult::per_term_edf`) reads `tr(F)` over each block. Recompute the
2314 // per-block penalty traces from the SAME rank-revealing inverse `F` uses
2315 // (`matrix_inversewith_regularization` of the original-basis Hessian), so
2316 // `edf_total = p − Σ tr_kk = tr(F)`, `Σ edf_by_block = edf_total`, and the
2317 // total can never fall below a single term's own EDF. Done before the
2318 // dispersion `σ̂² = RSS/(n − edf_total)` is formed so it, too, uses the
2319 // honest effective d.f. (the trace-channel collapse otherwise biased
2320 // σ̂² high → inflated SEs on the same seeds).
2321 //
2322 // Per-block traces `tr_kk = λ_kk·tr(H⁻¹ S_kk)` are basis-invariant; map
2323 // each canonical block's penalty root into the original coefficient basis
2324 // (`root_orig = Qs · root_t`) and contract against the original-basis
2325 // inverse. Restricted to small models (where the dense inverse `F` itself
2326 // is formed); large models keep the trace-channel value.
2327 {
2328 let p_orig = pirls_res.reparam_result.qs.nrows();
2329 const COV_FULL_INVERSE_MAX_P: usize = 10_000;
2330 if p_orig <= COV_FULL_INVERSE_MAX_P {
2331 let h_orig = map_hessian_to_original_basis(&pirls_res)?;
2332 if let Some(h_inv) =
2333 matrix_inversewith_regularization(&h_orig, "edf reconciliation")
2334 {
2335 let qs = &pirls_res.reparam_result.qs;
2336 let p_t = qs.ncols();
2337 let mut traces_f = vec![0.0f64; k];
2338 for (kk, cp) in pirls_res
2339 .reparam_result
2340 .canonical_transformed
2341 .iter()
2342 .enumerate()
2343 {
2344 if kk >= lambdas.len() {
2345 continue;
2346 }
2347 let r = &cp.col_range;
2348 let rank = cp.rank();
2349 let mut root_t = Array2::<f64>::zeros((p_t, rank));
2350 for col in 0..rank {
2351 for row in 0..cp.block_dim() {
2352 root_t[[r.start + row, col]] = cp.root[[col, row]];
2353 }
2354 }
2355 // S_kk = Rᵀ R; λ_kk·tr(H⁻¹ S_kk) = λ_kk·Σ_col (R_col)ᵀ H⁻¹ R_col.
2356 let root_orig = qs.dot(&root_t); // p_orig × rank
2357 let sol = h_inv.dot(&root_orig); // H⁻¹ R
2358 let mut frob = 0.0f64;
2359 for col in 0..rank {
2360 for row in 0..p_orig {
2361 frob += sol[[row, col]] * root_orig[[row, col]];
2362 }
2363 }
2364 // Same `[0, rank]` clamp as the trace-channel path above
2365 // (gam#1379): a ceiling-`λ` redundant block's
2366 // `λ_kk·tr(H⁻¹ S_kk)` can overflow to `+∞` here too; the
2367 // penalized trace is bounded by the block rank, so clamp to
2368 // keep `penalty_block_trace` finite and the EDF accounting
2369 // consistent. Finite in-range traces are untouched.
2370 // NaN-safe (gam#1379): f64::clamp leaves NaN as NaN, so
2371 // map any non-finite product to the saturated `rank`.
2372 let trace_val = lambdas[kk] * frob;
2373 traces_f[kk] = if trace_val.is_finite() {
2374 trace_val.clamp(0.0, rank as f64)
2375 } else {
2376 rank as f64
2377 };
2378 }
2379 edf_total = (p_orig as f64 - kahan_sum(traces_f.iter().copied()))
2380 .clamp(mp, p_orig as f64);
2381 penalty_block_trace.clone_from(&traces_f);
2382 for (kk, cp) in pirls_res
2383 .reparam_result
2384 .canonical_transformed
2385 .iter()
2386 .enumerate()
2387 {
2388 let p_k = cp.rank() as f64;
2389 edf_by_block[kk] = (p_k - traces_f[kk]).clamp(0.0, p_k);
2390 }
2391 }
2392 }
2393 }
2394
2395 // O(n⁻¹) frequentist bias correction vector b̂ = H⁻¹ S(λ̂)(β̂ - μ).
2396 // Computed in transformed PIRLS basis (where the factorization above lives)
2397 // and then mapped to the original coefficient basis via Qs.
2398 // Frequentist bias of the linear predictor at x is -s_*(x)^T b̂; the
2399 // corrected predictor is η̂_BC(x) = η̂(x) + s_*(x)^T b̂.
2400 let beta_t = pirls_res.beta_transformed.as_ref();
2401 let mut s_beta_t = Array1::<f64>::zeros(p_dim);
2402 for (kk, cp) in pirls_res
2403 .reparam_result
2404 .canonical_transformed
2405 .iter()
2406 .enumerate()
2407 {
2408 // S_k(β - μ): only the col_range of beta couples through local penalty.
2409 let r = &cp.col_range;
2410 let local = cp.local_ref();
2411 let beta_block = beta_t.slice(ndarray::s![r.clone()]);
2412 let centered = &beta_block - &cp.prior_mean;
2413 let local_beta = local.dot(¢ered);
2414 let lam_k = lambdas[kk];
2415 let mut acc = s_beta_t.slice_mut(ndarray::s![r.clone()]);
2416 acc.scaled_add(lam_k, &local_beta);
2417 }
2418 match factor.solve(&s_beta_t) {
2419 Ok(b_t) => {
2420 let qs = &pirls_res.reparam_result.qs;
2421 let b_orig = qs.dot(&b_t);
2422 if b_orig.iter().all(|v| v.is_finite()) {
2423 bias_correction_beta = Some(b_orig);
2424 } else {
2425 log::warn!("bias-correction vector contained non-finite entries; skipping");
2426 }
2427 }
2428 Err(e) => {
2429 log::warn!("bias-correction solve failed: {e}");
2430 }
2431 }
2432 // Preserve the factorization for solve-on-demand SE and covariance
2433 // computation below, after dispersion has been determined.
2434 edf_factor = Some(factor);
2435 }
2436
2437 // Persist residual-based scale for Gaussian identity models.
2438 // Contract: residual standard deviation sigma, not variance.
2439 //
2440 // Gaussian REML scale: σ̂² = RSS / (n − edf_total), matching mgcv's gam.scale.
2441 // Using the null-space dim (mp = p − rank(Σ_k S_k)) here was wrong: mp is the
2442 // minimum possible edf (all smooths fully penalized to their null space), so
2443 // n − mp ≥ n − edf_total, and σ̂² was systematically biased low whenever any
2444 // smooth/random-effect spent real edf. edf_total ∈ [mp, p_dim] is the effective
2445 // df computed just above from tr(λ_k · H⁻¹ S_k), and is exactly the residual
2446 // df mgcv uses. When inference is off, edf_total is unavailable, so the MLE
2447 // RSS/n is returned instead.
2448 let standard_deviation = match &pirls_res.likelihood.spec.response {
2449 ResponseFamily::Gaussian => {
2450 let denom = if opts.compute_inference {
2451 (n - edf_total).max(1.0)
2452 } else {
2453 n.max(1.0)
2454 };
2455 (weighted_rss / denom).sqrt()
2456 }
2457 ResponseFamily::Gamma => pirls_res.likelihood.gamma_shape().unwrap_or(1.0),
2458 ResponseFamily::Binomial
2459 | ResponseFamily::Tweedie { .. }
2460 | ResponseFamily::NegativeBinomial { .. }
2461 | ResponseFamily::Beta { .. }
2462 | ResponseFamily::Poisson
2463 | ResponseFamily::RoystonParmar => 1.0,
2464 };
2465 let dispersion = dispersion_from_likelihood(&pirls_res.likelihood, standard_deviation);
2466
2467 // Explicit dispersion contract for coefficient covariance matrices:
2468 // Vb = H⁻¹ · cov_scale, where the stored penalized Hessian is always
2469 // H = XᵀWX + S_λ with the penalty added UNSCALED. The multiplier therefore
2470 // restores ONLY the dispersion the working weight W does not already carry:
2471 //
2472 // * Profiled Gaussian keeps W scale-free (W = priorweights), so the data
2473 // term has unit implicit scale and Vb = H⁻¹·σ̂².
2474 // * Every other family folds its reciprocal dispersion / full Fisher
2475 // information into W (Gamma W = prior/φ, Tweedie W = prior·μ^{2−p}/φ,
2476 // Beta/NB the complete fixed-scale Fisher info, Poisson/Binomial φ ≡ 1),
2477 // so H already equals the true penalized Hessian (identical to mgcv's
2478 // XᵀW_sfX/φ + S_λ) and Vb = H⁻¹ with NO extra dispersion factor. A
2479 // post-hoc ×φ here would double-count the dispersion and shrink every SE
2480 // by √φ (= 1/√shape for Gamma); see #679.
2481 //
2482 // The single source of truth for this invariant is
2483 // `GlmLikelihoodSpec::coefficient_covariance_scale`; the response-level
2484 // observation noise used by predictive intervals stays in `dispersion`
2485 // above (a deliberately distinct quantity, e.g. 1/shape for Gamma).
2486 let cov_scale = pirls_res
2487 .likelihood
2488 .coefficient_covariance_scale(standard_deviation * standard_deviation)
2489 .max(f64::MIN_POSITIVE);
2490
2491 // Compute gradient norm at final rho for reporting
2492 let finalgrad = reml_state
2493 .compute_gradient(&final_rho)
2494 .unwrap_or_else(|_| Array1::from_elem(final_rho.len(), f64::NAN));
2495 let finalgrad_norm_rho = finalgrad.dot(&finalgrad).sqrt();
2496 let finalgrad_norm = if finalgrad_norm_rho.is_finite() {
2497 finalgrad_norm_rho
2498 } else {
2499 outer_result.final_grad_norm.unwrap_or(0.0)
2500 };
2501
2502 if opts.compute_inference {
2503 penalized_hessian = map_hessian_to_original_basis(&pirls_res)?;
2504 let p_cov = penalized_hessian.nrows();
2505 let qs = &pirls_res.reparam_result.qs;
2506
2507 // Auto-select covariance strategy based on model size.
2508 //
2509 // For small-to-medium models (p ≤ COV_FULL_INVERSE_MAX_P) we can afford
2510 // the full p×p inverse: O(p³) compute, O(p²) memory. The full matrix is
2511 // needed for the frequentist covariance Ve = H⁻¹ X'WX H⁻¹ φ, the
2512 // influence matrix F = H⁻¹ X'WX, and the smoothing-parameter correction.
2513 //
2514 // For large models we use solve-on-demand against the Cholesky factor
2515 // already computed for EDF traces above. We solve H_t Z_t = Qs^T in
2516 // column chunks of size COV_SE_CHUNK, then extract the diagonal of
2517 // Qs · Z_t = H_orig⁻¹ to get exact posterior SEs without ever
2518 // materialising the p×p inverse. Prediction bands continue to work via
2519 // the factorised-Hessian path in PredictionCovarianceBackend::Factorized.
2520 const COV_FULL_INVERSE_MAX_P: usize = 10_000;
2521 const COV_SE_CHUNK: usize = 512;
2522
2523 // Attempt the full inverse when the model is small enough.
2524 let beta_covariance_unscaled: Option<Array2<f64>> = if p_cov <= COV_FULL_INVERSE_MAX_P {
2525 match matrix_inversewith_regularization(&penalized_hessian, "posterior covariance") {
2526 Some(h_inv) => Some(h_inv),
2527 None => {
2528 log::warn!(
2529 "posterior covariance inversion failed (p={p_cov}): \
2530 falling back to solve-on-demand standard errors"
2531 );
2532 None
2533 }
2534 }
2535 } else {
2536 None
2537 };
2538
2539 if let Some(ref h_inv) = beta_covariance_unscaled {
2540 // Full inverse available: wrap as phi-scaled covariance, compute
2541 // frequentist quantities, and pass to smoothing-correction cubature.
2542 beta_covariance = Some(gam_problem::dispersion_cov::PhiScaledCovariance::wrap(
2543 scaled_covariance(h_inv.clone(), cov_scale),
2544 ));
2545
2546 // Frequentist covariance Ve = F H⁻¹ φ and influence matrix F = H⁻¹ X'WX.
2547 // Both require the full unscaled inverse; computed in original basis.
2548 //
2549 // The canonical penalties live in the TRANSFORMED frame, while
2550 // `h_inv` is the ORIGINAL-basis inverse — assemble S(λ) in the
2551 // transformed frame and map it through the same congruence as the
2552 // Hessian (`S_orig = Qs·S_t·Qsᵀ`, issue #1027). Pairing the
2553 // transformed-frame S directly with the original-frame inverse made
2554 // `F` (and everything reconstructed from it) frame-inconsistent.
2555 let p_t = qs.ncols();
2556 let mut s_t = Array2::<f64>::zeros((p_t, p_t));
2557 for (kk, cp) in pirls_res
2558 .reparam_result
2559 .canonical_transformed
2560 .iter()
2561 .enumerate()
2562 {
2563 if kk >= lambdas.len() {
2564 continue;
2565 }
2566 let r = &cp.col_range;
2567 let local = cp.local_ref();
2568 let lam = lambdas[kk];
2569 for i in 0..cp.block_dim() {
2570 for j in 0..cp.block_dim() {
2571 s_t[[r.start + i, r.start + j]] += lam * local[[i, j]];
2572 }
2573 }
2574 }
2575 let mut s_mat = qs.dot(&s_t).dot(&qs.t());
2576 gam_linalg::matrix::symmetrize_in_place(&mut s_mat);
2577 // Influence matrix F = I − H⁻¹·S(λ) = H⁻¹·X'WX. This is a product
2578 // of two symmetric matrices and is therefore generally NOT
2579 // symmetric; it must not be symmetrized — `gam_linalg::matrix::symmetrize_in_place(F)`
2580 // both breaks the H·F = X'WX consistency identity (so any
2581 // downstream code that reconstructs X'WX from H·F lands on an
2582 // asymmetric/indefinite matrix) AND corrupts the frequentist
2583 // covariance `Ve = F·H⁻¹·φ` (since (F_sym)·H⁻¹ ≠ H⁻¹·X'WX·H⁻¹)
2584 // AND distorts the Wood-corrected reference d.f.
2585 // `tr(F_jj)² / tr(F_jj²)` consumed by `smooth_test::reference_df`
2586 // (tr(F²) ≠ tr(F_sym²) in general). See issue #1027.
2587 let mut f_mat = Array2::<f64>::eye(p_cov);
2588 f_mat -= &h_inv.dot(&s_mat);
2589 let mut ve = f_mat.dot(h_inv);
2590 ve *= cov_scale;
2591 gam_linalg::matrix::symmetrize_in_place(&mut ve);
2592 // X'WX = H − S(λ) in the original basis — the genuine PSD weighted
2593 // Gram, reconstructed from the same `penalized_hessian` and `s_mat`
2594 // that define `F = H⁻¹X'WX` (issue #1027). Stored directly so the
2595 // WPS corrected-EDF correction never has to recover it from an
2596 // inconsistent `H·F` product.
2597 let mut xwx = &penalized_hessian - &s_mat;
2598 gam_linalg::matrix::symmetrize_in_place(&mut xwx);
2599 weighted_gram = Some(xwx);
2600 coefficient_influence = Some(f_mat);
2601 beta_covariance_frequentist = Some(ve);
2602 }
2603
2604 // Smoothing-parameter correction (first-order delta + optional cubature).
2605 // Passes None for large models; compute_smoothing_correction_auto falls
2606 // back to first-order correction when no base covariance is supplied.
2607 // `cov_scale` is the coefficient-covariance multiplier at the optimum
2608 // (σ̂² for profiled Gaussian, 1 for every weight-carries-dispersion
2609 // family). The cubature path multiplies its dispersion-free curvature
2610 // block `E_ρ[H(ρ)⁻¹] − H_opt⁻¹` by this scale so the FULL cubature
2611 // correction lands on the same c² variance scale as `Vb = cov_scale·H_opt⁻¹`
2612 // (#582); the var_beta = Cov_ρ[β̂] block is already on that scale and
2613 // stays unscaled.
2614 let smoothing_outcome = reml_state.compute_smoothing_correction_auto(
2615 &final_rho,
2616 &pirls_res,
2617 beta_covariance_unscaled.as_ref(),
2618 cov_scale,
2619 finalgrad_norm,
2620 );
2621 rho_covariance = smoothing_outcome.rho_covariance().cloned();
2622 smoothing_correction = smoothing_outcome.into_correction();
2623
2624 // Tier-0 marginal-smoothing certificate (#938): while the REML objective
2625 // is still live, sample the outer criterion around the converged ρ̂ to
2626 // read the PSIS k̂ that says whether the plug-in + first-order V_ρ
2627 // correction is adequate. This is the objective-lifecycle seam — the
2628 // certificate runs against the SAME objective the fit converged on, so
2629 // its criterion is the fit's own bit-for-bit (no retain/rebuild). Absent
2630 // when there are no smoothing parameters or the outer Hessian is
2631 // unavailable; never fatal. Superseded intermediate fits skip this block
2632 // and the caller must refit with a live objective before returning that
2633 // model. When the certificate reads Escalate, the auto-selected escalation
2634 // tier (quadrature for K≤4, NUTS over ρ for K≤16, honest Unavailable
2635 // beyond) runs at this same live seam.
2636 if !opts.skip_rho_posterior_inference {
2637 (rho_posterior_certificate, rho_posterior_escalation) =
2638 reml_state.rho_posterior_inference(&final_rho, None);
2639 }
2640
2641 // Standard errors: prefer the diagonal of the full inverse when
2642 // available; otherwise use the factorised Hessian from the EDF pass
2643 // (in transformed basis) to compute exact diagonal of H_orig⁻¹ =
2644 // Qs H_t⁻¹ Qs' via chunked solve-on-demand. Memory per chunk:
2645 // 2 × p × COV_SE_CHUNK × 8 bytes.
2646 beta_standard_errors = if let Some(ref h_inv) = beta_covariance_unscaled {
2647 // Fast path: SE from stored full inverse (already phi-scaled via
2648 // beta_covariance, but we need the unscaled diagonal here).
2649 let raw_se = Array1::from_iter(
2650 h_inv
2651 .diag()
2652 .iter()
2653 .map(|&v| (cov_scale * v.max(0.0)).sqrt()),
2654 );
2655 Some(raw_se)
2656 } else if let Some(ref factor_t) = edf_factor {
2657 // Solve-on-demand: process columns of Qs^T in chunks.
2658 // Qs is (p_cov × p_t) orthogonal. H_orig⁻¹ = Qs H_t⁻¹ Qs'.
2659 // (H_orig⁻¹)_{ii} = Qs[i,:] · H_t⁻¹ · Qs[i,:]'
2660 // Batch: column i of Qs^T is row i of Qs. Solve H_t Z = Qs^T[:,chunk]
2661 // then dot each solution column back with the corresponding Qs row.
2662 let mut diag_inv = Array1::<f64>::zeros(p_cov);
2663 let mut col_start = 0usize;
2664 while col_start < p_cov {
2665 let col_end = (col_start + COV_SE_CHUNK).min(p_cov);
2666 let chunk = col_end - col_start;
2667 // qs.t() has shape (p_t, p_cov); slice to (p_t, chunk).
2668 let rhs = qs.t().slice(ndarray::s![.., col_start..col_end]).to_owned();
2669 match factor_t.solvemulti(&rhs) {
2670 Ok(z_chunk) => {
2671 // z_chunk is (p_t × chunk).
2672 // (H_orig⁻¹)_{ii} = qs.row(i) · z_chunk.column(i - col_start)
2673 for local_i in 0..chunk {
2674 let global_i = col_start + local_i;
2675 let qs_row = qs.row(global_i);
2676 let z_col = z_chunk.column(local_i);
2677 diag_inv[global_i] = qs_row.dot(&z_col);
2678 }
2679 }
2680 Err(e) => {
2681 log::warn!(
2682 "SE solve-on-demand failed at chunk {col_start}..{col_end}: {e}"
2683 );
2684 // Leave remaining entries as 0 (no SE).
2685 break;
2686 }
2687 }
2688 col_start = col_end;
2689 }
2690 let se = diag_inv.mapv(|v| (cov_scale * v.max(0.0)).sqrt());
2691 if se.iter().all(|v| v.is_finite()) {
2692 Some(se)
2693 } else {
2694 log::warn!("SE solve-on-demand produced non-finite entries; discarding");
2695 None
2696 }
2697 } else {
2698 None
2699 };
2700
2701 // Vp = Vb + J·V_ρ·Jᵀ, both terms on the SAME dispersion (variance) scale.
2702 //
2703 // The smoothing correction is built from the coefficient sensitivities
2704 // J = dβ̂/dρ = −H⁻¹(λ_k S_k(β̂ − μ_k)), which are linear in β̂, and from
2705 // V_ρ = (∇²_ρρ V)⁻¹. Under a Gaussian rescaling y → c·y the fit is exactly
2706 // equivariant: β̂ → c·β̂ (so J → c·J), H is response-scale-invariant, the
2707 // REML/LAML cost gains only a ρ-independent (n/2)·log(c²) offset (so its
2708 // ρ-gradient and ρ-Hessian — hence V_ρ — are dispersion-free), and φ̂ → c²·φ̂.
2709 // Therefore J·V_ρ·Jᵀ ∝ c · c⁰ · c = c², i.e. the correction is already on
2710 // the c² variance scale, exactly like Vb = φ̂·H⁻¹ ∝ c². It must be added
2711 // directly to Vb. Multiplying it by cov_scale
2712 // (≈ c²) again would make the correction scale as c⁴, inflating every
2713 // predict() interval for large-magnitude responses (#582). cov_scale is
2714 // applied once, where it belongs: in Vb = scaled_covariance(H⁻¹, cov_scale).
2715 beta_covariance_corrected = match (&beta_covariance, &smoothing_correction) {
2716 (Some(base_cov), Some(corr)) if base_cov.as_array().dim() == corr.dim() => {
2717 let mut corrected = base_cov.as_array().clone();
2718 corrected += corr;
2719 gam_linalg::matrix::symmetrize_in_place(&mut corrected);
2720 Some(corrected)
2721 }
2722 (Some(_), Some(corr)) => {
2723 log::warn!(
2724 "Skipping corrected covariance: dimension mismatch (base {:?}, corr {:?})",
2725 beta_covariance.as_ref().map(|c| c.as_array().dim()),
2726 Some(corr.dim())
2727 );
2728 None
2729 }
2730 _ => None,
2731 };
2732 beta_standard_errors_corrected = beta_covariance_corrected.as_ref().map(se_from_covariance);
2733 }
2734 let inference = opts.compute_inference.then(|| FitInference {
2735 edf_by_block,
2736 penalty_block_trace,
2737 edf_total,
2738 smoothing_correction,
2739 penalized_hessian: penalized_hessian.into(),
2740 working_weights: pirls_res.solveweights.clone(),
2741 working_response: pirls_res.solveworking_response.clone(),
2742 reparam_qs: Some(pirls_res.reparam_result.qs.clone()),
2743 dispersion,
2744 beta_covariance,
2745 beta_standard_errors,
2746 beta_covariance_corrected,
2747 beta_standard_errors_corrected,
2748 beta_covariance_frequentist,
2749 coefficient_influence,
2750 weighted_gram,
2751 bias_correction_beta,
2752 });
2753
2754 let pirls_status = pirls_res.status;
2755 let likelihood_scale_field = pirls_res.likelihood.scale;
2756 let log_likelihood = crate::pirls::calculate_loglikelihood_omitting_constants(
2757 y_o.view(),
2758 &pirls_res.finalmu,
2759 &pirls_res.likelihood,
2760 w_o.view(),
2761 );
2762
2763 // Report the fitted Negative-Binomial overdispersion `theta` on the family
2764 // variant (issue #802). Unlike the Gamma shape / Tweedie φ (which live only
2765 // in `likelihood_scale`) and the Beta φ (whose estimate downstream consumers
2766 // read from `likelihood_scale` via a separate override), NB `theta` is the
2767 // *canonical* parameter on `ResponseFamily::NegativeBinomial { theta }` that
2768 // every NB predictive consumer (prediction-interval variance, quadrature,
2769 // sampling, `generate` draws) reads directly off the saved family. The fit
2770 // updated it in lock-step with the `EstimatedNegBinTheta` scale metadata via
2771 // `with_negbin_theta`, so threading that fitted `theta` back onto the reported
2772 // family is what makes those consumers see the data's overdispersion instead
2773 // of the seed. Non-NB families keep `opts.family` (their estimates live in the
2774 // scale metadata), preserving the existing seed-in-family convention.
2775 let mut reported_family = opts.family.clone();
2776 if let (
2777 ResponseFamily::NegativeBinomial { theta, .. },
2778 LikelihoodScaleMetadata::EstimatedNegBinTheta {
2779 theta: fitted_theta,
2780 },
2781 ) = (&mut reported_family.response, likelihood_scale_field)
2782 {
2783 *theta = fitted_theta;
2784 }
2785
2786 let result = ExternalOptimResult {
2787 beta: beta_orig_internal,
2788 lambdas: lambdas.to_owned(),
2789 likelihood_family: reported_family,
2790 likelihood_scale: likelihood_scale_field,
2791 log_likelihood_normalization: LogLikelihoodNormalization::OmittingResponseConstants,
2792 log_likelihood,
2793 standard_deviation,
2794 iterations: iters,
2795 finalgrad_norm,
2796 outer_converged: outer_result.converged,
2797 pirls_status,
2798 deviance: pirls_res.deviance,
2799 stable_penalty_term: pirls_res.stable_penalty_term,
2800 used_device: pirls_res.used_device,
2801 max_abs_eta: pirls_res.max_abs_eta,
2802 constraint_kkt: pirls_res.constraint_kkt.clone(),
2803 artifacts: FitArtifacts {
2804 pirls: Some(pirls_res),
2805 criterion_certificate: outer_result.criterion_certificate.clone(),
2806 rho_posterior_certificate,
2807 rho_posterior_escalation,
2808 rho_covariance,
2809 ..Default::default()
2810 },
2811 inference,
2812 reml_score: outer_result.final_value,
2813 outer_cost_evals: usize::try_from(*reml_state.arena.cost_eval_count.read().unwrap())
2814 .unwrap_or(usize::MAX),
2815 inner_pirls_solves: usize::try_from(
2816 reml_state
2817 .arena
2818 .inner_pirls_solve_count
2819 .load(std::sync::atomic::Ordering::Relaxed),
2820 )
2821 .unwrap_or(usize::MAX),
2822 fitted_link: if let Some(state) = final_mixture_state {
2823 FittedLinkState::Mixture {
2824 state,
2825 covariance: final_mixture_param_covariance,
2826 }
2827 } else if let Some(state) = opts.latent_cloglog {
2828 FittedLinkState::LatentCLogLog { state }
2829 } else if let Some(state) = final_sas_state {
2830 if opts.family.is_binomial_sas() {
2831 FittedLinkState::Sas {
2832 state,
2833 covariance: final_sas_param_covariance,
2834 }
2835 } else if opts.family.is_binomial_beta_logistic() {
2836 FittedLinkState::BetaLogistic {
2837 state,
2838 covariance: final_sas_param_covariance,
2839 }
2840 } else {
2841 FittedLinkState::Standard(None)
2842 }
2843 } else {
2844 FittedLinkState::Standard(None)
2845 },
2846 };
2847 Ok(conditioning.backtransform_external_result(result))
2848}
2849
2850#[cfg(test)]
2851mod blended_mixture_link_solve_tests {
2852 //! Regression coverage for issue #1598.
2853 //!
2854 //! A `blended(logit, probit)` learnable link NESTS each pure component
2855 //! (mixing weight 1 on one inverse link). On data generated from a plain
2856 //! logit link, the joint (smoothing + mixing-weight ρ) solve must therefore
2857 //! converge and recover a near-logit fit — at the ρ=0 seed the mixture
2858 //! collapses to its well-conditioned first component (logit).
2859 //!
2860 //! Before the fix the inner P-IRLS observed-Hessian curvature build aborted
2861 //! with "observed Hessian curvature is not positive finite at row N" the
2862 //! moment a single row's residual-dependent observed weight went
2863 //! non-positive (which the non-canonical mixture path produces on finite
2864 //! data even at the logit-collapsed seed). That abort propagated up as
2865 //! "no candidate seeds passed outer startup validation (mixture/SAS flexible
2866 //! link)" and the whole fit failed. The downstream solver weight floor
2867 //! (`solver_hessian_weights_into`) is designed to clamp exactly those rows
2868 //! to keep the Newton system SPD, so the array build must tolerate a finite
2869 //! non-positive observed weight rather than hard-bail on it.
2870
2871 use super::optimize_external_design;
2872 use crate::estimate::external_options::ExternalOptimOptions;
2873 use gam_problem::{
2874 InverseLink, LikelihoodSpec, LinkComponent, MixtureLinkSpec, ResponseFamily, StandardLink,
2875 };
2876 use gam_terms::smooth::BlockwisePenalty;
2877 use ndarray::{Array1, Array2};
2878
2879 #[test]
2880 fn blended_logit_probit_fits_clean_logit_data() {
2881 // --- Synthetic clean logit data ---------------------------------
2882 // y_i ~ Bernoulli(logistic(eta_i)), eta_i = b0 + b1 * x_i.
2883 let n = 120usize;
2884 let b0 = -0.3_f64;
2885 let b1 = 1.7_f64;
2886 // Deterministic pseudo-random x and Bernoulli draws so the test is
2887 // reproducible without an RNG dependency.
2888 let mut x = Array1::<f64>::zeros(n);
2889 let mut y = Array1::<f64>::zeros(n);
2890 let mut true_p = Array1::<f64>::zeros(n);
2891 let mut seed = 0x9E3779B97F4A7C15u64;
2892 let mut next_unit = || {
2893 // SplitMix64 → uniform in [0, 1).
2894 seed = seed.wrapping_add(0x9E3779B97F4A7C15);
2895 let mut z = seed;
2896 z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
2897 z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
2898 z ^= z >> 31;
2899 (z >> 11) as f64 / (1u64 << 53) as f64
2900 };
2901 for i in 0..n {
2902 let xi = -2.5 + 5.0 * next_unit();
2903 let eta = b0 + b1 * xi;
2904 let p = 1.0 / (1.0 + (-eta).exp());
2905 x[i] = xi;
2906 true_p[i] = p;
2907 y[i] = if next_unit() < p { 1.0 } else { 0.0 };
2908 }
2909
2910 // Design with intercept + slope, both unpenalized (zero penalty block).
2911 let mut design = Array2::<f64>::zeros((n, 2));
2912 for i in 0..n {
2913 design[[i, 0]] = 1.0;
2914 design[[i, 1]] = x[i];
2915 }
2916 let w = Array1::<f64>::ones(n);
2917 let offset = Array1::<f64>::zeros(n);
2918 let penalty = BlockwisePenalty::new(0..2, Array2::<f64>::zeros((2, 2)));
2919
2920 // blended(logit, probit) with the documented ρ=0 (pure first-component)
2921 // seed: mixing weight 1 on logit, 0 on probit.
2922 let mix_spec = MixtureLinkSpec {
2923 components: vec![LinkComponent::Logit, LinkComponent::Probit],
2924 initial_rho: Array1::zeros(1),
2925 };
2926
2927 let opts = ExternalOptimOptions {
2928 family: LikelihoodSpec::new(
2929 ResponseFamily::Binomial,
2930 InverseLink::Standard(StandardLink::Logit),
2931 ),
2932 latent_cloglog: None,
2933 mixture_link: Some(mix_spec),
2934 optimize_mixture: true,
2935 sas_link: None,
2936 optimize_sas: false,
2937 compute_inference: false,
2938 skip_rho_posterior_inference: true,
2939 max_iter: 200,
2940 tol: 1e-9,
2941 nullspace_dims: vec![2],
2942 linear_constraints: None,
2943 firth_bias_reduction: None,
2944 penalty_shrinkage_floor: None,
2945 rho_prior: Default::default(),
2946 kronecker_penalty_system: None,
2947 kronecker_factored: None,
2948 persist_warm_start_disk: false,
2949 };
2950
2951 // The joint mixture solve must converge (no error) on data its own pure
2952 // logit component fits trivially.
2953 let result = optimize_external_design(
2954 y.view(),
2955 w.view(),
2956 design.clone(),
2957 offset.view(),
2958 vec![penalty],
2959 &opts,
2960 )
2961 .expect("blended(logit, probit) joint solve must converge on clean logit data");
2962
2963 // Reconstruct the fitted linear predictor η̂ = X·β̂ and correlate it
2964 // with the true probabilities. The mixture inverse link is monotone, so
2965 // a faithful fit makes corr(η̂, true_p) ≈ 1.
2966 assert_eq!(result.beta.len(), 2, "fitted β has intercept + slope");
2967 let mut eta_hat = Array1::<f64>::zeros(n);
2968 for i in 0..n {
2969 eta_hat[i] = design[[i, 0]] * result.beta[0] + design[[i, 1]] * result.beta[1];
2970 }
2971 let corr = pearson(&eta_hat, &true_p);
2972 assert!(
2973 corr > 0.95,
2974 "blended(logit, probit) on clean logit data must recover a near-logit \
2975 fit: corr(η̂, true_p)={corr:.4} (β̂={:?}, λ̂={:?})",
2976 result.beta.as_slice().unwrap(),
2977 result.lambdas.as_slice().unwrap(),
2978 );
2979 // The recovered slope must keep the sign and order of magnitude of the
2980 // generating coefficient (it is not penalized).
2981 assert!(
2982 result.beta[1] > 0.5,
2983 "recovered slope must stay strongly positive: β̂₁={}",
2984 result.beta[1]
2985 );
2986 }
2987
2988 fn pearson(a: &Array1<f64>, b: &Array1<f64>) -> f64 {
2989 let n = a.len() as f64;
2990 let ma = a.sum() / n;
2991 let mb = b.sum() / n;
2992 let mut cov = 0.0;
2993 let mut va = 0.0;
2994 let mut vb = 0.0;
2995 for i in 0..a.len() {
2996 let da = a[i] - ma;
2997 let db = b[i] - mb;
2998 cov += da * db;
2999 va += da * da;
3000 vb += db * db;
3001 }
3002 cov / (va.sqrt() * vb.sqrt())
3003 }
3004}