Skip to main content

gam_solve/estimate/
optimizer.rs

1use super::*;
2use gam_problem::dispersion_cov::se_from_covariance;
3use std::sync::atomic::{AtomicUsize, Ordering};
4
5/// Optimize smoothing parameters for an external design using the same REML/LAML machinery.
6pub fn optimize_external_design<X>(
7    y: ArrayView1<'_, f64>,
8    w: ArrayView1<'_, f64>,
9    x: X,
10    offset: ArrayView1<'_, f64>,
11    s_list: Vec<BlockwisePenalty>,
12    opts: &ExternalOptimOptions,
13) -> Result<ExternalOptimResult, EstimationError>
14where
15    X: Into<DesignMatrix>,
16{
17    optimize_external_designwith_heuristic_lambdas(y, w, x, offset, s_list, None, opts)
18}
19
20/// Same as `optimize_external_design`, but allows heuristic λ warm-start seeds
21/// for the outer smoothing search.
22pub fn optimize_external_designwith_heuristic_lambdas<X>(
23    y: ArrayView1<'_, f64>,
24    w: ArrayView1<'_, f64>,
25    x: X,
26    offset: ArrayView1<'_, f64>,
27    s_list: Vec<BlockwisePenalty>,
28    heuristic_lambdas: Option<&[f64]>,
29    opts: &ExternalOptimOptions,
30) -> Result<ExternalOptimResult, EstimationError>
31where
32    X: Into<DesignMatrix>,
33{
34    let specs: Vec<PenaltySpec> = s_list
35        .into_iter()
36        .map(PenaltySpec::from_blockwise)
37        .collect();
38    optimize_external_designwith_heuristic_lambdas_andwarm_start(
39        y,
40        w,
41        x,
42        offset,
43        specs,
44        heuristic_lambdas,
45        None,
46        opts,
47    )
48}
49
50pub(crate) fn external_reml_seed_config(k: usize, link: LinkFunction) -> SeedConfig {
51    let gaussian = matches!(link, LinkFunction::Identity);
52    if k >= REML_SEED_SCREENING_RHO_CAP {
53        let seed_budget = if gaussian { 1 } else { 2 };
54        return SeedConfig {
55            bounds: (-12.0, 12.0),
56            max_seeds: seed_budget,
57            seed_budget,
58            risk_profile: if gaussian {
59                SeedRiskProfile::Gaussian
60            } else {
61                SeedRiskProfile::GeneralizedLinear
62            },
63            screen_max_inner_iterations: SeedConfig::default().screen_max_inner_iterations,
64            num_auxiliary_trailing: 0,
65            over_smoothing_probe_rho: None,
66        };
67    }
68    SeedConfig {
69        bounds: (-12.0, 12.0),
70        max_seeds: if gaussian && k <= 4 {
71            2
72        } else if gaussian && k <= 12 {
73            4
74        } else if gaussian {
75            6
76        } else if k <= 4 {
77            6
78        } else if k <= 12 {
79            8
80        } else {
81            10
82        },
83        seed_budget: if gaussian && k <= 6 { 1 } else { 2 },
84        risk_profile: if gaussian {
85            SeedRiskProfile::Gaussian
86        } else {
87            SeedRiskProfile::GeneralizedLinear
88        },
89        screen_max_inner_iterations: SeedConfig::default().screen_max_inner_iterations,
90        num_auxiliary_trailing: 0,
91        over_smoothing_probe_rho: None,
92    }
93}
94
95fn reml_inner_progress_feedback(
96    state: &crate::estimate::reml::RemlState<'_>,
97) -> crate::rho_optimizer::InnerProgressFeedback {
98    crate::rho_optimizer::InnerProgressFeedback {
99        cap: Arc::clone(&state.outer_inner_cap),
100        accepted_iter: Arc::new(AtomicUsize::new(0)),
101        last_iters: Arc::clone(&state.last_inner_iters),
102        last_converged: Arc::clone(&state.last_inner_converged),
103        ift_residual: Arc::clone(&state.last_ift_prediction_residual),
104        accept_rho: Arc::clone(&state.last_pirls_accept_rho),
105    }
106}
107
108fn with_reml_beta_seed_hook<'state, 'data>() -> impl FnMut(
109    &mut &'state mut crate::estimate::reml::RemlState<'data>,
110    &Array1<f64>,
111) -> Result<
112    crate::rho_optimizer::SeedOutcome,
113    EstimationError,
114> {
115    |state, beta| {
116        // The REML state stores β as a starting-iterate HINT and validates
117        // its width against the design (`self.p`) at store time, silently
118        // dropping a mismatched or non-finite hint rather than faulting
119        // (see `setwarm_start_original_beta`). A wrong-length seed is
120        // therefore never an error: a row-relaxed cross-fold prefix seed
121        // degrades to a ρ-only resume, exactly the desired warm-start
122        // behaviour. The slot's post-call state (the supplied β if it fit,
123        // else the prior state) is what the next eval warm-starts from, so
124        // `Installed` is the correct contract reply.
125        state.setwarm_start_original_beta(Some(beta.view()));
126        Ok(crate::rho_optimizer::SeedOutcome::Installed)
127    }
128}
129
130enum RemlInnerCapGuardArm {
131    Standard,
132    MixtureSas,
133}
134
135/// Re-run one full-inner-tolerance `compute_cost` at the converged operating
136/// point so the cached warm-start β is no longer pinned to whatever coarse cap
137/// the outer-aware inner-PIRLS schedule last set (path #3; see the call-site
138/// comments).
139///
140/// `rho` MUST be the smoothing-only penalty-block log-λ — its length equals the
141/// number of penalty blocks, because `compute_cost` exponentiates the whole
142/// vector into the penalty λ vector. For the parameterized-link arms
143/// (`MixtureSas`: SAS / Beta-Logistic / mixture / blended) the outer optimizer
144/// works in an augmented θ that trails the link parameters after the smoothing
145/// block; the caller must slice that block out (and install the link state on
146/// `state`) via `apply_link_theta` BEFORE calling this guard. Passing the raw
147/// augmented θ here over-counts the lambdas and faults the reparameterizer
148/// ("Lambda count mismatch", #1571). The `arm` only selects the log label.
149fn run_outer_inner_cap_guard(
150    state: &mut crate::estimate::reml::RemlState<'_>,
151    rho: &Array1<f64>,
152    arm: RemlInnerCapGuardArm,
153) -> Result<(), EstimationError> {
154    let prev_cap = state.outer_inner_cap.swap(0, Ordering::Relaxed);
155    if prev_cap != 0 {
156        let guard_start = std::time::Instant::now();
157        state.compute_cost(rho)?;
158        match arm {
159            RemlInnerCapGuardArm::Standard => log::info!(
160                "[OUTER guard] convergence-guard re-eval at converged ρ done (prev_cap={prev_cap}, elapsed={:.3}s)",
161                guard_start.elapsed().as_secs_f64()
162            ),
163            RemlInnerCapGuardArm::MixtureSas => log::info!(
164                "[OUTER guard] convergence-guard re-eval at converged ρ done (mixture/SAS arm; prev_cap={prev_cap}, elapsed={:.3}s)",
165                guard_start.elapsed().as_secs_f64()
166            ),
167        }
168    } else if matches!(arm, RemlInnerCapGuardArm::Standard) {
169        log::debug!("[OUTER guard] schedule never lifted (prev_cap=0); skipping refit");
170    }
171    Ok(())
172}
173
174/// The weighted-mean response level an unpenalized intercept would absorb, used
175/// to center the response during outer REML λ-selection (issue #1000).
176///
177/// For an identity-link Gaussian fit, adding a constant to the response only
178/// shifts the intercept, so λ̂ and the smooth shape must be invariant to the
179/// response mean. The outer score/gradient nonetheless accumulate
180/// `yᵀy`-magnitude sufficient statistics, so a large response mean costs
181/// precision and drifts λ̂. Returns `Some(m)` with
182/// `m = Σ wᵢ (yᵢ − offsetᵢ) / Σ wᵢ` — the constant a pure offset relabeling
183/// moves into the intercept — so the caller can subtract it and keep the working
184/// response `O(σ)` regardless of the mean.
185///
186/// Returns `None` (do not center, exact previous behaviour) unless the fit is
187/// identity-link Gaussian and carries an unpenalized intercept column to absorb
188/// the shift, and has no linear constraints that could pin the intercept. A zero
189/// or non-finite mean also returns `None` — there is nothing to gain.
190fn gaussian_identity_response_center(
191    cfg: &RemlConfig,
192    conditioning: &ParametricColumnConditioning,
193    has_linear_constraints: bool,
194    y: ArrayView1<'_, f64>,
195    w: ArrayView1<'_, f64>,
196    offset: ArrayView1<'_, f64>,
197) -> Option<f64> {
198    if has_linear_constraints
199        || conditioning.intercept_idx.is_none()
200        || !matches!(cfg.likelihood.spec.response, ResponseFamily::Gaussian)
201        || !matches!(cfg.link_function(), LinkFunction::Identity)
202    {
203        return None;
204    }
205    let mut weight_sum = 0.0_f64;
206    let mut weighted = KahanSum::default();
207    for ((&yi, &wi), &oi) in y.iter().zip(w.iter()).zip(offset.iter()) {
208        if wi > 0.0 {
209            weight_sum += wi;
210            weighted.add(wi * (yi - oi));
211        }
212    }
213    if weight_sum <= 0.0 {
214        return None;
215    }
216    let m = weighted.sum() / weight_sum;
217    (m.is_finite() && m != 0.0).then_some(m)
218}
219
220/// The multiplicative scale an identity-link Gaussian outer REML λ-search should
221/// divide the (already centered) response by so its magnitude is `O(1)` for the
222/// duration of the search (issue #1127).
223///
224/// Replacing the response `y` by `a·y` (`a > 0`) for an identity-link Gaussian
225/// fit must rescale the entire fit by `a` and leave `λ̂` / EDF unchanged: the
226/// penalized normal equations are exactly linear in `y`, so `β̂(a·y)=a·β̂(y)`
227/// at any fixed `λ`, and the profiled REML criterion is `a`-invariant up to the
228/// additive constant `−(n−p)·ln a` (the dispersion `σ̂²` absorbs the `a²`).
229/// Numerically, though, the outer λ-selection's convergence band is keyed to an
230/// *absolute* objective scale (the inner-solve `objective_scale.max(1.0)` floor
231/// and the outer `1e-6` gradient floor): when the whole Gaussian objective is
232/// `O(a²) ≪ 1` those floors swamp the real signal and the optimizer declares
233/// premature convergence at an over-smoothed `λ` — silently over-smoothing
234/// small-magnitude responses (strains, volts, mole fractions, returns;
235/// `a ≈ 1e-6`). Normalizing the working response to `O(1)` makes the absolute
236/// floors track the true signal, restoring scale equivariance.
237///
238/// Returns `Some(s)` with `s = √(Σ wᵢ (yᵢ − mean)² / Σ wᵢ)` — the weighted RMS
239/// of the centered response — so the caller can divide by it and keep the outer
240/// working response `O(1)` regardless of magnitude. The same gate as
241/// [`gaussian_identity_response_center`] applies (identity-link Gaussian with an
242/// unpenalized intercept and no linear constraints); a non-finite, zero, or
243/// already-`O(1)` RMS returns `None` (do not scale, exact previous behaviour) —
244/// scaling near unity buys nothing and only risks a needless allocation.
245fn gaussian_identity_response_scale(
246    cfg: &RemlConfig,
247    conditioning: &ParametricColumnConditioning,
248    has_linear_constraints: bool,
249    center: f64,
250    y: ArrayView1<'_, f64>,
251    w: ArrayView1<'_, f64>,
252    offset: ArrayView1<'_, f64>,
253) -> Option<f64> {
254    if has_linear_constraints
255        || conditioning.intercept_idx.is_none()
256        || !matches!(cfg.likelihood.spec.response, ResponseFamily::Gaussian)
257        || !matches!(cfg.link_function(), LinkFunction::Identity)
258    {
259        return None;
260    }
261    // A multiplicative response rescale `y → y/s` must be matched by `η → η/s`
262    // for the residual to scale cleanly. The intercept and smooth coefficients
263    // scale freely, but a *fixed* offset column does not — scaling the working
264    // response while leaving the offset on its original scale would change the
265    // residual geometry, not just its magnitude. The offset is shared verbatim
266    // into the outer state and reused by the accept-fit, so rather than thread a
267    // separately scaled copy everywhere, restrict the (rare) offset case to the
268    // exact previous path: only normalize when there is no nonzero offset.
269    if offset.iter().any(|&o| o != 0.0) {
270        return None;
271    }
272    let mut weight_sum = 0.0_f64;
273    let mut weighted_sq = KahanSum::default();
274    for ((&yi, &wi), &oi) in y.iter().zip(w.iter()).zip(offset.iter()) {
275        if wi > 0.0 {
276            weight_sum += wi;
277            let centered = (yi - oi) - center;
278            weighted_sq.add(wi * centered * centered);
279        }
280    }
281    if weight_sum <= 0.0 {
282        return None;
283    }
284    let rms = (weighted_sq.sum() / weight_sum).sqrt();
285    // Only normalize when the magnitude is far enough from `O(1)` to matter; a
286    // factor within ~one order of magnitude of unity cannot push the objective
287    // through the absolute floors, so leave the exact previous path untouched.
288    (rms.is_finite() && rms > 0.0 && !(0.1..=10.0).contains(&rms)).then_some(rms)
289}
290
291pub(crate) fn optimize_external_designwith_heuristic_lambdas_andwarm_start<X>(
292    y: ArrayView1<'_, f64>,
293    w: ArrayView1<'_, f64>,
294    x: X,
295    offset: ArrayView1<'_, f64>,
296    s_list: Vec<PenaltySpec>,
297    heuristic_lambdas: Option<&[f64]>,
298    warm_start_beta: Option<ArrayView1<'_, f64>>,
299    opts: &ExternalOptimOptions,
300) -> Result<ExternalOptimResult, EstimationError>
301where
302    X: Into<DesignMatrix>,
303{
304    if opts.family.is_binomial_mixture() && opts.mixture_link.is_none() {
305        crate::bail_invalid_estim!("BinomialMixture requires mixture_link specification");
306    }
307    let x = x.into();
308    if let Some(message) = row_mismatch_message(y.len(), w.len(), x.nrows(), offset.len()) {
309        crate::bail_invalid_estim!("{}", message);
310    }
311
312    let p = x.ncols();
313    // Raw design row count, captured before `x` is moved (line ~339); used by the
314    // #1266 null-space shrink-out escape's `n ≥ 2·p` determinacy gate, which must
315    // match `relax_smoothing_rho_prior`'s well-determined gate exactly.
316    let n_design_rows = x.nrows();
317    validate_penalty_specs(&s_list, p, "optimize_external_design")?;
318    let (canonical, active_nullspace_dims) = gam_terms::construction::canonicalize_penalty_specs(
319        &s_list,
320        &opts.nullspace_dims,
321        p,
322        "optimize_external_design",
323    )?;
324    let conditioning = ParametricColumnConditioning::infer_from_penalty_specs(&x, &s_list);
325    let x_fit = conditioning.apply_to_design(&x);
326    let fit_linear_constraints =
327        conditioning.transform_linear_constraints_to_internal(opts.linear_constraints.clone());
328    let k = canonical.len();
329    if active_nullspace_dims.len() != k {
330        crate::bail_invalid_estim!(
331            "nullspace_dims length mismatch: expected {k} entries for active penalties, got {}",
332            active_nullspace_dims.len()
333        );
334    }
335    let (cfg, effective_sas_link) = resolved_external_config(opts)?;
336    reject_prefit_unpenalized_rank_deficiency(w, &x_fit, &canonical)?;
337    reject_prefit_binomial_separation(&cfg, y, w, &x_fit, &canonical)?;
338
339    let design_kind = match &x {
340        DesignMatrix::Dense(_) => "dense",
341        DesignMatrix::Sparse(_) => "sparse",
342    };
343    log::info!(
344        "[GAM fit] n={} p={} k={} fam={:?} link={:?} X={} reml_iter={} firth={}",
345        y.len(),
346        p,
347        k,
348        opts.family,
349        cfg.link_function(),
350        design_kind,
351        opts.max_iter,
352        cfg.firth_bias_reduction
353    );
354
355    // Own the external arrays once; the conditioned design is shared through `reml_state`.
356    let y_o = y.to_owned();
357    let w_o = w.to_owned();
358    let x_o = x;
359    let offset_o = offset.to_owned();
360    let canonical_shared = Arc::new(canonical);
361    let cfg_shared = Arc::new(cfg.clone());
362
363    // Issue #1000: for an identity-link Gaussian fit with an unpenalized
364    // intercept, adding a constant `c` to the response is a *pure relabeling of
365    // the intercept* — the hat matrix annihilates the constant column, so the
366    // residuals, the profiled REML criterion, λ̂, and the smooth shape are all
367    // invariant to `c`. Numerically, though, the outer REML score/gradient
368    // accumulate `yᵀy`-magnitude sufficient statistics (e.g. the cached
369    // `XᵀW(y−offset)`), so an uncentered large-mean response injects a `c²`
370    // term that loses precision and drifts λ̂ — silently over-smoothing
371    // large-mean responses (Kelvin temperatures, financial levels, calendar
372    // years). Center the response by the (weighted) mean the intercept would
373    // absorb for the duration of the outer λ-search only: the constant lands in
374    // the intercept, which the final accept-fit below recovers *exactly* by
375    // re-fitting the original (uncentered) response at the REML-selected λ̂.
376    // This mirrors the existing column conditioning, which centers the design
377    // columns into the intercept for the same numerical reason.
378    let response_center = gaussian_identity_response_center(
379        &cfg,
380        &conditioning,
381        opts.linear_constraints.is_some(),
382        y_o.view(),
383        w_o.view(),
384        offset_o.view(),
385    );
386    // Issue #1127 (down-scale sibling of #1000): replacing the response `y` by
387    // `a·y` must rescale the whole fit by `a` and leave `λ̂`/EDF unchanged (the
388    // normal equations are exactly linear in `y`; the profiled REML criterion is
389    // `a`-invariant up to the additive `−(n−p)·ln a` the dispersion absorbs).
390    // But the outer λ-selection's convergence band is keyed to an *absolute*
391    // objective scale (an inner `objective_scale.max(1.0)` floor and a `1e-6`
392    // outer gradient floor); when the Gaussian objective is `O(a²) ≪ 1` those
393    // floors swamp the signal and the optimizer stops early at an over-smoothed
394    // `λ`. Normalize the (centered) working response to `O(1)` for the outer
395    // λ-search only, mirroring the #1000 centering: the final accept-fit below
396    // re-fits the *original* response at the REML-selected λ̂, so β, μ̂, σ̂² and
397    // every reported quantity stay exactly on the user's scale. `center` here is
398    // the constant the intercept already absorbs (so the scale is measured on the
399    // residual signal, not on the offset).
400    let response_scale = gaussian_identity_response_scale(
401        &cfg,
402        &conditioning,
403        opts.linear_constraints.is_some(),
404        response_center.unwrap_or(0.0),
405        y_o.view(),
406        w_o.view(),
407        offset_o.view(),
408    );
409    // The outer loop borrows the response for the lifetime of `reml_state`;
410    // the conditioned copy (when any) is owned at function scope so the borrow
411    // outlives the state. Off the Gaussian-identity path both `response_center`
412    // and `response_scale` are `None` and the outer loop borrows the original
413    // response verbatim — no allocation, no behavioural change. When only one is
414    // active we still apply just that transform. Both are exactly invertible by
415    // the accept-fit, which re-fits the original `y_o` at the selected λ̂.
416    let reml_y_conditioned: Option<Array1<f64>> = match (response_center, response_scale) {
417        (None, None) => None,
418        (center, scale) => {
419            let c = center.unwrap_or(0.0);
420            let s = scale.unwrap_or(1.0);
421            Some((&y_o - c) / s)
422        }
423    };
424    let reml_y_view = reml_y_conditioned
425        .as_ref()
426        .map_or_else(|| y_o.view(), |conditioned| conditioned.view());
427
428    let mut reml_state = RemlState::newwith_offset_shared(
429        reml_y_view,
430        x_fit,
431        w_o.view(),
432        offset_o.view(),
433        Arc::clone(&canonical_shared),
434        p,
435        Arc::clone(&cfg_shared),
436        Some(active_nullspace_dims.clone()),
437        None,
438        fit_linear_constraints.clone(),
439    )?;
440    reml_state.set_penalty_shrinkage_floor(opts.penalty_shrinkage_floor);
441    reml_state.set_rho_prior(opts.rho_prior.clone());
442    if let Some(kron) = opts.kronecker_penalty_system.clone() {
443        reml_state.set_kronecker_penalty_system(kron);
444    }
445    if let Some(kf) = opts.kronecker_factored.clone() {
446        reml_state.set_kronecker_factored(kf);
447    }
448    if opts.persist_warm_start_disk {
449        // Caller opted into cross-process resume (#1082): engage the on-disk
450        // warm-start layer. Default-false keeps replicate/CI loops disk-silent.
451        reml_state.enable_persistent_warm_start_disk();
452    }
453    reml_state.setwarm_start_original_beta(warm_start_beta);
454
455    // Term/margin-order invariance (#1538/#1539). The per-ρ-coordinate canonical
456    // keys label each coordinate by its placement-independent (penalty + data)
457    // content; the induced canonical permutation lets BOTH the objective-grid
458    // seed prepass AND the outer optimizer operate in an identical canonical
459    // coordinate layout for every term order. `None` when the coordinate count
460    // does not match the ρ-dimension (legacy native-order path, unchanged).
461    let canon_keys = reml_state.canonical_rho_keys(k);
462    let canon_perm: Option<Vec<usize>> = canon_keys
463        .as_ref()
464        .and_then(|keys| crate::rho_optimizer::canonical_permutation(keys));
465
466    let reml_seed_config = external_reml_seed_config(k, cfg.link_function());
467    let reml_tol = cfg.reml_convergence_tolerance;
468    let reml_max_iter = opts.max_iter;
469    let outer_eval_idx = AtomicUsize::new(0usize);
470    let mixture_optspec = if opts.optimize_mixture {
471        opts.mixture_link.clone()
472    } else {
473        None
474    };
475    let sas_optspec = if opts.optimize_sas {
476        effective_sas_link
477    } else {
478        None
479    };
480    let mixture_dim = mixture_optspec
481        .as_ref()
482        .map(|s| s.initial_rho.len())
483        .unwrap_or(0);
484    let sas_dim = if sas_optspec.is_some() { 2 } else { 0 };
485    let sasridgeweight = if sas_dim > 0 {
486        sas_log_deltaridgeweight()
487    } else {
488        0.0
489    };
490    // Negative-Binomial outer θ↔λ alternation (#1448). With θ estimated, the
491    // λ-search freezes θ (see `frozen_negbin_theta`, #1082) so the REML criterion
492    // `F(ρ) = REML(ρ, θ_frozen)` is stationary in ρ; the final accept-fit then
493    // ML-refreshes θ at the converged η. A *single* freeze→refresh leaves the
494    // selected ρ optimal only for `θ_frozen`, not for the refreshed `θ_final`.
495    // mgcv `nb()` instead alternates θ-estimation and λ-selection to a joint
496    // fixed point. Wrap the ρ-search + accept-fit in a bounded loop: after each
497    // refit, if the NB θ drifted beyond tolerance, re-freeze the search θ at the
498    // refreshed value, reset the surface caches that depend on it, and re-run the
499    // outer ρ search. The cap bounds the work; for every non-NB / user-fixed-θ
500    // fit the loop runs exactly once (the break condition is met immediately), so
501    // those fits are byte-identical to the pre-#1448 single-pass behaviour.
502    //
503    // 5% relative θ drift is the same band the diagnostic (#1082) flagged as the
504    // point beyond which the ρ-optimum for `θ_frozen` and `θ_final` can differ
505    // enough to matter; below it the one-refresh approximation is already joint-
506    // stationary to the criterion's tolerance.
507    const NEGBIN_THETA_JOINT_DRIFT_TOL: f64 = 5.0e-2;
508    const NEGBIN_OUTER_ALTERNATION_MAX_ROUNDS: usize = 8;
509    let mut final_rho;
510    let mut final_mixture_state;
511    let mut final_sas_state;
512    let mut final_mixture_param_covariance;
513    let mut final_sas_param_covariance;
514    let mut outer_result;
515    let mut pirls_res;
516    let mut negbin_alternation_round: usize = 0;
517    loop {
518        (
519            final_rho,
520            final_mixture_state,
521            final_sas_state,
522            final_mixture_param_covariance,
523            final_sas_param_covariance,
524            outer_result,
525        ) = if mixture_dim > 0 && sas_dim > 0 {
526            crate::bail_invalid_estim!(
527                "simultaneous mixture and SAS optimization is not supported"
528            );
529        } else if mixture_dim == 0 && sas_dim == 0 {
530            use crate::rho_optimizer::{OuterEvalOrder, OuterProblem};
531            use gam_problem::{DeclaredHessianForm, Derivative};
532
533            let analytic_outer_hessian_available = reml_state.analytic_outer_hessian_enabled();
534            // Standard-GAM dense problem dimensions configure both cost models
535            // the planner uses to decide whether ARC+Hessian or BFGS+gradient
536            // is faster end-to-end at large scale:
537            //
538            //   - per-inner-solve cost (n · p²) gates the single-Hessian-
539            //     assembly downgrade,
540            //   - per-outer-eval cost (k² · n · p²) gates the LAML-Hessian
541            //     pairwise-assembly downgrade — independent of (1) and
542            //     necessary because the LAML outer Hessian's k² pairwise
543            //     inner-derived terms can dominate per-outer work even when
544            //     each individual inner solve is moderate.
545            //
546            // Sparse designs short-circuit the policy because the n · p²
547            // model does not apply to sparse linear algebra; ARC stays in
548            // place and the sparse path's iteration-count advantage holds.
549            // Gaussian-identity REML has two well-conditioned features that
550            // the outer optimizer can exploit:
551            //
552            //   1. The REML cost is dominated by an O(n) likelihood constant,
553            //      so ∂/∂logλ inherits the same scale. A unit-magnitude
554            //      `abs` gradient floor (1e-6) becomes binding at large-scale n
555            //      even after the relative-from-seed component declared
556            //      convergence iters earlier. `with_objective_scale(n)`
557            //      lifts the floor to ~n·1e-9 so the loop terminates once
558            //      the relative reduction is met.
559            //
560            //   2. The Gaussian profile likelihood is quadratic-like in
561            //      log-λ near the optimum, so the analytic Hessian is
562            //      trustworthy and the cubic regularization can start
563            //      smaller than opt's default sigma=1.0. Setting
564            //      sigma=0.25 allows the first ARC step to be ~4× the
565            //      default — matching the 2–4 unit log-λ moves typical of
566            //      Gaussian-identity REML cold starts on tensor smooths.
567            //
568            // Other families (logit, log, survival) keep the conservative
569            // defaults because their objective is non-quadratic in log-λ
570            // and their gradient is not on an O(n) scale.
571            let gaussian_identity = matches!(cfg.link_function(), LinkFunction::Identity);
572            let n_obs = y_o.len();
573            let prefer_gradient_only = k >= REML_SECOND_ORDER_RHO_CAP;
574            let continuation_prewarm = k < REML_CONTINUATION_PREWARM_RHO_CAP;
575            if prefer_gradient_only {
576                log::info!(
577                    "[OUTER] rho_dim {k} reaches exact REML Hessian budget \
578                   ({REML_SECOND_ORDER_RHO_CAP}); routing analytic-gradient quasi-Newton"
579                );
580            }
581            if !continuation_prewarm {
582                log::info!(
583                    "[OUTER] rho_dim {k} reaches continuation-prewarm budget \
584                   ({REML_CONTINUATION_PREWARM_RHO_CAP}); starting optimizer directly from seeds"
585                );
586            }
587            let problem = OuterProblem::new(k)
588            .with_gradient(Derivative::Analytic)
589            .with_hessian(if analytic_outer_hessian_available {
590                DeclaredHessianForm::Either
591            } else {
592                DeclaredHessianForm::Unavailable
593            })
594            .with_prefer_gradient_only(prefer_gradient_only)
595            .with_continuation_prewarm(continuation_prewarm)
596            .with_barrier(
597                crate::estimate::reml::reml_outer_engine::BarrierConfig::from_constraints(
598                    fit_linear_constraints.as_ref(),
599                ),
600            )
601            .with_tolerance(reml_tol)
602            .with_max_iter(reml_max_iter)
603            .with_seed_config(reml_seed_config)
604            .with_screening_cap(Arc::clone(&reml_state.screening_max_inner_iterations))
605            .with_outer_inner_cap(reml_inner_progress_feedback(&reml_state))
606            // n-scaled absolute gradient floor for EVERY family (#1082).
607            //
608            // The REML/LAML profiled criterion is a sum over n rows
609            // (deviance / −2·loglik + the penalty/logdet terms), so it and its
610            // ∂/∂logλ gradient inherit an O(n) scale for Poisson, NB, binomial,
611            // Tweedie, beta — exactly as for Gaussian-identity. The previous gate
612            // restricted `with_objective_scale` to the Gaussian-identity arm on
613            // the (incorrect) premise that only that criterion is O(n). For a
614            // non-Gaussian tensor/cyclic/CI/badhealth fit at n≈1.5k–5k the fixed
615            // `abs = tol ≈ 1e-6` gradient floor is then orders of magnitude below
616            // the n-scaled gradient's converged residual: the relative-from-seed
617            // test declares convergence iters earlier, but the binding abs floor
618            // keeps the outer optimizer chasing sub-floor log-λ changes, paying a
619            // full k²·n·p² LAML-Hessian assembly per phantom iteration until it
620            // exhausts the iteration budget — the #1082 outer-loop "cycling"
621            // timeout. Lifting the floor to ~n·1e-9 (the same calibration the
622            // spatial/custom-family outer already uses via `with_problem_size`,
623            // #1053/#1066/#1069) lets the loop terminate as soon as the relative
624            // reduction is met, for every family, while the relative-to-cost
625            // component still owns the actual convergence decision. ARC σ and the
626            // initial trust radius stay Gaussian-gated: those exploit the
627            // Gaussian profile being quadratic-in-log-λ, which is family-specific.
628            .with_objective_scale(Some(n_obs as f64))
629            .with_problem_size(n_obs, x_o.ncols())
630            .with_arc_initial_regularization(if gaussian_identity { Some(0.25) } else { None })
631            .with_operator_initial_trust_radius(if gaussian_identity { Some(4.0) } else { None })
632            .with_rho_bound(crate::estimate::RHO_BOUND)
633            // Make the outer smoothing-parameter search invariant to the order
634            // the smooth terms / tensor margins were written (#1538/#1539). The
635            // structural keys label each ρ-coordinate by its placement-
636            // independent penalty content, so the optimizer canonicalizes the
637            // coordinate layout and resolves the flat double-penalty REML valley
638            // identically for `s(x)+s(z)` vs `s(z)+s(x)` and `te(x,z)` vs
639            // `te(z,x)`. `None` (coordinate count not matching ρ-dim) leaves the
640            // native-order path unchanged.
641            .with_rho_canonical_keys(canon_keys.clone());
642            let problem = if let Some(h) = heuristic_lambdas {
643                problem.with_heuristic_lambdas(h.to_vec())
644            } else {
645                problem
646            };
647            let problem = if let Some(h) = heuristic_lambdas.filter(|h| h.len() == k) {
648                problem.with_initial_rho(Array1::from_iter(h.iter().copied()))
649            } else {
650                problem
651            };
652
653            // Geometric-mean log prior-weight anchor `log g(w) = (1/n₊)·Σ log wᵢ`
654            // over the positive-weight rows. The pure-REML optimum for a *profiled*
655            // (Gaussian-identity) fit drifts by `ρ̂ → ρ̂ + log c` under a global
656            // prior-weight rescale `w → c·w` (`H = XᵀWX + λS`, so λ → c·λ keeps the
657            // penalised curvature proportional to the data curvature, β̂ / EDF /
658            // predictions fixed). The outer ρ-search seed and the relative-from-seed
659            // convergence test would otherwise be referenced to a weight-independent
660            // origin (0), so a heavily up-weighted fit starts `log c` further from
661            // its (shifted) optimum and the optimiser stops short — exactly the
662            // weight-scale non-invariance of λ̂ reported in issue #877. Anchoring the
663            // seed at `log g(w)` makes the search start the SAME relative distance
664            // from the optimum regardless of the weight magnitude.
665            //
666            // This is the SAME gated anchor the outer ρ-prior uses
667            // ([`RemlState::rho_weight_anchor`]): it is the geometric-mean
668            // log-weight for a profiled-dispersion family and *exactly 0* for a
669            // fixed-dispersion family (Poisson, binomial, …). For fixed dispersion
670            // `w = c` is exact `c`-fold replication: the two encodings share an
671            // identical LAML objective and optimum, so anchoring the seed by their
672            // (differing) per-row log-weight mean would seed the weighted encoding
673            // `log c` above its true optimum and the relative-convergence test would
674            // stop it short — over-smoothing vs replication (issue #893). With all
675            // weights 1 (or any fixed-dispersion family) the anchor is exactly 0, so
676            // those fits stay byte-identical.
677            let weight_log_geom_mean: f64 = reml_state.rho_weight_anchor();
678            let gaussian_risk = matches!(
679                reml_seed_config.risk_profile,
680                SeedRiskProfile::Gaussian | SeedRiskProfile::GaussianLocationScale
681            );
682            // The prepass evaluates the *actual* REML/LAML objective on a tiny,
683            // deterministic log-λ grid and only changes startup when that same
684            // criterion improves.  It is therefore part of initialization, not a
685            // compatibility fallback.  Gaussian fits used to skip this when the
686            // weights were on the unit scale, leaving single-start BFGS/ARC tied to
687            // the arbitrary λ=1 origin; flat or multi-penalty REML surfaces could
688            // then spend the finite outer budget getting into the right basin rather
689            // than resolving the optimum that controls EDF and truth recovery.  Run
690            // the same criterion-ranked startup for Gaussian as for GLM/survival,
691            // while retaining the weight-scale anchor from issue #877.
692            let run_gaussian_anchored_prepass = gaussian_risk && weight_log_geom_mean.abs() > 1e-12;
693            // A caller-supplied rho seed (`init_rhos`/`heuristic_lambdas`, now in
694            // rho-space) is an explicit warm-start installed via `with_initial_rho`
695            // above. It still ANCHORS the objective-grid prepass below rather than
696            // short-circuiting it: the grid is criterion-ranked and only adopts a
697            // candidate that STRICTLY lowers the true REML/LAML cost, so a healthy
698            // warm seed is returned unchanged (the grid never beats it → byte-
699            // identical behaviour). What the anchor-and-rank rescues is a warm seed
700            // TRAPPED in a shallow under-smoothing local basin: when the design's
701            // kernel collapses (e.g. the constant-curvature `curv()` smooth fitted
702            // at a trial κ on the +chart side — the geodesic-exponential kernel's
703            // off-diagonals → 1, so its global REML optimum is a LARGE λ that the
704            // local outer optimizer, warm-started from the previous-κ λ̂, slides away
705            // from into the spurious low-λ optimum). The shallow optimum's
706            // spuriously-low deviance made the κ outer objective monotone toward the
707            // +chart bound for any curved data (gam#1464 — hyperbolic truth recovered
708            // as spherical); anchoring the global grid at the warm seed lets the
709            // prepass jump into the correct high-λ basin so the per-κ REML cost
710            // matches the textbook profiled-REML and the curvature SIGN is
711            // identifiable. Same machinery as the gam#1266 double-penalty rescue.
712            let caller_seeded_rho = heuristic_lambdas.is_some_and(|h| h.len() == k);
713            // The grid prepass's lowest-cost sample, kept for the #1371
714            // release-and-rerank guard even when it is not adopted as the initial
715            // seed (i.e. the grid did not strictly move). It is a known-good lower
716            // bound on the achievable REML cost, scored with the SAME functional.
717            // Unconditionally assigned inside the prepass block below (before its
718            // first read by the #1371 guard), so it carries no dead initializer.
719            let release_rerank_seed: Option<Array1<f64>>;
720            // #1548: the well-determined Marra-Wood double-penalty null-space
721            // selection coordinates, recognised exactly as the #1266 shrink-out
722            // escape recognises them (the relaxed `Normal(0, sd=15)` degeneracy
723            // prior, gated by `n ≥ 2·p`). A SUPPORTED such coordinate has a deep
724            // low-λ_null "keep" basin AND a flat high-λ_null annihilation shelf; the
725            // objective-grid prepass below probes the keep corner for exactly these
726            // coordinates so it can seed the well-conditioned keep basin directly
727            // rather than the shelf — see the keep-saturation probe in
728            // `select_objective_seed_on_log_lambda_grid`.
729            let nullspace_seed_coords: Vec<usize> = if n_design_rows >= 2 * p {
730                match reml_state.effective_rho_prior().as_ref() {
731                    gam_problem::RhoPrior::Independent(per_coord) => (0..k)
732                        .filter(|&i| {
733                            per_coord
734                                .get(i)
735                                .is_some_and(gam_terms::smooth::is_nullspace_degeneracy_prior)
736                        })
737                        .collect(),
738                    _ => Vec::new(),
739                }
740            } else {
741                Vec::new()
742            };
743            let prepass_seed: Option<Array1<f64>> = {
744                let bnds = reml_seed_config.bounds;
745                let (lo, hi_seed) = if bnds.0 <= bnds.1 {
746                    bnds
747                } else {
748                    (bnds.1, bnds.0)
749                };
750                // The criterion-ranked prepass evaluates the TRUE REML/LAML cost, so
751                // it is safe — and necessary — to let it explore the full
752                // over-smoothing range the outer optimizer itself can reach
753                // (`RHO_BOUND`), not just the narrower default seed-placement band.
754                // A double-penalty (null-space-shrinkage) smooth on data living in
755                // one penalty's null space has its global REML optimum at a LARGE
756                // wiggliness λ (range block fully smoothed), often beyond the seed
757                // band; the cost surface also has a shallower local optimum at a
758                // moderate λ that leaves wiggle under-penalized (EDF inflated,
759                // gam#1266). If the prepass cannot seed past that local optimum, the
760                // outer EFS — which only takes cost-improving steps — relaxes back
761                // into it. The collapsing-kernel spatial smooth (gam#1464) has the
762                // same shape: the high-λ basin sits beyond a shallow low-λ trap.
763                // Widening only the upper (over-smoothing) bound lets the prepass
764                // place the seed in the correct high-λ basin; the lower
765                // (under-smoothing) bound stays at the default so we never seed an
766                // overfit origin. The seed is still only adopted when it strictly
767                // lowers the REML cost, so well-balanced and single-penalty fits are
768                // unaffected.
769                let hi = hi_seed.max(crate::estimate::RHO_BOUND);
770                // risk_shift is the default seed bias when no caller warm-start is given;
771                // it is NOT applied on top of a caller-supplied rho seed.
772                let risk_shift: f64 = match reml_seed_config.risk_profile {
773                    SeedRiskProfile::Gaussian | SeedRiskProfile::GaussianLocationScale => 0.0,
774                    SeedRiskProfile::GeneralizedLinear => 1.0,
775                    SeedRiskProfile::Survival => 2.0,
776                };
777                // Anchor the grid at the caller-supplied `heuristic_lambdas` when one
778                // is present (it is already in rho-space, used as-is) — the grid then
779                // searches relative to the warm start and keeps it unless a candidate
780                // is strictly better. Otherwise anchor the default risk-shift origin
781                // to the weight scale (issue #877).
782                let base = if let Some(h) = heuristic_lambdas.as_ref().filter(|h| h.len() == k) {
783                    Array1::from_iter(h.iter().map(|&v| v.clamp(lo, hi)))
784                } else {
785                    Array1::from_elem(k, (risk_shift + weight_log_geom_mean).clamp(lo, hi))
786                };
787                // Run the objective-grid seed search in CANONICAL coordinate order
788                // (#1538/#1539) so its greedy per-axis / pairwise-saturation
789                // refinement — which is order-dependent in native layout — explores
790                // the SAME axes for every term order. The grid builds canonical
791                // candidates; the eval closure maps each back to native order before
792                // scoring with the true `compute_cost`, and the refined seed is
793                // mapped native again for `with_initial_rho`. Without a permutation
794                // this is the identity, so the native-order path is byte-for-byte
795                // unchanged.
796                let refined = if let Some(perm) = canon_perm.as_ref() {
797                    let to_native = |canon: &Array1<f64>| -> Array1<f64> {
798                        let mut out = Array1::zeros(canon.len());
799                        for (c, &i) in perm.iter().enumerate() {
800                            out[i] = canon[c];
801                        }
802                        out
803                    };
804                    let base_canon = Array1::from_iter(perm.iter().map(|&i| base[i]));
805                    // Canonical slot `c` carries native coordinate `perm[c]`; a
806                    // null-space coordinate must be probed in whichever slot it
807                    // occupies in the canonical layout the grid refines.
808                    let nullspace_canon: Vec<usize> = (0..k)
809                        .filter(|&c| nullspace_seed_coords.contains(&perm[c]))
810                        .collect();
811                    let refined_canon = crate::seeding::select_objective_seed_on_log_lambda_grid(
812                        &base_canon,
813                        (lo, hi),
814                        k,
815                        &nullspace_canon,
816                        |canon_rho| {
817                            let native = to_native(canon_rho);
818                            reml_state
819                                .compute_cost(&native)
820                                .ok()
821                                .filter(|c| c.is_finite())
822                        },
823                    );
824                    to_native(&refined_canon)
825                } else {
826                    crate::seeding::select_objective_seed_on_log_lambda_grid(
827                        &base,
828                        (lo, hi),
829                        k,
830                        &nullspace_seed_coords,
831                        |rho| reml_state.compute_cost(rho).ok().filter(|c| c.is_finite()),
832                    )
833                };
834                // Emit the seed when the grid moved it, or — on the Gaussian
835                // weight-anchored path — whenever the anchored `base` is itself
836                // offset from the unanchored origin (so the shifted optimum is
837                // actually seeded even if the coarse grid leaves `base` unchanged).
838                // Record the grid's best sample for the release-and-rerank guard
839                // unconditionally — whether or not it is strong enough to override
840                // the optimizer's own cold start, it is still a scored lower bound
841                // the certified optimum must not be worse than (#1371).
842                release_rerank_seed = Some(refined.clone());
843                let grid_moved = refined
844                    .iter()
845                    .zip(base.iter())
846                    .any(|(&a, &b)| (a - b).abs() > 1e-12);
847                // For a caller-seeded fit, adopt the grid result only when it
848                // STRICTLY moved the warm seed (i.e. found a strictly-cheaper basin);
849                // an unmoved grid leaves the warm start exactly as installed above, so
850                // healthy warm-started fits stay byte-identical. The Gaussian
851                // weight-anchored emit only applies on the non-caller-seeded origin.
852                if grid_moved || (run_gaussian_anchored_prepass && !caller_seeded_rho) {
853                    log::info!(
854                        "[OUTER] standard REML objective-grid selected seed: {:?} -> {:?}",
855                        base.as_slice().unwrap_or(&[]),
856                        refined.as_slice().unwrap_or(&[])
857                    );
858                    Some(refined)
859                } else {
860                    None
861                }
862            };
863            let problem = if let Some(seed) = prepass_seed {
864                problem.with_initial_rho(seed)
865            } else {
866                problem
867            };
868            // #1074 DIAGNOSTIC (log-gated, no behavior change unless the crate
869            // logger is installed): sweep each outer log-λ coordinate over a grid
870            // while holding the others at the baseline, logging the REML cost. Used
871            // to decide whether the spatial range railing is an interior optimum the
872            // optimizer misses (optimizer bug) or a genuine criterion preference for
873            // λ→∞ (criterion). Placed BEFORE the objective takes its `&mut
874            // reml_state` borrow so the immutable `compute_cost` reads are valid.
875            // Emitted at warn level so the default-installed crate logger (Info)
876            // prints it without a level change (the ban-scanner forbids direct
877            // stderr printing and process-env reads).
878            if log::log_enabled!(log::Level::Warn) {
879                let grid = [
880                    -5.0_f64, -2.0, 0.0, 2.0, 5.0, 8.0, 10.0, 12.0, 16.0, 20.0, 25.0, 30.0,
881                ];
882                let mut baselines: Vec<(&str, Array1<f64>)> =
883                    vec![("zeros", Array1::<f64>::zeros(k))];
884                if k == 4 {
885                    baselines.push(("conv", Array1::from(vec![9.0_f64, 30.0, 12.0, 30.0])));
886                }
887                if k == 2 {
888                    baselines.push(("conv6", Array1::from(vec![6.0_f64, 6.0])));
889                }
890                for (label, baseline) in &baselines {
891                    log::warn!("[#1074-sweep] k={k} baseline={label}={baseline:?}");
892                    for coord in 0..k {
893                        let mut line = format!("[#1074-sweep:{label}] coord={coord}:");
894                        for &rho in &grid {
895                            let mut p = baseline.clone();
896                            p[coord] = rho;
897                            let cg = reml_state.compute_cost_and_gradient(&p).ok();
898                            let cell = match cg {
899                                Some((c, g)) => {
900                                    format!(
901                                        "{c:.4}(g{}={:.3e})",
902                                        coord,
903                                        g.get(coord).copied().unwrap_or(f64::NAN)
904                                    )
905                                }
906                                None => "ERR".to_string(),
907                            };
908                            line.push_str(&format!(" {rho:.0}->{cell}"));
909                        }
910                        log::warn!("{line}");
911                    }
912                }
913            }
914
915            // Attach the outer-loop cache session. The session shares its
916            // realized-fit-context key with the inner beta record (different
917            // payload namespace), so a SIGKILL mid-outer-iter leaves both the
918            // last accepted β (inner record) and the best rho seen so far
919            // (outer iterate) on disk for the next run.
920            let problem = match reml_state.outer_cache_session() {
921                Some(session) => problem.with_cache_session(session),
922                None => problem,
923            };
924
925            let obj = problem.build_objective_with_screening_proxy(
926                &mut reml_state,
927                |state: &mut &mut crate::estimate::reml::RemlState<'_>,
928                 rho: &Array1<f64>| { state.compute_cost(rho) },
929                |state: &mut &mut crate::estimate::reml::RemlState<'_>,
930                 rho: &Array1<f64>| {
931                    outer_eval_idx.fetch_add(1, Ordering::Relaxed);
932                    state.compute_outer_eval_with_order(
933                        rho,
934                        if analytic_outer_hessian_available {
935                            OuterEvalOrder::ValueGradientHessian
936                        } else {
937                            OuterEvalOrder::ValueAndGradient
938                        },
939                    )
940                },
941                |state: &mut &mut crate::estimate::reml::RemlState<'_>,
942                 rho: &Array1<f64>,
943                 order: OuterEvalOrder| {
944                    outer_eval_idx.fetch_add(1, Ordering::Relaxed);
945                    state.compute_outer_eval_with_order(rho, order)
946                },
947                Some(
948                    |state: &mut &mut crate::estimate::reml::RemlState<'_>| {
949                        state.reset_outer_seed_state()
950                    },
951                ),
952                Some(
953                    |state: &mut &mut crate::estimate::reml::RemlState<'_>,
954                     rho: &Array1<f64>| { state.compute_efs_steps(rho) },
955                ),
956                |state: &mut &mut crate::estimate::reml::RemlState<'_>,
957                 rho: &Array1<f64>| { state.compute_screening_proxy(rho) },
958            );
959            // Standard REML's eval closure publishes
960            // `inner_beta_hint = state.current_original_basis_beta()` on
961            // every accepted eval. The continuation pre-warm carries that
962            // hint forward and calls `seed_inner_state(beta)` before the
963            // next eval — see src/solver/reml/continuation.rs:209-212,
964            // 434-438. Without a hook here, `ClosureObjective::seed_inner_state`
965            // (src/solver/rho_optimizer.rs:2097-2107) rejected any
966            // non-empty β fatally, dropping every seed before the inner
967            // solver started (issue #236). Wire the symmetric consumer:
968            // when the pre-warm forwards the cached β, install it into the
969            // same `warm_start_beta` slot the publisher reads from.
970            let mut obj = obj.with_seed_inner_state(with_reml_beta_seed_hook());
971
972            let mut strategy_result = problem.run(&mut obj, "standard REML")?;
973            drop(obj);
974            // #1371 release-and-rerank guard. The continuation oversmoothing
975            // warm-start can deliver the inner β on the high-λ null-space
976            // "annihilation" shelf of a double-penalty smooth: there the
977            // null-space coefficients are already shrunk to ~0, so the deviance
978            // ρ-gradient vanishes (∂dev/∂ρ_null → 0) AND the Occam terms
979            // (½ tr(H⁻¹ ∂H/∂ρ) − ½ λ tr(S⁺ S_k)) cancel, leaving the analytic
980            // outer gradient ≈ 0. ARC then certifies that point as a stationary
981            // optimum even though its REML cost is FAR ABOVE a point the seed
982            // prepass already evaluated — driving a genuinely-supported null-space
983            // direction (a real linear trend, gam#1371) to EDF → 0. The seed
984            // prepass's grid-refined seed is a known-good lower bound on the cost
985            // (it was scored with the SAME `compute_cost`), so if the certified
986            // optimum is strictly worse than it, re-rank to the seed: re-running
987            // the inner solve there installs the correct β̂. This cannot regress a
988            // fit whose optimum genuinely IS the high-λ corner (gam#1266: an
989            // unsupported term shrinking out) — there the corner is the
990            // lowest-cost point, no cheaper seed exists, and the guard is a no-op.
991            if let Some(seed) = release_rerank_seed.as_ref() {
992                // The certified cost is the optimizer's OWN authoritative
993                // `final_value`, NOT a fresh `compute_cost(strategy_result.rho)`
994                // re-evaluation (load-bearing, #1426/#1477). The REML/LAML objective
995                // for a non-Gaussian family is NOT a pure function of ρ: it carries a
996                // profiled dispersion / nuisance that is established by the inner
997                // solve at the operating ρ, and `compute_cost` warm-starts the inner
998                // PIRLS from whatever β̂/φ the previous eval left behind. Probing the
999                // under-penalized prepass seed FIRST (necessary so the no-op path can
1000                // leave β̂ at the seed for a clean re-install) pollutes that nuisance
1001                // state, so a subsequent re-eval of the cleanly-converged ρ comes back
1002                // a few REML units ABOVE its true certified cost — e.g. a Gamma/log
1003                // optimum certified at 829.857 re-evaluates at 834.90 right after the
1004                // seed probe, which is then (wrongly) above the seed's own 833.47 and
1005                // the guard "escapes" to the under-penalized seed, shipping a
1006                // near-full-basis overfit (EDF ≈ k, falsely tagged converged) that the
1007                // seed loop's keep-best had already rejected (#1426 silent overfit;
1008                // #1477 Tweedie boundary/EDF blow-up). `final_value` was scored at the
1009                // converged ρ with ITS own inner solve, so it is immune to that probe
1010                // pollution and is the honest cost to compare the seed against.
1011                let cost_converged = strategy_result.final_value;
1012                // The seed probe is a non-fatal measurement; a seed that fails to
1013                // evaluate simply skips the comparison. It leaves β̂ at the seed, so
1014                // the no-op branch below relies on the unified β̂ re-install after the
1015                // guard to restore it at `strategy_result.rho`.
1016                //
1017                // Probe the seed WITH its outer gradient (not cost alone): the grid
1018                // prepass scored the seed by `compute_cost`, which runs the inner
1019                // P-IRLS — at an under-penalized (λ→0) ρ the inner solve hits its
1020                // iteration cap and reports a spuriously LOW cost (an invalid REML
1021                // value the line search could not improve) while the analytic outer
1022                // gradient still points strongly toward more penalization. The #1371
1023                // false high-λ shelf this guard exists to escape is, by contrast, a
1024                // GENUINE cheaper optimum: its seed is stationary. Even with the
1025                // pollution-free `final_value` comparison above, a stuck stall can
1026                // still under-cut it on raw cost, so only a seed whose cost is
1027                // trustworthy (small residual gradient) may override the certified ρ.
1028                let seed_eval = reml_state.compute_cost_and_gradient(seed).ok();
1029                // Strict relative improvement so a numerically-equal seed (the common
1030                // case where the optimizer reached the seed's basin) is left untouched
1031                // and the fit stays byte-identical.
1032                let floor = 1e-6 * (1.0 + cost_converged.abs());
1033                if let Some((cost_seed, grad_seed)) = seed_eval.filter(|(c, _)| c.is_finite())
1034                    && cost_converged.is_finite()
1035                    && cost_seed < cost_converged - floor
1036                {
1037                    // Bound-projected residual gradient at the seed (same criterion
1038                    // `nonconverged_cost_is_trustworthy` / the flat-valley stall guard
1039                    // use): a component pinned at a bound by a gradient pushing past
1040                    // it is feasible-stationary and drops out of the norm.
1041                    let (blo, bhi) = {
1042                        let (a, b) = reml_seed_config.bounds;
1043                        let (lo, hi) = if a <= b { (a, b) } else { (b, a) };
1044                        (lo, hi.max(crate::estimate::RHO_BOUND))
1045                    };
1046                    let seed_grad_norm = {
1047                        let mut sumsq = 0.0;
1048                        for (i, &g) in grad_seed.iter().enumerate() {
1049                            let s = seed.get(i).copied().unwrap_or(0.0);
1050                            let pinned_lo = s <= blo + 1e-9 && g > 0.0;
1051                            let pinned_hi = s >= bhi - 1e-9 && g < 0.0;
1052                            if !(pinned_lo || pinned_hi) {
1053                                sumsq += g * g;
1054                            }
1055                        }
1056                        sumsq.sqrt()
1057                    };
1058                    let seed_cost_trustworthy = seed_grad_norm.is_finite()
1059                        && seed_grad_norm
1060                            <= crate::rho_optimizer::FLAT_VALLEY_STALL_GRAD_CEILING;
1061                    if seed_cost_trustworthy {
1062                        log::info!(
1063                            "[OUTER] #1371 release-and-rerank: certified ρ cost {cost_converged:.6e} \
1064                         exceeds the prepass seed cost {cost_seed:.6e} (seed |g|={seed_grad_norm:.3e} \
1065                         ≤ ceiling); adopting the seed (false high-λ stationary shelf escaped)"
1066                        );
1067                        strategy_result.rho = seed.clone();
1068                        strategy_result.converged = true;
1069                    } else {
1070                        // #1426 leak: the cheaper seed is a stuck under-penalized
1071                        // (λ→0) stall, not a genuine optimum — its low cost is an
1072                        // inner-cap artifact. Adopting it would ship the near-full-
1073                        // basis overfit (EDF ≈ k) and, worse, certify it converged.
1074                        // Keep the honest certified ρ. β̂ is restored by the unified
1075                        // re-install after the guard.
1076                        log::info!(
1077                            "[OUTER] #1371 release-and-rerank: prepass seed cost {cost_seed:.6e} is \
1078                         cheaper than certified ρ {cost_converged:.6e} but UNTRUSTWORTHY \
1079                         (seed |g|={seed_grad_norm:.3e} > ceiling — stuck under-penalized stall, \
1080                         #1426); keeping the certified ρ"
1081                        );
1082                    }
1083                }
1084                // Re-install β̂ at the (possibly newly-adopted) reported ρ so the
1085                // cached inner state matches `strategy_result.rho` for the downstream
1086                // cap-guard / final assembly — whether the guard fired (β̂ → seed) or
1087                // was a no-op (β̂ → the certified ρ, undoing the seed probe).
1088                reml_state.compute_cost(&strategy_result.rho)?;
1089            }
1090            // #1074 UPPER-BOUND INWARD-DESCENT ESCAPE. The outer cost-stall /
1091            // convergence check projects out a coordinate sitting on the ρ upper
1092            // bound, so a coordinate that was driven to the over-smoothing rail can
1093            // be certified "converged" even when the REML criterion is strictly
1094            // LOWER at an interior ρ (a feasible inward descent the projection
1095            // masks). On `s(long,lat,bs="tp") + s(depth)` the spatial/depth
1096            // NULL-SPACE (affine-trend) coordinates rail to ρ=30 while
1097            // `compute_cost` is ~23/~5 units lower at ρ≈2, annihilating a SUPPORTED
1098            // spatial trend (#1074). This guard runs a bounded, keep-best
1099            // coordinate-descent polish on EXACTLY the coordinates pinned at the
1100            // upper bound: for each, it line-searches `compute_cost` over a coarse
1101            // inward grid and adopts the strictly-best ρ. It uses the same
1102            // authoritative `compute_cost` the optimizer minimizes, so it can only
1103            // LOWER the certified cost — it never raises it and is a no-op when the
1104            // rail genuinely is the optimum (an unsupported term shrinking out,
1105            // #1266/#1271: no interior point is cheaper, so nothing is adopted).
1106            {
1107                let rho_upper = crate::estimate::RHO_BOUND;
1108                // Coordinates eligible for the inward (less-smoothing) descent. Two
1109                // kinds qualify:
1110                //   (1) any coordinate pinned at the ρ upper rail — the original
1111                //       #1074 case: the outer convergence check projects out a
1112                //       rail-pinned coordinate, masking a feasible interior descent;
1113                //   (2) the well-determined double-penalty NULL-SPACE selection
1114                //       coordinates (`is_nullspace_degeneracy_prior`). These sit on a
1115                //       near-FLAT REML ridge in λ_null, so the outer optimizer can
1116                //       certify convergence at ANY high-but-not-railed ρ_null
1117                //       depending on its (floating-point) iterate path. Reflecting the
1118                //       covariate `x → −x` reverses the basis column order and flips
1119                //       that landing shoulder: a SUPPORTED affine trend is kept at
1120                //       ρ_null ≈ 0 in one orientation but over-penalized to e.g.
1121                //       ρ_null ≈ 25 in the mirror (#1548), even though neither is at
1122                //       the exact rail. Descending these to the cheaper interior
1123                //       optimum lands BOTH orientations on the same shoulder.
1124                // The descent below probes ONLY strictly-lower ρ, so it never
1125                // over-smooths (raising λ_null is the #1266 escape's job, with its
1126                // EDF parsimony guard against the #1476 concurvity transfer). It is
1127                // keep-best + cold-confirmed against the authoritative penalized cost,
1128                // so it is an exact no-op wherever the current ρ already is the
1129                // optimum — e.g. an unsupported trend correctly shrunk out at the rail
1130                // (#1266/#1271), where no interior point is cheaper.
1131                let mut descent_coords: Vec<usize> = (0..strategy_result.rho.len())
1132                    .filter(|&i| strategy_result.rho[i] >= rho_upper - 1e-9)
1133                    .collect();
1134                if n_design_rows >= 2 * p
1135                    && let gam_problem::RhoPrior::Independent(per_coord) =
1136                        reml_state.effective_rho_prior().as_ref()
1137                {
1138                    for i in 0..strategy_result.rho.len() {
1139                        if strategy_result.rho[i] < rho_upper - 1e-9
1140                            && per_coord
1141                                .get(i)
1142                                .is_some_and(gam_terms::smooth::is_nullspace_degeneracy_prior)
1143                        {
1144                            descent_coords.push(i);
1145                        }
1146                    }
1147                }
1148                // Baseline to beat = the optimizer's OWN authoritative converged
1149                // cost (`final_value`), which was scored at the converged ρ with its
1150                // own inner solve and is immune to warm-start pollution from the
1151                // probes below (the #1371 lesson). A probe only wins if it is
1152                // strictly cheaper than this honest cost.
1153                let base_cost = strategy_result.final_value;
1154                if !descent_coords.is_empty() && base_cost.is_finite() {
1155                    // Inward probe grid (descending from the rail). Bounded and
1156                    // cheap: at most 2 · |railed| · 8 inner solves, and only when a
1157                    // coord is actually pinned at the upper rail. Two coordinate-
1158                    // descent passes pick up cross-coordinate coupling between the
1159                    // railed axes.
1160                    const INWARD_GRID: [f64; 8] = [25.0, 20.0, 15.0, 10.0, 5.0, 2.0, 0.0, -2.0];
1161                    let mut best_rho = strategy_result.rho.clone();
1162                    let mut best_cost = base_cost;
1163                    let mut improved = false;
1164                    for _pass in 0..2 {
1165                        let mut pass_improved = false;
1166                        for &coord in &descent_coords {
1167                            let mut local_best = best_rho.clone();
1168                            let mut local_cost = best_cost;
1169                            for &cand in &INWARD_GRID {
1170                                // Inward escape only ever DESCENDS (less smoothing):
1171                                // skip any grid point at or above this coordinate's
1172                                // current ρ. Over-smoothing a null-space coordinate is
1173                                // the #1266 escape's job (it carries the EDF parsimony
1174                                // guard that prevents the #1476 concurvity transfer);
1175                                // this guard must never raise λ without it.
1176                                if cand >= best_rho[coord] - 1e-9 {
1177                                    continue;
1178                                }
1179                                let mut probe = best_rho.clone();
1180                                probe[coord] = cand;
1181                                if let Ok(c) = reml_state.compute_cost(&probe)
1182                                    && c.is_finite()
1183                                    && c < local_cost - 1e-6 * (1.0 + local_cost.abs())
1184                                {
1185                                    local_cost = c;
1186                                    local_best = probe;
1187                                }
1188                            }
1189                            if local_cost < best_cost - 1e-6 * (1.0 + best_cost.abs()) {
1190                                best_rho = local_best;
1191                                best_cost = local_cost;
1192                                improved = true;
1193                                pass_improved = true;
1194                            }
1195                        }
1196                        if !pass_improved {
1197                            break;
1198                        }
1199                    }
1200                    // CONTINUOUS REFINEMENT of each descended coordinate. The coarse
1201                    // INWARD_GRID snaps λ to a grid node (e.g. ρ_null = 0), but in the
1202                    // OTHER covariate orientation the outer optimizer reports the
1203                    // continuous interior minimizer (e.g. ρ_null = −0.37). Leaving one
1204                    // orientation on the grid node while the other keeps the continuous
1205                    // optimum leaves a small residual reflection asymmetry (#1548:
1206                    // ~1.7e-3 mirror drift survives the grid descent alone). Golden-
1207                    // section the SAME authoritative penalized cost on each moved
1208                    // coordinate so both orientations converge to the identical
1209                    // continuous minimum. It can only lower the cost from the grid node
1210                    // (the bracket straddles it), and the cold confirmation below still
1211                    // gates adoption, so this never raises the certified cost.
1212                    if improved {
1213                        const GS_R: f64 = 0.618_033_988_749_894_8; // (√5 − 1) / 2
1214                        for coord in descent_coords.clone() {
1215                            if (best_rho[coord] - strategy_result.rho[coord]).abs() <= 1e-9 {
1216                                continue; // coordinate did not descend
1217                            }
1218                            // Bracket straddling the adopted grid node, never re-entering
1219                            // the over-smoothing region above the coordinate's start ρ.
1220                            let node = best_rho[coord];
1221                            let mut a = node - 3.0;
1222                            let mut b = (node + 3.0).min(strategy_result.rho[coord]);
1223                            if b <= a + 1e-6 {
1224                                continue;
1225                            }
1226                            let cost_at =
1227                                |st: &mut RemlState, base: &Array1<f64>, x: f64| -> Option<f64> {
1228                                    let mut p = base.clone();
1229                                    p[coord] = x;
1230                                    st.compute_cost(&p).ok().filter(|c| c.is_finite())
1231                                };
1232                            let mut c = b - GS_R * (b - a);
1233                            let mut d = a + GS_R * (b - a);
1234                            let mut fc = cost_at(&mut reml_state, &best_rho, c);
1235                            let mut fd = cost_at(&mut reml_state, &best_rho, d);
1236                            let mut refine_ok = fc.is_some() && fd.is_some();
1237                            for _ in 0..40 {
1238                                if (b - a).abs() < 1e-4 {
1239                                    break;
1240                                }
1241                                match (fc, fd) {
1242                                    (Some(vc), Some(vd)) if vc <= vd => {
1243                                        b = d;
1244                                        d = c;
1245                                        fd = fc;
1246                                        c = b - GS_R * (b - a);
1247                                        fc = cost_at(&mut reml_state, &best_rho, c);
1248                                    }
1249                                    (Some(_), Some(_)) => {
1250                                        a = c;
1251                                        c = d;
1252                                        fc = fd;
1253                                        d = a + GS_R * (b - a);
1254                                        fd = cost_at(&mut reml_state, &best_rho, d);
1255                                    }
1256                                    _ => {
1257                                        refine_ok = false;
1258                                        break;
1259                                    }
1260                                }
1261                            }
1262                            if refine_ok {
1263                                let xm = 0.5 * (a + b);
1264                                if let Some(fm) = cost_at(&mut reml_state, &best_rho, xm)
1265                                    && fm < best_cost
1266                                {
1267                                    best_rho[coord] = xm;
1268                                    best_cost = fm;
1269                                }
1270                            }
1271                        }
1272                    }
1273                    if improved {
1274                        // COLD CONFIRMATION (guards against adopting a warm-start /
1275                        // inner-cap artifact, the #1426/#1371 trap). The grid probes
1276                        // ran warm-started off each other; a λ→0-ish interior point
1277                        // can report a spuriously low cost from a capped inner solve.
1278                        // Clear the inner cache and re-score the candidate cold; only
1279                        // adopt if it STILL strictly beats the authoritative
1280                        // `final_value`.
1281                        reml_state.reset_outer_seed_state();
1282                        let cold = reml_state.compute_cost(&best_rho);
1283                        let cold_ok = matches!(cold, Ok(c)
1284                        if c.is_finite() && c < base_cost - 1e-6 * (1.0 + base_cost.abs()));
1285                        if cold_ok {
1286                            let cold_cost = cold.unwrap_or(best_cost);
1287                            log::info!(
1288                                "[OUTER] #1074/#1548 upper-bound escape: certified ρ cost \
1289                             {base_cost:.6e} lowered to {cold_cost:.6e} (cold-confirmed) by \
1290                             descending {} over-smoothed coord(s) inward; adopting the cheaper \
1291                             interior ρ",
1292                                descent_coords.len()
1293                            );
1294                            strategy_result.rho = best_rho;
1295                            strategy_result.final_value = cold_cost;
1296                            // β̂ already installed at `best_rho` by the cold eval above.
1297                        } else {
1298                            // The improvement did not survive a cold re-score — it was
1299                            // a warm-start artifact. Keep the certified ρ and restore
1300                            // its inner state for downstream assembly.
1301                            reml_state.reset_outer_seed_state();
1302                            reml_state.compute_cost(&strategy_result.rho)?;
1303                        }
1304                    }
1305                }
1306            }
1307            // #1266 NULL-SPACE SHRINK-OUT ESCAPE (pure-REML; the OUTWARD-direction
1308            // dual of the #1074 inward escape above).
1309            //
1310            // A default double-penalty smooth (mgcv `select = TRUE`) carries a
1311            // `DoublePenaltyNullspace` shrinkage ridge on the term's penalty null
1312            // space ({1, x} for a 1-D bend) whose only job is SELECTION: drive its
1313            // λ_null UP to shrink an UNSUPPORTED term's constant+linear component out
1314            // (EDF → 0). On a well-determined Gaussian fit the relaxed ρ-prior places
1315            // a WIDE, symmetric `Normal(0, sd=15)` on that coordinate — NOT as a
1316            // selection criterion but purely as a degeneracy-breaker: the #1476
1317            // concurvity flat-ridge needs strictly-positive outer curvature to
1318            // certify an interior allocation. That symmetric prior's `ρ/sd²` gradient
1319            // also OPPOSES the (genuinely shallow) REML shrink-out tail, so the outer
1320            // optimizer certifies a stationary point at a MODERATE λ_null
1321            // (ρ_null ≈ 3.5, EDF ≈ 1.6) instead of following pure REML to the
1322            // shrink-out corner — the residual #1266 "Half B" contract violation. The
1323            // prior cannot be made one-sided: its high-ρ curvature is exactly what
1324            // stops a SUPPORTED concurvity null space from railing out (#1476), so a
1325            // data-INDEPENDENT prior cannot separate "shrink the unsupported term"
1326            // (#1266) from "keep the supported one" (#1476/#1371) — they overlap in ρ.
1327            //
1328            // The data-DEPENDENT discriminator is pure data-REML PLUS a parsimony
1329            // check. For each well-determined null-space selection coordinate,
1330            // line-search the OVER-SMOOTHING (high-ρ) direction on the PURE REML cost
1331            // (`compute_cost − configured_ρ_prior`; the prior is a conditioning
1332            // device, not a selection criterion), then adopt the strictly-best
1333            // COLD-confirmed point ONLY if it also does not increase the model's total
1334            // EDF:
1335            //   * UNSUPPORTED, uncorrelated null space (#1266 `s(z)`): pure REML
1336            //     descends toward shrink-out AND total EDF drops (z carries no signal,
1337            //     so nothing absorbs it) → the escape fires, EDF → 0.
1338            //   * SUPPORTED null space (#1371 genuine slope): pure REML strictly RISES
1339            //     under over-smoothing (killing a real linear trend dumps its variance
1340            //     into σ̂²) → no strict improvement → exact no-op.
1341            //   * CONCURVITY null space (#1476 `s(x1)+s(x2)`, corr ≈ 0.9): pure REML
1342            //     *marginally* prefers over-smoothing one coordinate because the inner
1343            //     β re-solve lets the CORRELATED partner absorb the shared signal — the
1344            //     "signal transfer" the degeneracy prior exists to forbid. That
1345            //     transfer keeps the deviance flat but INFLATES total EDF (the partner
1346            //     spends extra basis), so the EDF-non-increase guard vetoes it →
1347            //     no-op, the interior allocation is kept. (Pure REML alone cannot see
1348            //     this: the concurvity ridge is flat, so the transfer reads as a tiny
1349            //     improvement; the parsimony guard is what distinguishes a genuine
1350            //     simplification from a lateral reallocation.)
1351            //
1352            // Unlike #1074 (where the OPTIMIZER's bound projection masks the descent),
1353            // here it is the PRIOR that masks it, so the search runs on the pure
1354            // (prior-stripped) criterion. SCOPE: eligible coordinates are exactly the
1355            // well-determined relaxed null-space degeneracy coordinates
1356            // (`is_nullspace_degeneracy_prior`, gated by `n ≥ 2·p`). This deliberately
1357            // EXCLUDES the under-determined regime (`n < 2·p`, #1392 wine `p > n`),
1358            // where the null-space prior is the AGGRESSIVE PC select-out — a
1359            // deliberate, load-bearing selection push onto a genuinely-flat REML
1360            // score that stripping would undo.
1361            {
1362                let well_determined = n_design_rows >= 2 * p;
1363                let select_coords: Vec<usize> = if well_determined {
1364                    match reml_state.effective_rho_prior().as_ref() {
1365                        gam_problem::RhoPrior::Independent(per_coord) => {
1366                            (0..strategy_result.rho.len())
1367                                .filter(|&i| {
1368                                    per_coord.get(i).is_some_and(
1369                                        gam_terms::smooth::is_nullspace_degeneracy_prior,
1370                                    )
1371                                })
1372                                .collect()
1373                        }
1374                        _ => Vec::new(),
1375                    }
1376                } else {
1377                    Vec::new()
1378                };
1379                // Authoritative pure-REML baseline at the converged ρ: the optimizer's
1380                // own `final_value` (immune to warm-start pollution, the #1371 lesson)
1381                // minus the configured ρ-prior + soft λ→0 guard it carried. A probe
1382                // wins only if it strictly beats THIS pure cost.
1383                let conv_prior = reml_state
1384                    .configured_rho_prior_atom(&strategy_result.rho)
1385                    .cost()
1386                    + reml_state
1387                        .soft_rho_guard_prior_atom(&strategy_result.rho)
1388                        .cost();
1389                let base_pure = strategy_result.final_value - conv_prior;
1390                if !select_coords.is_empty() && base_pure.is_finite() && conv_prior.is_finite() {
1391                    // Converged-point total inner EDF, for the PARSIMONY guard below.
1392                    // The inner P-IRLS solve at the converged ρ is cached, so this is
1393                    // free. A genuine #1266 shrink-out (an UNSUPPORTED, uncorrelated
1394                    // term selected out) strictly LOWERS the model's total EDF; a
1395                    // concurvity TRANSFER (#1476: one null-space shrinks but its
1396                    // correlated partner absorbs the signal via the inner β re-solve)
1397                    // INFLATES it. Pure REML alone marginally prefers the transfer on
1398                    // a flat concurvity ridge — exactly the allocation the degeneracy
1399                    // prior exists to forbid — so the escape must additionally refuse
1400                    // any adoption that does not reduce total EDF.
1401                    let edf_conv = reml_state
1402                        .obtain_eval_bundle(&strategy_result.rho)
1403                        .ok()
1404                        .map(|b| b.pirls_result.edf);
1405                    // Pure data-REML at ρ: penalized `compute_cost` minus the configured
1406                    // ρ-prior and the soft λ→0 guard (both `O(K)` functions of ρ alone).
1407                    // Subtracting them recovers the mgcv-parity criterion selection
1408                    // must follow; the prior bias on λ_null is removed exactly.
1409                    let pure_reml = |rho: &Array1<f64>| -> Option<f64> {
1410                        let c = reml_state.compute_cost(rho).ok()?;
1411                        if !c.is_finite() {
1412                            return None;
1413                        }
1414                        let prior = reml_state.configured_rho_prior_atom(rho).cost()
1415                            + reml_state.soft_rho_guard_prior_atom(rho).cost();
1416                        if !prior.is_finite() {
1417                            return None;
1418                        }
1419                        Some(c - prior)
1420                    };
1421                    // Ascending over-smoothing grid in ABSOLUTE ρ (toward the
1422                    // shrink-out rail at `RHO_BOUND`); only values strictly above a
1423                    // coordinate's current ρ are over-smoothing candidates. Bounded:
1424                    // at most 2 · |select| · 6 inner solves, and only fires when a
1425                    // null-space coordinate is actually held below the rail.
1426                    let rho_upper = crate::estimate::RHO_BOUND;
1427                    const OUTWARD_GRID: [f64; 6] = [6.0, 9.0, 12.0, 18.0, 24.0, 30.0];
1428                    let mut best_rho = strategy_result.rho.clone();
1429                    let mut best_pure = base_pure;
1430                    let mut improved = false;
1431                    for _pass in 0..2 {
1432                        let mut pass_improved = false;
1433                        for &coord in &select_coords {
1434                            let mut local_best = best_rho.clone();
1435                            let mut local_pure = best_pure;
1436                            for &cand in &OUTWARD_GRID {
1437                                let target = cand.min(rho_upper);
1438                                if target <= best_rho[coord] + 1e-9 {
1439                                    continue;
1440                                }
1441                                let mut probe = best_rho.clone();
1442                                probe[coord] = target;
1443                                if let Some(c) = pure_reml(&probe)
1444                                    && c < local_pure - 1e-6 * (1.0 + local_pure.abs())
1445                                {
1446                                    local_pure = c;
1447                                    local_best = probe;
1448                                }
1449                            }
1450                            if local_pure < best_pure - 1e-6 * (1.0 + best_pure.abs()) {
1451                                best_rho = local_best;
1452                                best_pure = local_pure;
1453                                improved = true;
1454                                pass_improved = true;
1455                            }
1456                        }
1457                        if !pass_improved {
1458                            break;
1459                        }
1460                    }
1461                    if improved {
1462                        // COLD confirmation (mirror of #1074): the warm grid probes
1463                        // ran off each other's inner warm starts and can report a
1464                        // spuriously-low cost. Clear the inner cache and re-score the
1465                        // candidate cold; adopt only if its PURE REML STILL strictly
1466                        // beats the authoritative converged baseline.
1467                        reml_state.reset_outer_seed_state();
1468                        let cold_penalized = reml_state.compute_cost(&best_rho);
1469                        let cold_pure = cold_penalized.as_ref().ok().and_then(|&c| {
1470                            c.is_finite().then(|| {
1471                                c - reml_state.configured_rho_prior_atom(&best_rho).cost()
1472                                    - reml_state.soft_rho_guard_prior_atom(&best_rho).cost()
1473                            })
1474                        });
1475                        // Total inner EDF at the candidate (cached from the cold eval).
1476                        // The PARSIMONY guard: a genuine shrink-out must not INCREASE
1477                        // the model's effective dimension (see `edf_conv`). When either
1478                        // EDF is unavailable, refuse the adoption — a shrink that can't
1479                        // be certified parsimonious is not worth the #1476 risk.
1480                        let edf_best = reml_state
1481                            .obtain_eval_bundle(&best_rho)
1482                            .ok()
1483                            .map(|b| b.pirls_result.edf);
1484                        let edf_non_increasing = match (edf_best, edf_conv) {
1485                            (Some(eb), Some(ec)) => eb <= ec + 1e-6,
1486                            _ => false,
1487                        };
1488                        if let (Ok(penalized), Some(cold_pure)) = (cold_penalized, cold_pure)
1489                            && cold_pure.is_finite()
1490                            && cold_pure < base_pure - 1e-6 * (1.0 + base_pure.abs())
1491                            && edf_non_increasing
1492                        {
1493                            // β̂ already installed at `best_rho` by the cold eval above.
1494                            // Report the PENALIZED cost there as the objective so the
1495                            // cached inner state and `final_value` agree with the
1496                            // adopted ρ for the downstream cap-guard / assembly.
1497                            log::info!(
1498                                "[OUTER] #1266 null-space shrink-out escape: pure REML \
1499                             {base_pure:.6e} → {cold_pure:.6e} (cold-confirmed), total \
1500                             EDF {edf_conv:?} → {edf_best:?} (parsimonious) by \
1501                             over-smoothing {} selection coord(s); adopting the \
1502                             shrink-out ρ (penalized cost {penalized:.6e})",
1503                                select_coords.len()
1504                            );
1505                            strategy_result.rho = best_rho;
1506                            strategy_result.final_value = penalized;
1507                        } else {
1508                            // The improvement did not survive a cold re-score (or the
1509                            // re-score failed) — a warm-start artifact. Keep the
1510                            // certified ρ and restore its inner state.
1511                            reml_state.reset_outer_seed_state();
1512                            reml_state.compute_cost(&strategy_result.rho)?;
1513                        }
1514                    }
1515                }
1516            }
1517            // Convergence guard for the outer-aware inner-PIRLS schedule
1518            // (path #3): the BFGS bridge stores a coarsen-then-tighten cap
1519            // into `reml_state.outer_inner_cap` on every accepted gradient
1520            // eval. After the outer optimizer returns, the cached warm-start
1521            // β was computed at whatever cap the schedule last set — which
1522            // for fast-converging fits (≤5 BFGS iters) is a coarse cap of
1523            // 5/10/20 rather than the full inner budget. Reset the cap to 0
1524            // and run one final cost eval at the converged ρ so the cached
1525            // β is at full inner tolerance.
1526            run_outer_inner_cap_guard(
1527                &mut reml_state,
1528                &strategy_result.rho,
1529                RemlInnerCapGuardArm::Standard,
1530            )?;
1531            // Honour an explicit caller rho seed as the accepted log-λ: when the
1532            // caller pins `init_rhos`, the outer search is warm-started there and
1533            // the seed is the requested operating point, so report it verbatim
1534            // rather than the optimizer's (possibly clamped) returned rho.
1535            //
1536            // EXCEPTION (gam#1464): a caller seed that arrives as a warm-start hint
1537            // (the spatial-κ sweep reuses the previous-κ λ̂ as `heuristic_lambdas`)
1538            // must NOT pin the fit at a seed the optimizer has just been able to
1539            // strictly improve on. At a collapsing kernel (the constant-curvature
1540            // `curv()` smooth on the +κ side) the warm seed sits in a shallow
1541            // under-smoothing basin whose spuriously-low deviance, if reported
1542            // verbatim, makes the κ outer objective rail to the +chart bound for any
1543            // curved data. The objective-grid prepass and the #1371 release-and-
1544            // rerank guard above redirect `strategy_result.rho` into the correct
1545            // high-λ basin; defer to that converged ρ whenever it is STRICTLY cheaper
1546            // than the caller seed under the same REML cost. A genuine user pin (or a
1547            // healthy warm start) converges at the seed, so the seed stays cheapest
1548            // and is honoured verbatim, byte-for-byte as before.
1549            let accepted_rho = match heuristic_lambdas.filter(|h| h.len() == k) {
1550                Some(h) => {
1551                    let seed = Array1::from_iter(h.iter().copied());
1552                    let prefer_converged = {
1553                        let cost_seed = reml_state.compute_cost(&seed).ok();
1554                        let cost_converged = reml_state.compute_cost(&strategy_result.rho).ok();
1555                        // Restore the cached β̂ to the converged operating point after
1556                        // the seed probe (the no-op path below expects β̂ at
1557                        // `strategy_result.rho`). Propagate any failure rather than
1558                        // swallowing it: proceeding with β̂ at the wrong operating
1559                        // point would silently corrupt the reported fit.
1560                        reml_state.compute_cost(&strategy_result.rho)?;
1561                        match (cost_seed, cost_converged) {
1562                            (Some(cs), Some(cc)) if cs.is_finite() && cc.is_finite() => {
1563                                cc < cs - 1e-6 * (1.0 + cs.abs())
1564                            }
1565                            _ => false,
1566                        }
1567                    };
1568                    if prefer_converged {
1569                        log::info!(
1570                            "[OUTER] #1464 warm-seed override: converged ρ is strictly cheaper than \
1571                         the caller warm seed; reporting the optimizer's ρ instead of the seed"
1572                        );
1573                        strategy_result.rho.clone()
1574                    } else {
1575                        seed
1576                    }
1577                }
1578                None => strategy_result.rho.clone(),
1579            };
1580            (
1581                accepted_rho,
1582                cfg.link_kind.mixture_state().cloned(),
1583                cfg.link_kind.sas_state().copied(),
1584                None,
1585                None,
1586                strategy_result,
1587            )
1588        } else {
1589            let use_mixture = mixture_dim > 0;
1590            let use_sas = sas_dim > 0;
1591            let use_beta_logistic =
1592                use_sas && matches!(cfg.link_function(), LinkFunction::BetaLogistic);
1593            let theta_dim = k + mixture_dim + sas_dim;
1594            let sasspec = sas_optspec;
1595            let mixspec = mixture_optspec
1596                .clone()
1597                .or_else(|| {
1598                    if use_mixture {
1599                        None
1600                    } else {
1601                        Some(MixtureLinkSpec {
1602                            components: Vec::new(),
1603                            initial_rho: Array1::zeros(0),
1604                        })
1605                    }
1606                })
1607                .ok_or_else(|| EstimationError::InvalidInput("missing mixture spec".to_string()))?;
1608            let mut heuristic_theta = Vec::new();
1609            if let Some(hvals) = heuristic_lambdas
1610                && hvals.len() == k
1611            {
1612                heuristic_theta.extend_from_slice(hvals);
1613                if use_mixture {
1614                    heuristic_theta
1615                        .extend_from_slice(mixspec.initial_rho.as_slice().unwrap_or(&[]));
1616                }
1617                if let Some(spec) = sasspec {
1618                    heuristic_theta.push(spec.initial_epsilon);
1619                    heuristic_theta.push(spec.initial_log_delta);
1620                }
1621            }
1622            let heuristic_theta_ref = if heuristic_theta.len() == theta_dim {
1623                Some(heuristic_theta.as_slice())
1624            } else {
1625                None
1626            };
1627            let aux_dim_outer = if use_mixture { mixture_dim } else { sas_dim };
1628            let mut reml_seed_config_mix = reml_seed_config;
1629            reml_seed_config_mix.num_auxiliary_trailing = aux_dim_outer;
1630            if theta_dim >= REML_SEED_SCREENING_RHO_CAP {
1631                reml_seed_config_mix.max_seeds = 1;
1632                reml_seed_config_mix.seed_budget = 1;
1633            }
1634            use crate::rho_optimizer::OuterProblem;
1635            use gam_problem::{DeclaredHessianForm, Derivative, HessianResult, OuterEval};
1636            let initial_link_kind = cfg.link_kind.clone();
1637            let prefer_gradient_only = theta_dim >= REML_SECOND_ORDER_RHO_CAP;
1638            let continuation_prewarm = theta_dim < REML_CONTINUATION_PREWARM_RHO_CAP;
1639            if prefer_gradient_only {
1640                log::info!(
1641                    "[OUTER] theta_dim {theta_dim} reaches exact REML Hessian budget \
1642                   ({REML_SECOND_ORDER_RHO_CAP}); routing analytic-gradient quasi-Newton"
1643                );
1644            }
1645            if !continuation_prewarm {
1646                log::info!(
1647                    "[OUTER] theta_dim {theta_dim} reaches continuation-prewarm budget \
1648                   ({REML_CONTINUATION_PREWARM_RHO_CAP}); starting optimizer directly from seeds"
1649                );
1650            }
1651            let problem = OuterProblem::new(theta_dim)
1652            .with_gradient(Derivative::Analytic)
1653            .with_hessian(DeclaredHessianForm::Either)
1654            .with_prefer_gradient_only(prefer_gradient_only)
1655            .with_continuation_prewarm(continuation_prewarm)
1656            .with_psi_dim(mixture_dim + sas_dim)
1657            .with_barrier(
1658                crate::estimate::reml::reml_outer_engine::BarrierConfig::from_constraints(
1659                    fit_linear_constraints.as_ref(),
1660                ),
1661            )
1662            .with_tolerance(reml_tol)
1663            .with_max_iter(reml_max_iter)
1664            .with_seed_config(reml_seed_config_mix)
1665            .with_screening_cap(Arc::clone(&reml_state.screening_max_inner_iterations))
1666            .with_outer_inner_cap(reml_inner_progress_feedback(&reml_state))
1667            .with_rho_bound(crate::estimate::RHO_BOUND);
1668            let problem = if let Some(h) = heuristic_theta_ref {
1669                problem.with_heuristic_lambdas(h.to_vec())
1670            } else {
1671                problem
1672            };
1673            let problem = if let Some(h) = heuristic_theta_ref {
1674                problem.with_initial_rho(Array1::from_iter(h.iter().copied()))
1675            } else {
1676                problem
1677            };
1678            let problem = match reml_state.outer_cache_session() {
1679                Some(session) => problem.with_cache_session(session),
1680                None => problem,
1681            };
1682            // Shared helper: parse theta into rho + link params, update link state.
1683            let apply_link_theta =
1684                |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1685                 theta: &Array1<f64>|
1686                 -> Result<Array1<f64>, EstimationError> {
1687                    let rho = theta.slice(s![..k]).to_owned();
1688                    let mut cfg_eval = cfg.clone();
1689                    if use_mixture {
1690                        let mix_rho = theta.slice(s![k..(k + mixture_dim)]).to_owned();
1691                        cfg_eval.link_kind = InverseLink::Mixture(
1692                            state_fromspec(&MixtureLinkSpec {
1693                                components: mixspec.components.clone(),
1694                                initial_rho: mix_rho,
1695                            })
1696                            .map_err(|e| {
1697                                EstimationError::InvalidInput(format!(
1698                                    "invalid blended inverse link: {e}"
1699                                ))
1700                            })?,
1701                        );
1702                    }
1703                    if use_sas {
1704                        let epsilon = if use_beta_logistic {
1705                            theta[k]
1706                        } else {
1707                            let (v, _) = sas_effective_epsilon(theta[k]);
1708                            v
1709                        };
1710                        let delta_like = theta[k + 1];
1711                        cfg_eval.link_kind = if use_beta_logistic {
1712                            InverseLink::BetaLogistic(
1713                                state_from_beta_logisticspec(SasLinkSpec {
1714                                    initial_epsilon: epsilon,
1715                                    initial_log_delta: delta_like,
1716                                })
1717                                .map_err(|e| {
1718                                    EstimationError::InvalidInput(format!(
1719                                        "invalid Beta-Logistic link: {e}"
1720                                    ))
1721                                })?,
1722                            )
1723                        } else {
1724                            InverseLink::Sas(
1725                                state_from_sasspec(SasLinkSpec {
1726                                    initial_epsilon: epsilon,
1727                                    initial_log_delta: delta_like,
1728                                })
1729                                .map_err(|e| {
1730                                    EstimationError::InvalidInput(format!("invalid SAS link: {e}"))
1731                                })?,
1732                            )
1733                        };
1734                    }
1735                    state.set_link_states(
1736                        cfg_eval.link_kind.mixture_state().cloned(),
1737                        cfg_eval.link_kind.sas_state().copied(),
1738                    );
1739                    Ok(rho)
1740                };
1741
1742            // SAS ridge/barrier cost correction (shared between cost_fn, eval_fn, efs_fn).
1743            let sas_ridge_cost = |theta: &Array1<f64>| -> f64 {
1744                let sasridge = if use_sas && !use_beta_logistic {
1745                    sasridgeweight
1746                } else {
1747                    0.0
1748                };
1749                if use_sas && sasridge > 0.0 {
1750                    let log_delta = theta[k + 1];
1751                    let mut extra = 0.5 * sasridge * log_delta * log_delta;
1752                    if !use_beta_logistic {
1753                        let (barriercost, _) = sas_log_delta_edge_barriercostgrad(log_delta);
1754                        extra += barriercost;
1755                    }
1756                    extra
1757                } else {
1758                    0.0
1759                }
1760            };
1761
1762            let obj = problem.build_objective(
1763            &mut reml_state,
1764            |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1765             theta: &Array1<f64>| {
1766                let rho = apply_link_theta(state, theta)?;
1767                let cost = state.compute_cost(&rho)? + sas_ridge_cost(theta);
1768                Ok(cost)
1769            },
1770            |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1771             theta: &Array1<f64>| {
1772                let eval_idx = outer_eval_idx.fetch_add(1, Ordering::Relaxed) + 1;
1773                let rho = apply_link_theta(state, theta)?;
1774                let tcost = Instant::now();
1775
1776                // Use the unified REML evaluator with link ext_coords.
1777                // This computes ρ gradient AND link parameter gradient jointly
1778                // through the same HyperCoord infrastructure used for aniso ψ.
1779                let eval_mode =
1780                    crate::estimate::reml::reml_outer_engine::EvalMode::ValueGradientHessian;
1781                let result = state.evaluate_unified_with_link_ext(&rho, eval_mode)?;
1782
1783                let cost = result.cost + sas_ridge_cost(theta);
1784                let mut grad = result.gradient.ok_or_else(|| {
1785                    EstimationError::InvalidInput(
1786                        "unified evaluator returned no gradient in ValueGradientHessian mode"
1787                            .to_string(),
1788                    )
1789                })?;
1790
1791                assert_eq!(
1792                    grad.len(),
1793                    theta_dim,
1794                    "unified evaluator gradient length {} != theta_dim {}",
1795                    grad.len(),
1796                    theta_dim
1797                );
1798
1799                let grad_effective = grad.clone();
1800                let mut hessian = materialize_link_outer_hessian(result.hessian, theta_dim)?;
1801
1802                // SAS epsilon reparameterization chain rule.
1803                if use_sas && !use_beta_logistic {
1804                    let (_, d_eps_d_raw, d2_eps_d_raw2) = sas_effective_epsilon_second(theta[k]);
1805                    for j in 0..theta_dim {
1806                        hessian[[k, j]] *= d_eps_d_raw;
1807                        hessian[[j, k]] *= d_eps_d_raw;
1808                    }
1809                    hessian[[k, k]] += grad_effective[k] * d2_eps_d_raw2;
1810                    grad[k] *= d_eps_d_raw;
1811                }
1812                // SAS log_delta ridge + barrier gradient/Hessian.
1813                if use_sas && !use_beta_logistic && sasridgeweight > 0.0 {
1814                    let log_delta = theta[k + 1];
1815                    grad[k + 1] += sasridgeweight * log_delta;
1816                    hessian[[k + 1, k + 1]] += sasridgeweight;
1817                    let (_, barriergrad, barrierhess) =
1818                        sas_log_delta_edge_barriercostgradhess(log_delta);
1819                    grad[k + 1] += barriergrad;
1820                    hessian[[k + 1, k + 1]] += barrierhess;
1821                }
1822
1823                let cost_sec = tcost.elapsed().as_secs_f64();
1824                let aux_dim = if use_mixture { mixture_dim } else { sas_dim };
1825                log::debug!(
1826                    "[outer-eval {eval_idx}] theta_dim={} aux_dim={} unified_link_ext time_sec={:.3}",
1827                    theta_dim,
1828                    aux_dim,
1829                    cost_sec,
1830                );
1831                Ok(OuterEval {
1832                    cost,
1833                    gradient: grad,
1834                    hessian: HessianResult::Analytic(hessian),
1835                    inner_beta_hint: state.current_original_basis_beta(),
1836                })
1837            },
1838            Some(|state: &mut &mut crate::estimate::reml::RemlState<'_>| {
1839                state.reset_outer_seed_state();
1840                state.set_link_states(
1841                    initial_link_kind.mixture_state().cloned(),
1842                    initial_link_kind.sas_state().copied(),
1843                );
1844            }),
1845            Some(
1846                |state: &mut &mut crate::estimate::reml::RemlState<'_>,
1847                 theta: &Array1<f64>| {
1848                    let rho = apply_link_theta(state, theta)?;
1849                    let mut efs_eval = state.compute_efs_steps_with_link_ext(&rho)?;
1850
1851                    // SAS reparameterization chain rule on ψ steps.
1852                    if use_sas && !use_beta_logistic {
1853                        let (_, d_eps_d_raw) = sas_effective_epsilon(theta[k]);
1854                        if efs_eval.steps.len() > k {
1855                            efs_eval.steps[k] *= d_eps_d_raw;
1856                        }
1857                        if let Some(ref mut pg) = efs_eval.psi_gradient
1858                            && !pg.is_empty() {
1859                                pg[0] *= d_eps_d_raw;
1860                            }
1861                    }
1862
1863                    // SAS log-δ ridge + edge barrier: their gradients enter
1864                    // `result.gradient` from the unified evaluator (estimate.rs
1865                    // 2170+), and `compute_efs_steps_with_link_ext` runs the
1866                    // universal-form EFS step `Δρ = log(1 − 2·g_full/q_eff)`
1867                    // which absorbs them automatically. We only need to
1868                    // mirror that contribution into the *cost* slot here so
1869                    // the outer fixed-point bridge's line search compares
1870                    // augmented-cost trial points consistently.
1871                    efs_eval.cost += sas_ridge_cost(theta);
1872                    Ok(efs_eval)
1873                },
1874            ),
1875        );
1876            // Same publish/consume symmetry as the standard REML arm above
1877            // (issue #236). The mixture/SAS eval closure also surfaces
1878            // `inner_beta_hint = state.current_original_basis_beta()` (see
1879            // src/solver/estimate.rs:3275), so continuation pre-warm needs
1880            // a real seed hook to install it.
1881            let mut obj = obj.with_seed_inner_state(with_reml_beta_seed_hook());
1882            let outer_result = problem.run(&mut obj, "mixture/SAS flexible link")?;
1883            drop(obj);
1884            // Convergence guard for the outer-aware inner-PIRLS schedule
1885            // (path #3) — see the matching comment in the standard REML arm
1886            // above. Reset the cap and run one final compute_cost at the
1887            // converged θ so the cached warm-start β is at full inner
1888            // tolerance regardless of where the BFGS schedule was when the
1889            // optimizer terminated.
1890            //
1891            // The outer vector here is the AUGMENTED θ = [ρ_smooth (k) | link
1892            // params (mixture_dim and/or sas_dim)], not a smoothing-only ρ.
1893            // `compute_cost` exponentiates its argument wholesale into the
1894            // penalty λ vector (loop_driver.rs `rho.mapv(exp)`), so the guard
1895            // must receive exactly the same smoothing-only ρ — and the same
1896            // installed link state — the outer evaluator operated on, never the
1897            // raw augmented θ. Feeding the full θ in made the guard hand `k +
1898            // mixture_dim + sas_dim` "lambdas" to a `k`-penalty reparameterizer,
1899            // which faults with "Lambda count mismatch" (#1571). Route θ through
1900            // the same `apply_link_theta` the eval closure (optimizer.rs:1759)
1901            // and the accept-fit slice (the `final_rho` line just below) use: it
1902            // installs the converged mixture/SAS link state onto `reml_state`
1903            // and returns the smoothing-only ρ block.
1904            let guard_rho = {
1905                let mut state_ref: &mut crate::estimate::reml::RemlState<'_> =
1906                    &mut reml_state;
1907                apply_link_theta(&mut state_ref, &outer_result.rho)?
1908            };
1909            run_outer_inner_cap_guard(
1910                &mut reml_state,
1911                &guard_rho,
1912                RemlInnerCapGuardArm::MixtureSas,
1913            )?;
1914            let final_rho = outer_result.rho.slice(s![..k]).to_owned();
1915            let final_mix_state = if use_mixture {
1916                let final_mix_rho = outer_result.rho.slice(s![k..(k + mixture_dim)]).to_owned();
1917                Some(
1918                    state_fromspec(&MixtureLinkSpec {
1919                        components: mixspec.components.clone(),
1920                        initial_rho: final_mix_rho,
1921                    })
1922                    .map_err(|e| {
1923                        EstimationError::InvalidInput(format!("invalid blended inverse link: {e}"))
1924                    })?,
1925                )
1926            } else {
1927                None
1928            };
1929            let final_sas_state = if use_sas {
1930                let epsilon_eff = if use_beta_logistic {
1931                    outer_result.rho[k]
1932                } else {
1933                    let (v, _) = sas_effective_epsilon(outer_result.rho[k]);
1934                    v
1935                };
1936                Some(if use_beta_logistic {
1937                    state_from_beta_logisticspec(SasLinkSpec {
1938                        initial_epsilon: epsilon_eff,
1939                        initial_log_delta: outer_result.rho[k + 1],
1940                    })
1941                    .map_err(|e| {
1942                        EstimationError::InvalidInput(format!("invalid Beta-Logistic link: {e}"))
1943                    })?
1944                } else {
1945                    state_from_sasspec(SasLinkSpec {
1946                        initial_epsilon: epsilon_eff,
1947                        initial_log_delta: outer_result.rho[k + 1],
1948                    })
1949                    .map_err(|e| EstimationError::InvalidInput(format!("invalid SAS link: {e}")))?
1950                })
1951            } else {
1952                cfg.link_kind.sas_state().copied()
1953            };
1954            let aux_param_covariance = None;
1955            let (mix_cov, sas_cov) = if use_mixture {
1956                (aux_param_covariance, None)
1957            } else if use_sas {
1958                (None, aux_param_covariance)
1959            } else {
1960                (None, None)
1961            };
1962            (
1963                final_rho,
1964                final_mix_state,
1965                final_sas_state,
1966                mix_cov,
1967                sas_cov,
1968                outer_result,
1969            )
1970        };
1971        // Reuse the Gaussian-Identity XᵀWX cache the outer loop already populated,
1972        // so the final accept-fit skips the streaming GEMM as well.
1973        //
1974        // When the outer loop conditioned the response (centering for #1000, scaling
1975        // for #1127), that cache holds `XᵀW((y−center)/scale)`; the accept-fit runs
1976        // on the *original* response `y_o`, so reusing the conditioned `XᵀWy` would
1977        // solve on the shifted/rescaled scale and report every fitted value, residual
1978        // and dispersion off the user's scale. Rebuild the cross-product from the
1979        // original response in that case — the constant `XᵀWX` block is the only part
1980        // the cache would have saved, a one-off cost paid only on the rare
1981        // large-mean / small-magnitude responses that trigger conditioning.
1982        let final_cache_handle = if response_center.is_some() || response_scale.is_some() {
1983            None
1984        } else {
1985            reml_state.gaussian_fixed_cache_if_eligible()
1986        };
1987        let pirls_res_pair = pirls::fit_model_for_fixed_rho_with_adaptive_kkt(
1988            LogSmoothingParamsView::new(final_rho.view()),
1989            pirls::PirlsProblem {
1990                x: reml_state.x(),
1991                offset: offset_o.view(),
1992                y: y_o.view(),
1993                priorweights: w_o.view(),
1994                covariate_se: None,
1995                gaussian_fixed_cache: final_cache_handle.as_deref(),
1996                // The final reported fit must be exact at the converged ρ/ψ — never
1997                // serve the frozen-W first-step approximation here.
1998                glm_first_step_gram: None,
1999            },
2000            pirls::PenaltyConfig {
2001                canonical_penalties: reml_state.canonical_penalties(),
2002                balanced_penalty_root: Some(reml_state.balanced_penalty_root()),
2003                reparam_invariant: None,
2004                p,
2005                coefficient_lower_bounds: None,
2006                linear_constraints_original: fit_linear_constraints.as_ref(),
2007                penalty_shrinkage_floor: opts.penalty_shrinkage_floor,
2008                kronecker_factored: None,
2009            },
2010            &pirls::PirlsConfig {
2011                link_kind: if let Some(state) = final_mixture_state.clone() {
2012                    InverseLink::Mixture(state)
2013                } else if let Some(state) = final_sas_state {
2014                    if matches!(cfg.link_function(), LinkFunction::BetaLogistic) {
2015                        InverseLink::BetaLogistic(state)
2016                    } else {
2017                        InverseLink::Sas(state)
2018                    }
2019                } else {
2020                    cfg.link_kind.clone()
2021                },
2022                ..cfg.as_pirls_config()
2023            },
2024            None,
2025            None,
2026            // Final, reported fit at the REML-selected λ: refine the family's
2027            // estimated dispersion nuisance at the converged η. For Gamma this
2028            // re-estimates the shape so `dispersion_phi()` and every SE / interval
2029            // reflect the conditional noise, not the spread of μ (#678); for Beta
2030            // it drives the precision φ and the mean β̂ to their joint fixed point,
2031            // undoing the slope attenuation from a φ frozen at the null predictor
2032            // (#769). λ is fixed here, so there is no scale↔λ feedback.
2033            true,
2034        )?;
2035        pirls_res = pirls_res_pair.0;
2036
2037        // Negative-Binomial outer θ↔λ alternation decision (#1448, supersedes the
2038        // #1082 drift diagnostic).
2039        //
2040        // θ was frozen at the λ-search value (`frozen_negbin_theta`) so `F(ρ)` is
2041        // stationary in ρ; the accept-fit above ML-refreshed θ at the converged η.
2042        // If that refreshed θ_final drifted from the search θ_frozen by more than the
2043        // joint-stationarity tolerance, the ρ we just selected was optimal for the
2044        // OLD θ, not θ_final: re-freeze the search at θ_final, reset the outer seed
2045        // state (eval bundle, PIRLS cache, warm-start signals, inner caps — all keyed
2046        // to the old θ), and run the ρ search again. Iterate to the (ρ, θ) joint
2047        // fixed point or until the round cap, after which we accept the last fit and
2048        // log the residual drift. For non-NB / user-fixed-θ fits the criterion below
2049        // is never met (θ is not estimated), so the loop breaks on round 0 and the
2050        // fit is byte-identical to the pre-#1448 single pass.
2051        let mut should_alternate = false;
2052        if pirls_res.likelihood.negbin_theta_is_estimated() {
2053            let frozen_bits = reml_state.frozen_negbin_theta.load(Ordering::Relaxed);
2054            if frozen_bits != 0
2055                && let Some(theta_final) = pirls_res.likelihood.negbin_theta()
2056            {
2057                let theta_frozen = f64::from_bits(frozen_bits);
2058                if theta_frozen.is_finite() && theta_frozen > 0.0 && theta_final.is_finite() {
2059                    let rel_drift =
2060                        (theta_final - theta_frozen).abs() / theta_frozen.max(f64::MIN_POSITIVE);
2061                    let drift_pct = rel_drift * 100.0;
2062                    if rel_drift > NEGBIN_THETA_JOINT_DRIFT_TOL {
2063                        if negbin_alternation_round + 1 < NEGBIN_OUTER_ALTERNATION_MAX_ROUNDS {
2064                            log::info!(
2065                                "[OUTER] negative-binomial θ↔λ alternation round {}: θ drifted \
2066                             {drift_pct:.1}% (θ_frozen={theta_frozen:.6e} → θ_final={theta_final:.6e}); \
2067                             re-freezing at θ_final and re-running the ρ search (#1448).",
2068                                negbin_alternation_round + 1
2069                            );
2070                            // Re-freeze the λ-search θ at the refreshed value. The
2071                            // capture in `solve_for_unified_rho` only writes when the
2072                            // frozen slot is 0, so a non-zero value here pins every
2073                            // subsequent λ-search inner solve to θ_final rather than
2074                            // re-deriving it from the seed η.
2075                            reml_state
2076                                .frozen_negbin_theta
2077                                .store(theta_final.to_bits(), Ordering::Relaxed);
2078                            // The cached criterion / factor bundle and warm-start
2079                            // signals were all computed at θ_frozen; drop them so the
2080                            // next round's ρ search recomputes `F(ρ) = REML(ρ, θ_final)`.
2081                            reml_state.reset_outer_seed_state();
2082                            should_alternate = true;
2083                        } else {
2084                            log::warn!(
2085                                "[OUTER] negative-binomial θ↔λ alternation hit the round cap \
2086                             ({NEGBIN_OUTER_ALTERNATION_MAX_ROUNDS}) with residual θ drift \
2087                             {drift_pct:.1}% (θ_frozen={theta_frozen:.6e} → θ_final={theta_final:.6e}); \
2088                             accepting the last fit (#1448)."
2089                            );
2090                        }
2091                    } else {
2092                        log::debug!(
2093                            "[OUTER] negative-binomial (ρ, θ) jointly stationary after {} \
2094                         alternation round(s): drift {drift_pct:.2}% \
2095                         (θ_frozen={theta_frozen:.6e} → θ_final={theta_final:.6e}).",
2096                            negbin_alternation_round + 1
2097                        );
2098                    }
2099                }
2100            }
2101        }
2102        if should_alternate {
2103            negbin_alternation_round += 1;
2104            continue;
2105        }
2106        break;
2107    } // negbin θ↔λ alternation loop (#1448)
2108    // Ensure we don't report 0 iterations to the caller; at least 1 is more meaningful.
2109    let iters = std::cmp::max(1, outer_result.iterations);
2110
2111    // Map beta back to original basis
2112    let beta_orig_internal = pirls_res
2113        .reparam_result
2114        .qs
2115        .dot(pirls_res.beta_transformed.as_ref());
2116    let beta_orig = conditioning.backtransform_beta(&beta_orig_internal);
2117
2118    // Effective sample size for dispersion/REML accounting.
2119    //
2120    // A prior weight of exactly 0 makes a row contribute nothing to any weighted
2121    // cross-product (XᵀWX, XᵀWy) or to the weighted RSS (w_i·r_i² = 0), so such a
2122    // row is statistically equivalent to an absent row. The *only* channel left by
2123    // which it could still perturb the fit is an explicit observation count. To
2124    // keep zero-weight rows exactly equivalent to absent rows (R's `n.ok =
2125    // nobs − Σ[w==0]`, mgcv's dropped zero-weight observations), the dispersion
2126    // sample size must be the count of positive-weight rows, not the raw row
2127    // count. Otherwise the Gaussian scale φ̂ = weighted_rss / (n − edf) puts a
2128    // numerator that already excludes zero-weight rows over a denominator that
2129    // counts them, biasing φ̂ low and shrinking every SE (#584). The REML
2130    // criterion's own observation count (which drives λ selection) lives in the
2131    // inner-solution assembly and must apply the same positive-weight count.
2132    let n = w_o.iter().filter(|&&wi| wi > 0.0).count() as f64;
2133    let weighted_rss = if matches!(cfg.link_function(), LinkFunction::Identity) {
2134        let fitted = {
2135            let mut eta = offset_o.clone();
2136            eta += &x_o.matrixvectormultiply(&beta_orig);
2137            eta
2138        };
2139        let resid = y_o.to_owned() - &fitted;
2140        w_o.iter()
2141            .zip(resid.iter())
2142            .map(|(&wi, &ri)| wi * ri * ri)
2143            .sum()
2144    } else {
2145        0.0
2146    };
2147
2148    // Default solver policy stays on the REML/Laplace path. Joint HMC remains
2149    // available through explicit sampling flows, but fitting does not
2150    // automatically densify the Hessian or escalate into NUTS.
2151    let (final_rho, pirls_res) = (final_rho, pirls_res);
2152
2153    // Recompute beta in the finalized basis/parameterization.
2154    let beta_orig_internal = pirls_res
2155        .reparam_result
2156        .qs
2157        .dot(pirls_res.beta_transformed.as_ref());
2158
2159    let lambdas = final_rho.mapv(f64::exp);
2160    let p_dim = pirls_res.beta_transformed.len();
2161    let penalty_rank_total = pirls_res.reparam_result.e_transformed.nrows();
2162    let mp = (p_dim as f64 - penalty_rank_total as f64).max(0.0);
2163    let mut edf_by_block = vec![0.0; k];
2164    // Raw per-block penalty trace tr_kk = λ_kk·tr(H⁻¹S_kk), retained so per-term
2165    // EDF can be assembled as |coeff_range| − Σ tr_kk (issue #1219).
2166    let mut penalty_block_trace = vec![0.0; k];
2167    let mut edf_total = 0.0;
2168    let mut smoothing_correction = None;
2169    let mut rho_covariance = None;
2170    let mut penalized_hessian = Array2::<f64>::zeros((0, 0));
2171    let mut beta_covariance = None;
2172    let mut beta_standard_errors = None;
2173    let mut beta_covariance_corrected = None;
2174    let mut beta_standard_errors_corrected = None;
2175    let mut beta_covariance_frequentist = None;
2176    let mut coefficient_influence = None;
2177    let mut weighted_gram = None;
2178    // Factorization of stabilized Hessian in transformed basis, reused for
2179    // SE computation via solve-on-demand after dispersion is determined.
2180    let mut edf_factor: Option<Box<dyn FactorizedSystem>> = None;
2181    let mut bias_correction_beta = None;
2182    let mut rho_posterior_certificate = None;
2183    let mut rho_posterior_escalation = None;
2184
2185    if opts.compute_inference {
2186        // EDF by block using stabilized H and penalty roots in transformed basis.
2187        let h = &pirls_res.stabilizedhessian_transformed;
2188        let p_dim = h.nrows();
2189        // Sparse-aware factorization with ridge retry — no densification.
2190        // Uses SymmetricMatrix::factorize() -> sparse Cholesky for sparse,
2191        // dense Cholesky for dense.
2192        let factor = {
2193            let scale = h.max_abs_diag();
2194            let min_step = scale * 1e-10;
2195            let mut ridge = 0.0_f64;
2196            let mut attempts = 0_usize;
2197            loop {
2198                let candidate = if ridge > 0.0 {
2199                    match h.addridge(ridge) {
2200                        Ok(c) => c,
2201                        Err(_) => h.clone(),
2202                    }
2203                } else {
2204                    h.clone()
2205                };
2206                if let Ok(f) = candidate.factorize() {
2207                    if ridge > 0.0 {
2208                        // This ridged factor is reused for the reported standard
2209                        // errors, covariance, and bias correction below, so those
2210                        // quantities are stabilized approximations, not the exact
2211                        // (unridged) Hessian-based values.
2212                        log::warn!(
2213                            "Inference Hessian was rank-deficient and required a stabilizing \
2214                             ridge {:.3e}; reported standard errors, covariance, and bias \
2215                             correction are computed from the ridge-stabilized factor and are \
2216                             approximations, not exact unridged values",
2217                            ridge,
2218                        );
2219                    }
2220                    break f;
2221                }
2222                attempts += 1;
2223                if attempts >= MAX_FACTORIZATION_ATTEMPTS {
2224                    return Err(EstimationError::ModelIsIllConditioned {
2225                        condition_number: f64::INFINITY,
2226                    });
2227                }
2228                ridge = if ridge <= 0.0 { min_step } else { ridge * 10.0 };
2229            }
2230        };
2231        let mut traces = vec![0.0f64; k];
2232        for (kk, cp) in pirls_res
2233            .reparam_result
2234            .canonical_transformed
2235            .iter()
2236            .enumerate()
2237        {
2238            // Build the p × rank RHS with nonzeros only in [start..end] rows.
2239            let r = &cp.col_range;
2240            let rank = cp.rank();
2241            let mut rhs = Array2::<f64>::zeros((p_dim, rank));
2242            for col in 0..rank {
2243                for row in 0..cp.block_dim() {
2244                    rhs[[r.start + row, col]] = cp.root[[col, row]];
2245                }
2246            }
2247            let sol =
2248                factor
2249                    .solvemulti(&rhs)
2250                    .map_err(|_| EstimationError::ModelIsIllConditioned {
2251                        condition_number: f64::INFINITY,
2252                    })?;
2253            // Frobenius inner product: only the block rows of rhs are nonzero.
2254            let mut frob = 0.0f64;
2255            for col in 0..rank {
2256                for row in 0..cp.block_dim() {
2257                    frob += sol[[r.start + row, col]] * rhs[[r.start + row, col]];
2258                }
2259            }
2260            // The per-block penalty trace `tr_kk = λ_kk·tr(H⁻¹ S_kk)` is the
2261            // penalized effective d.f. of block `kk`, mathematically confined to
2262            // `[0, rank_kk]` (a PSD penalty can absorb at most its own rank). When
2263            // the outer REML / spatial-κ optimizer drives a redundant block's
2264            // `λ_kk = exp(ρ_kk)` to the finite ceiling (gam#1379: the Matérn kernel
2265            // already controls the smoothness a redundant operator block also
2266            // penalizes, so REML wants `λ → ∞`), the raw product `λ_kk · frob`
2267            // can overflow to `+∞` on the ridge-stabilized inference Hessian even
2268            // though the true value is just `rank_kk` — poisoning
2269            // `penalty_block_trace[kk]` and tripping the fit-result finiteness
2270            // validator (`fit_result.penalty_block_trace[kk] must be finite, got
2271            // inf`). Clamp to the valid `[0, rank]` interval so a fully-penalized
2272            // direction reads its exact saturated trace `rank_kk` instead of `+∞`.
2273            // Ordinary finite traces are inside `[0, rank]` and pass through
2274            // unchanged, so non-degenerate fits and their recorded EDF accounting
2275            // are bit-identical (the `edf_by_block` channel already clamps the
2276            // complementary `rank − trace` to `[0, rank]`).
2277            // f64::clamp does NOT fix NaN (only ±inf): a NaN product (e.g.
2278            // inf*0 from an overflowed solve) would slip through and trip the
2279            // penalty_block_trace finiteness validator. Map any non-finite
2280            // product to the saturated `rank` bound, exactly as the inf case
2281            // already resolves (gam#1379).
2282            let trace_val = lambdas[kk] * frob;
2283            traces[kk] = if trace_val.is_finite() {
2284                trace_val.clamp(0.0, rank as f64)
2285            } else {
2286                rank as f64
2287            };
2288        }
2289        edf_total = (p_dim as f64 - kahan_sum(traces.iter().copied())).clamp(mp, p_dim as f64);
2290        penalty_block_trace.clone_from(&traces);
2291        for (kk, cp) in pirls_res
2292            .reparam_result
2293            .canonical_transformed
2294            .iter()
2295            .enumerate()
2296        {
2297            let p_k = cp.rank() as f64;
2298            let edf_k = (p_k - traces[kk]).clamp(0.0, p_k);
2299            edf_by_block[kk] = edf_k;
2300        }
2301
2302        // Reconcile the EDF accounting with the influence matrix F = H⁻¹X'WX.
2303        //
2304        // The block-trace channel above factorizes the TRANSFORMED stabilized
2305        // Hessian with a bespoke 10×-escalation ridge loop. On rank-deficient
2306        // spatial-smooth corners (degenerate-Hessian thin-plate fits) that loop
2307        // can take an enormous ridge, inflating Σ tr_kk toward `p` and collapsing
2308        // `edf_total = p − Σ tr_kk` onto its floor `mp` (e.g. 1.0 for a single
2309        // smooth) even though the fitted surface — and the influence matrix `F`
2310        // that the prediction, dispersion, and per-term EDF all consume — has
2311        // legitimately spent ~70 EDF (issue #1356). The authoritative model
2312        // definition of EDF is the influence-matrix trace; the per-term EDF
2313        // (`FitResult::per_term_edf`) reads `tr(F)` over each block. Recompute the
2314        // per-block penalty traces from the SAME rank-revealing inverse `F` uses
2315        // (`matrix_inversewith_regularization` of the original-basis Hessian), so
2316        // `edf_total = p − Σ tr_kk = tr(F)`, `Σ edf_by_block = edf_total`, and the
2317        // total can never fall below a single term's own EDF. Done before the
2318        // dispersion `σ̂² = RSS/(n − edf_total)` is formed so it, too, uses the
2319        // honest effective d.f. (the trace-channel collapse otherwise biased
2320        // σ̂² high → inflated SEs on the same seeds).
2321        //
2322        // Per-block traces `tr_kk = λ_kk·tr(H⁻¹ S_kk)` are basis-invariant; map
2323        // each canonical block's penalty root into the original coefficient basis
2324        // (`root_orig = Qs · root_t`) and contract against the original-basis
2325        // inverse. Restricted to small models (where the dense inverse `F` itself
2326        // is formed); large models keep the trace-channel value.
2327        {
2328            let p_orig = pirls_res.reparam_result.qs.nrows();
2329            const COV_FULL_INVERSE_MAX_P: usize = 10_000;
2330            if p_orig <= COV_FULL_INVERSE_MAX_P {
2331                let h_orig = map_hessian_to_original_basis(&pirls_res)?;
2332                if let Some(h_inv) =
2333                    matrix_inversewith_regularization(&h_orig, "edf reconciliation")
2334                {
2335                    let qs = &pirls_res.reparam_result.qs;
2336                    let p_t = qs.ncols();
2337                    let mut traces_f = vec![0.0f64; k];
2338                    for (kk, cp) in pirls_res
2339                        .reparam_result
2340                        .canonical_transformed
2341                        .iter()
2342                        .enumerate()
2343                    {
2344                        if kk >= lambdas.len() {
2345                            continue;
2346                        }
2347                        let r = &cp.col_range;
2348                        let rank = cp.rank();
2349                        let mut root_t = Array2::<f64>::zeros((p_t, rank));
2350                        for col in 0..rank {
2351                            for row in 0..cp.block_dim() {
2352                                root_t[[r.start + row, col]] = cp.root[[col, row]];
2353                            }
2354                        }
2355                        // S_kk = Rᵀ R; λ_kk·tr(H⁻¹ S_kk) = λ_kk·Σ_col (R_col)ᵀ H⁻¹ R_col.
2356                        let root_orig = qs.dot(&root_t); // p_orig × rank
2357                        let sol = h_inv.dot(&root_orig); // H⁻¹ R
2358                        let mut frob = 0.0f64;
2359                        for col in 0..rank {
2360                            for row in 0..p_orig {
2361                                frob += sol[[row, col]] * root_orig[[row, col]];
2362                            }
2363                        }
2364                        // Same `[0, rank]` clamp as the trace-channel path above
2365                        // (gam#1379): a ceiling-`λ` redundant block's
2366                        // `λ_kk·tr(H⁻¹ S_kk)` can overflow to `+∞` here too; the
2367                        // penalized trace is bounded by the block rank, so clamp to
2368                        // keep `penalty_block_trace` finite and the EDF accounting
2369                        // consistent. Finite in-range traces are untouched.
2370                        // NaN-safe (gam#1379): f64::clamp leaves NaN as NaN, so
2371                        // map any non-finite product to the saturated `rank`.
2372                        let trace_val = lambdas[kk] * frob;
2373                        traces_f[kk] = if trace_val.is_finite() {
2374                            trace_val.clamp(0.0, rank as f64)
2375                        } else {
2376                            rank as f64
2377                        };
2378                    }
2379                    edf_total = (p_orig as f64 - kahan_sum(traces_f.iter().copied()))
2380                        .clamp(mp, p_orig as f64);
2381                    penalty_block_trace.clone_from(&traces_f);
2382                    for (kk, cp) in pirls_res
2383                        .reparam_result
2384                        .canonical_transformed
2385                        .iter()
2386                        .enumerate()
2387                    {
2388                        let p_k = cp.rank() as f64;
2389                        edf_by_block[kk] = (p_k - traces_f[kk]).clamp(0.0, p_k);
2390                    }
2391                }
2392            }
2393        }
2394
2395        // O(n⁻¹) frequentist bias correction vector b̂ = H⁻¹ S(λ̂)(β̂ - μ).
2396        // Computed in transformed PIRLS basis (where the factorization above lives)
2397        // and then mapped to the original coefficient basis via Qs.
2398        // Frequentist bias of the linear predictor at x is -s_*(x)^T b̂; the
2399        // corrected predictor is η̂_BC(x) = η̂(x) + s_*(x)^T b̂.
2400        let beta_t = pirls_res.beta_transformed.as_ref();
2401        let mut s_beta_t = Array1::<f64>::zeros(p_dim);
2402        for (kk, cp) in pirls_res
2403            .reparam_result
2404            .canonical_transformed
2405            .iter()
2406            .enumerate()
2407        {
2408            // S_k(β - μ): only the col_range of beta couples through local penalty.
2409            let r = &cp.col_range;
2410            let local = cp.local_ref();
2411            let beta_block = beta_t.slice(ndarray::s![r.clone()]);
2412            let centered = &beta_block - &cp.prior_mean;
2413            let local_beta = local.dot(&centered);
2414            let lam_k = lambdas[kk];
2415            let mut acc = s_beta_t.slice_mut(ndarray::s![r.clone()]);
2416            acc.scaled_add(lam_k, &local_beta);
2417        }
2418        match factor.solve(&s_beta_t) {
2419            Ok(b_t) => {
2420                let qs = &pirls_res.reparam_result.qs;
2421                let b_orig = qs.dot(&b_t);
2422                if b_orig.iter().all(|v| v.is_finite()) {
2423                    bias_correction_beta = Some(b_orig);
2424                } else {
2425                    log::warn!("bias-correction vector contained non-finite entries; skipping");
2426                }
2427            }
2428            Err(e) => {
2429                log::warn!("bias-correction solve failed: {e}");
2430            }
2431        }
2432        // Preserve the factorization for solve-on-demand SE and covariance
2433        // computation below, after dispersion has been determined.
2434        edf_factor = Some(factor);
2435    }
2436
2437    // Persist residual-based scale for Gaussian identity models.
2438    // Contract: residual standard deviation sigma, not variance.
2439    //
2440    // Gaussian REML scale: σ̂² = RSS / (n − edf_total), matching mgcv's gam.scale.
2441    // Using the null-space dim (mp = p − rank(Σ_k S_k)) here was wrong: mp is the
2442    // minimum possible edf (all smooths fully penalized to their null space), so
2443    // n − mp ≥ n − edf_total, and σ̂² was systematically biased low whenever any
2444    // smooth/random-effect spent real edf. edf_total ∈ [mp, p_dim] is the effective
2445    // df computed just above from tr(λ_k · H⁻¹ S_k), and is exactly the residual
2446    // df mgcv uses. When inference is off, edf_total is unavailable, so the MLE
2447    // RSS/n is returned instead.
2448    let standard_deviation = match &pirls_res.likelihood.spec.response {
2449        ResponseFamily::Gaussian => {
2450            let denom = if opts.compute_inference {
2451                (n - edf_total).max(1.0)
2452            } else {
2453                n.max(1.0)
2454            };
2455            (weighted_rss / denom).sqrt()
2456        }
2457        ResponseFamily::Gamma => pirls_res.likelihood.gamma_shape().unwrap_or(1.0),
2458        ResponseFamily::Binomial
2459        | ResponseFamily::Tweedie { .. }
2460        | ResponseFamily::NegativeBinomial { .. }
2461        | ResponseFamily::Beta { .. }
2462        | ResponseFamily::Poisson
2463        | ResponseFamily::RoystonParmar => 1.0,
2464    };
2465    let dispersion = dispersion_from_likelihood(&pirls_res.likelihood, standard_deviation);
2466
2467    // Explicit dispersion contract for coefficient covariance matrices:
2468    // Vb = H⁻¹ · cov_scale, where the stored penalized Hessian is always
2469    // H = XᵀWX + S_λ with the penalty added UNSCALED. The multiplier therefore
2470    // restores ONLY the dispersion the working weight W does not already carry:
2471    //
2472    //   * Profiled Gaussian keeps W scale-free (W = priorweights), so the data
2473    //     term has unit implicit scale and Vb = H⁻¹·σ̂².
2474    //   * Every other family folds its reciprocal dispersion / full Fisher
2475    //     information into W (Gamma W = prior/φ, Tweedie W = prior·μ^{2−p}/φ,
2476    //     Beta/NB the complete fixed-scale Fisher info, Poisson/Binomial φ ≡ 1),
2477    //     so H already equals the true penalized Hessian (identical to mgcv's
2478    //     XᵀW_sfX/φ + S_λ) and Vb = H⁻¹ with NO extra dispersion factor. A
2479    //     post-hoc ×φ here would double-count the dispersion and shrink every SE
2480    //     by √φ (= 1/√shape for Gamma); see #679.
2481    //
2482    // The single source of truth for this invariant is
2483    // `GlmLikelihoodSpec::coefficient_covariance_scale`; the response-level
2484    // observation noise used by predictive intervals stays in `dispersion`
2485    // above (a deliberately distinct quantity, e.g. 1/shape for Gamma).
2486    let cov_scale = pirls_res
2487        .likelihood
2488        .coefficient_covariance_scale(standard_deviation * standard_deviation)
2489        .max(f64::MIN_POSITIVE);
2490
2491    // Compute gradient norm at final rho for reporting
2492    let finalgrad = reml_state
2493        .compute_gradient(&final_rho)
2494        .unwrap_or_else(|_| Array1::from_elem(final_rho.len(), f64::NAN));
2495    let finalgrad_norm_rho = finalgrad.dot(&finalgrad).sqrt();
2496    let finalgrad_norm = if finalgrad_norm_rho.is_finite() {
2497        finalgrad_norm_rho
2498    } else {
2499        outer_result.final_grad_norm.unwrap_or(0.0)
2500    };
2501
2502    if opts.compute_inference {
2503        penalized_hessian = map_hessian_to_original_basis(&pirls_res)?;
2504        let p_cov = penalized_hessian.nrows();
2505        let qs = &pirls_res.reparam_result.qs;
2506
2507        // Auto-select covariance strategy based on model size.
2508        //
2509        // For small-to-medium models (p ≤ COV_FULL_INVERSE_MAX_P) we can afford
2510        // the full p×p inverse: O(p³) compute, O(p²) memory. The full matrix is
2511        // needed for the frequentist covariance Ve = H⁻¹ X'WX H⁻¹ φ, the
2512        // influence matrix F = H⁻¹ X'WX, and the smoothing-parameter correction.
2513        //
2514        // For large models we use solve-on-demand against the Cholesky factor
2515        // already computed for EDF traces above. We solve H_t Z_t = Qs^T in
2516        // column chunks of size COV_SE_CHUNK, then extract the diagonal of
2517        // Qs · Z_t = H_orig⁻¹ to get exact posterior SEs without ever
2518        // materialising the p×p inverse. Prediction bands continue to work via
2519        // the factorised-Hessian path in PredictionCovarianceBackend::Factorized.
2520        const COV_FULL_INVERSE_MAX_P: usize = 10_000;
2521        const COV_SE_CHUNK: usize = 512;
2522
2523        // Attempt the full inverse when the model is small enough.
2524        let beta_covariance_unscaled: Option<Array2<f64>> = if p_cov <= COV_FULL_INVERSE_MAX_P {
2525            match matrix_inversewith_regularization(&penalized_hessian, "posterior covariance") {
2526                Some(h_inv) => Some(h_inv),
2527                None => {
2528                    log::warn!(
2529                        "posterior covariance inversion failed (p={p_cov}): \
2530                         falling back to solve-on-demand standard errors"
2531                    );
2532                    None
2533                }
2534            }
2535        } else {
2536            None
2537        };
2538
2539        if let Some(ref h_inv) = beta_covariance_unscaled {
2540            // Full inverse available: wrap as phi-scaled covariance, compute
2541            // frequentist quantities, and pass to smoothing-correction cubature.
2542            beta_covariance = Some(gam_problem::dispersion_cov::PhiScaledCovariance::wrap(
2543                scaled_covariance(h_inv.clone(), cov_scale),
2544            ));
2545
2546            // Frequentist covariance Ve = F H⁻¹ φ and influence matrix F = H⁻¹ X'WX.
2547            // Both require the full unscaled inverse; computed in original basis.
2548            //
2549            // The canonical penalties live in the TRANSFORMED frame, while
2550            // `h_inv` is the ORIGINAL-basis inverse — assemble S(λ) in the
2551            // transformed frame and map it through the same congruence as the
2552            // Hessian (`S_orig = Qs·S_t·Qsᵀ`, issue #1027). Pairing the
2553            // transformed-frame S directly with the original-frame inverse made
2554            // `F` (and everything reconstructed from it) frame-inconsistent.
2555            let p_t = qs.ncols();
2556            let mut s_t = Array2::<f64>::zeros((p_t, p_t));
2557            for (kk, cp) in pirls_res
2558                .reparam_result
2559                .canonical_transformed
2560                .iter()
2561                .enumerate()
2562            {
2563                if kk >= lambdas.len() {
2564                    continue;
2565                }
2566                let r = &cp.col_range;
2567                let local = cp.local_ref();
2568                let lam = lambdas[kk];
2569                for i in 0..cp.block_dim() {
2570                    for j in 0..cp.block_dim() {
2571                        s_t[[r.start + i, r.start + j]] += lam * local[[i, j]];
2572                    }
2573                }
2574            }
2575            let mut s_mat = qs.dot(&s_t).dot(&qs.t());
2576            gam_linalg::matrix::symmetrize_in_place(&mut s_mat);
2577            // Influence matrix F = I − H⁻¹·S(λ) = H⁻¹·X'WX. This is a product
2578            // of two symmetric matrices and is therefore generally NOT
2579            // symmetric; it must not be symmetrized — `gam_linalg::matrix::symmetrize_in_place(F)`
2580            // both breaks the H·F = X'WX consistency identity (so any
2581            // downstream code that reconstructs X'WX from H·F lands on an
2582            // asymmetric/indefinite matrix) AND corrupts the frequentist
2583            // covariance `Ve = F·H⁻¹·φ` (since (F_sym)·H⁻¹ ≠ H⁻¹·X'WX·H⁻¹)
2584            // AND distorts the Wood-corrected reference d.f.
2585            // `tr(F_jj)² / tr(F_jj²)` consumed by `smooth_test::reference_df`
2586            // (tr(F²) ≠ tr(F_sym²) in general). See issue #1027.
2587            let mut f_mat = Array2::<f64>::eye(p_cov);
2588            f_mat -= &h_inv.dot(&s_mat);
2589            let mut ve = f_mat.dot(h_inv);
2590            ve *= cov_scale;
2591            gam_linalg::matrix::symmetrize_in_place(&mut ve);
2592            // X'WX = H − S(λ) in the original basis — the genuine PSD weighted
2593            // Gram, reconstructed from the same `penalized_hessian` and `s_mat`
2594            // that define `F = H⁻¹X'WX` (issue #1027). Stored directly so the
2595            // WPS corrected-EDF correction never has to recover it from an
2596            // inconsistent `H·F` product.
2597            let mut xwx = &penalized_hessian - &s_mat;
2598            gam_linalg::matrix::symmetrize_in_place(&mut xwx);
2599            weighted_gram = Some(xwx);
2600            coefficient_influence = Some(f_mat);
2601            beta_covariance_frequentist = Some(ve);
2602        }
2603
2604        // Smoothing-parameter correction (first-order delta + optional cubature).
2605        // Passes None for large models; compute_smoothing_correction_auto falls
2606        // back to first-order correction when no base covariance is supplied.
2607        // `cov_scale` is the coefficient-covariance multiplier at the optimum
2608        // (σ̂² for profiled Gaussian, 1 for every weight-carries-dispersion
2609        // family). The cubature path multiplies its dispersion-free curvature
2610        // block `E_ρ[H(ρ)⁻¹] − H_opt⁻¹` by this scale so the FULL cubature
2611        // correction lands on the same c² variance scale as `Vb = cov_scale·H_opt⁻¹`
2612        // (#582); the var_beta = Cov_ρ[β̂] block is already on that scale and
2613        // stays unscaled.
2614        let smoothing_outcome = reml_state.compute_smoothing_correction_auto(
2615            &final_rho,
2616            &pirls_res,
2617            beta_covariance_unscaled.as_ref(),
2618            cov_scale,
2619            finalgrad_norm,
2620        );
2621        rho_covariance = smoothing_outcome.rho_covariance().cloned();
2622        smoothing_correction = smoothing_outcome.into_correction();
2623
2624        // Tier-0 marginal-smoothing certificate (#938): while the REML objective
2625        // is still live, sample the outer criterion around the converged ρ̂ to
2626        // read the PSIS k̂ that says whether the plug-in + first-order V_ρ
2627        // correction is adequate. This is the objective-lifecycle seam — the
2628        // certificate runs against the SAME objective the fit converged on, so
2629        // its criterion is the fit's own bit-for-bit (no retain/rebuild). Absent
2630        // when there are no smoothing parameters or the outer Hessian is
2631        // unavailable; never fatal. Superseded intermediate fits skip this block
2632        // and the caller must refit with a live objective before returning that
2633        // model. When the certificate reads Escalate, the auto-selected escalation
2634        // tier (quadrature for K≤4, NUTS over ρ for K≤16, honest Unavailable
2635        // beyond) runs at this same live seam.
2636        if !opts.skip_rho_posterior_inference {
2637            (rho_posterior_certificate, rho_posterior_escalation) =
2638                reml_state.rho_posterior_inference(&final_rho, None);
2639        }
2640
2641        // Standard errors: prefer the diagonal of the full inverse when
2642        // available; otherwise use the factorised Hessian from the EDF pass
2643        // (in transformed basis) to compute exact diagonal of H_orig⁻¹ =
2644        // Qs H_t⁻¹ Qs' via chunked solve-on-demand. Memory per chunk:
2645        // 2 × p × COV_SE_CHUNK × 8 bytes.
2646        beta_standard_errors = if let Some(ref h_inv) = beta_covariance_unscaled {
2647            // Fast path: SE from stored full inverse (already phi-scaled via
2648            // beta_covariance, but we need the unscaled diagonal here).
2649            let raw_se = Array1::from_iter(
2650                h_inv
2651                    .diag()
2652                    .iter()
2653                    .map(|&v| (cov_scale * v.max(0.0)).sqrt()),
2654            );
2655            Some(raw_se)
2656        } else if let Some(ref factor_t) = edf_factor {
2657            // Solve-on-demand: process columns of Qs^T in chunks.
2658            // Qs is (p_cov × p_t) orthogonal. H_orig⁻¹ = Qs H_t⁻¹ Qs'.
2659            // (H_orig⁻¹)_{ii} = Qs[i,:] · H_t⁻¹ · Qs[i,:]'
2660            // Batch: column i of Qs^T is row i of Qs. Solve H_t Z = Qs^T[:,chunk]
2661            // then dot each solution column back with the corresponding Qs row.
2662            let mut diag_inv = Array1::<f64>::zeros(p_cov);
2663            let mut col_start = 0usize;
2664            while col_start < p_cov {
2665                let col_end = (col_start + COV_SE_CHUNK).min(p_cov);
2666                let chunk = col_end - col_start;
2667                // qs.t() has shape (p_t, p_cov); slice to (p_t, chunk).
2668                let rhs = qs.t().slice(ndarray::s![.., col_start..col_end]).to_owned();
2669                match factor_t.solvemulti(&rhs) {
2670                    Ok(z_chunk) => {
2671                        // z_chunk is (p_t × chunk).
2672                        // (H_orig⁻¹)_{ii} = qs.row(i) · z_chunk.column(i - col_start)
2673                        for local_i in 0..chunk {
2674                            let global_i = col_start + local_i;
2675                            let qs_row = qs.row(global_i);
2676                            let z_col = z_chunk.column(local_i);
2677                            diag_inv[global_i] = qs_row.dot(&z_col);
2678                        }
2679                    }
2680                    Err(e) => {
2681                        log::warn!(
2682                            "SE solve-on-demand failed at chunk {col_start}..{col_end}: {e}"
2683                        );
2684                        // Leave remaining entries as 0 (no SE).
2685                        break;
2686                    }
2687                }
2688                col_start = col_end;
2689            }
2690            let se = diag_inv.mapv(|v| (cov_scale * v.max(0.0)).sqrt());
2691            if se.iter().all(|v| v.is_finite()) {
2692                Some(se)
2693            } else {
2694                log::warn!("SE solve-on-demand produced non-finite entries; discarding");
2695                None
2696            }
2697        } else {
2698            None
2699        };
2700
2701        // Vp = Vb + J·V_ρ·Jᵀ, both terms on the SAME dispersion (variance) scale.
2702        //
2703        // The smoothing correction is built from the coefficient sensitivities
2704        // J = dβ̂/dρ = −H⁻¹(λ_k S_k(β̂ − μ_k)), which are linear in β̂, and from
2705        // V_ρ = (∇²_ρρ V)⁻¹. Under a Gaussian rescaling y → c·y the fit is exactly
2706        // equivariant: β̂ → c·β̂ (so J → c·J), H is response-scale-invariant, the
2707        // REML/LAML cost gains only a ρ-independent (n/2)·log(c²) offset (so its
2708        // ρ-gradient and ρ-Hessian — hence V_ρ — are dispersion-free), and φ̂ → c²·φ̂.
2709        // Therefore J·V_ρ·Jᵀ ∝ c · c⁰ · c = c², i.e. the correction is already on
2710        // the c² variance scale, exactly like Vb = φ̂·H⁻¹ ∝ c². It must be added
2711        // directly to Vb. Multiplying it by cov_scale
2712        // (≈ c²) again would make the correction scale as c⁴, inflating every
2713        // predict() interval for large-magnitude responses (#582). cov_scale is
2714        // applied once, where it belongs: in Vb = scaled_covariance(H⁻¹, cov_scale).
2715        beta_covariance_corrected = match (&beta_covariance, &smoothing_correction) {
2716            (Some(base_cov), Some(corr)) if base_cov.as_array().dim() == corr.dim() => {
2717                let mut corrected = base_cov.as_array().clone();
2718                corrected += corr;
2719                gam_linalg::matrix::symmetrize_in_place(&mut corrected);
2720                Some(corrected)
2721            }
2722            (Some(_), Some(corr)) => {
2723                log::warn!(
2724                    "Skipping corrected covariance: dimension mismatch (base {:?}, corr {:?})",
2725                    beta_covariance.as_ref().map(|c| c.as_array().dim()),
2726                    Some(corr.dim())
2727                );
2728                None
2729            }
2730            _ => None,
2731        };
2732        beta_standard_errors_corrected = beta_covariance_corrected.as_ref().map(se_from_covariance);
2733    }
2734    let inference = opts.compute_inference.then(|| FitInference {
2735        edf_by_block,
2736        penalty_block_trace,
2737        edf_total,
2738        smoothing_correction,
2739        penalized_hessian: penalized_hessian.into(),
2740        working_weights: pirls_res.solveweights.clone(),
2741        working_response: pirls_res.solveworking_response.clone(),
2742        reparam_qs: Some(pirls_res.reparam_result.qs.clone()),
2743        dispersion,
2744        beta_covariance,
2745        beta_standard_errors,
2746        beta_covariance_corrected,
2747        beta_standard_errors_corrected,
2748        beta_covariance_frequentist,
2749        coefficient_influence,
2750        weighted_gram,
2751        bias_correction_beta,
2752    });
2753
2754    let pirls_status = pirls_res.status;
2755    let likelihood_scale_field = pirls_res.likelihood.scale;
2756    let log_likelihood = crate::pirls::calculate_loglikelihood_omitting_constants(
2757        y_o.view(),
2758        &pirls_res.finalmu,
2759        &pirls_res.likelihood,
2760        w_o.view(),
2761    );
2762
2763    // Report the fitted Negative-Binomial overdispersion `theta` on the family
2764    // variant (issue #802). Unlike the Gamma shape / Tweedie φ (which live only
2765    // in `likelihood_scale`) and the Beta φ (whose estimate downstream consumers
2766    // read from `likelihood_scale` via a separate override), NB `theta` is the
2767    // *canonical* parameter on `ResponseFamily::NegativeBinomial { theta }` that
2768    // every NB predictive consumer (prediction-interval variance, quadrature,
2769    // sampling, `generate` draws) reads directly off the saved family. The fit
2770    // updated it in lock-step with the `EstimatedNegBinTheta` scale metadata via
2771    // `with_negbin_theta`, so threading that fitted `theta` back onto the reported
2772    // family is what makes those consumers see the data's overdispersion instead
2773    // of the seed. Non-NB families keep `opts.family` (their estimates live in the
2774    // scale metadata), preserving the existing seed-in-family convention.
2775    let mut reported_family = opts.family.clone();
2776    if let (
2777        ResponseFamily::NegativeBinomial { theta, .. },
2778        LikelihoodScaleMetadata::EstimatedNegBinTheta {
2779            theta: fitted_theta,
2780        },
2781    ) = (&mut reported_family.response, likelihood_scale_field)
2782    {
2783        *theta = fitted_theta;
2784    }
2785
2786    let result = ExternalOptimResult {
2787        beta: beta_orig_internal,
2788        lambdas: lambdas.to_owned(),
2789        likelihood_family: reported_family,
2790        likelihood_scale: likelihood_scale_field,
2791        log_likelihood_normalization: LogLikelihoodNormalization::OmittingResponseConstants,
2792        log_likelihood,
2793        standard_deviation,
2794        iterations: iters,
2795        finalgrad_norm,
2796        outer_converged: outer_result.converged,
2797        pirls_status,
2798        deviance: pirls_res.deviance,
2799        stable_penalty_term: pirls_res.stable_penalty_term,
2800        used_device: pirls_res.used_device,
2801        max_abs_eta: pirls_res.max_abs_eta,
2802        constraint_kkt: pirls_res.constraint_kkt.clone(),
2803        artifacts: FitArtifacts {
2804            pirls: Some(pirls_res),
2805            criterion_certificate: outer_result.criterion_certificate.clone(),
2806            rho_posterior_certificate,
2807            rho_posterior_escalation,
2808            rho_covariance,
2809            ..Default::default()
2810        },
2811        inference,
2812        reml_score: outer_result.final_value,
2813        outer_cost_evals: usize::try_from(*reml_state.arena.cost_eval_count.read().unwrap())
2814            .unwrap_or(usize::MAX),
2815        inner_pirls_solves: usize::try_from(
2816            reml_state
2817                .arena
2818                .inner_pirls_solve_count
2819                .load(std::sync::atomic::Ordering::Relaxed),
2820        )
2821        .unwrap_or(usize::MAX),
2822        fitted_link: if let Some(state) = final_mixture_state {
2823            FittedLinkState::Mixture {
2824                state,
2825                covariance: final_mixture_param_covariance,
2826            }
2827        } else if let Some(state) = opts.latent_cloglog {
2828            FittedLinkState::LatentCLogLog { state }
2829        } else if let Some(state) = final_sas_state {
2830            if opts.family.is_binomial_sas() {
2831                FittedLinkState::Sas {
2832                    state,
2833                    covariance: final_sas_param_covariance,
2834                }
2835            } else if opts.family.is_binomial_beta_logistic() {
2836                FittedLinkState::BetaLogistic {
2837                    state,
2838                    covariance: final_sas_param_covariance,
2839                }
2840            } else {
2841                FittedLinkState::Standard(None)
2842            }
2843        } else {
2844            FittedLinkState::Standard(None)
2845        },
2846    };
2847    Ok(conditioning.backtransform_external_result(result))
2848}
2849
2850#[cfg(test)]
2851mod blended_mixture_link_solve_tests {
2852    //! Regression coverage for issue #1598.
2853    //!
2854    //! A `blended(logit, probit)` learnable link NESTS each pure component
2855    //! (mixing weight 1 on one inverse link). On data generated from a plain
2856    //! logit link, the joint (smoothing + mixing-weight ρ) solve must therefore
2857    //! converge and recover a near-logit fit — at the ρ=0 seed the mixture
2858    //! collapses to its well-conditioned first component (logit).
2859    //!
2860    //! Before the fix the inner P-IRLS observed-Hessian curvature build aborted
2861    //! with "observed Hessian curvature is not positive finite at row N" the
2862    //! moment a single row's residual-dependent observed weight went
2863    //! non-positive (which the non-canonical mixture path produces on finite
2864    //! data even at the logit-collapsed seed). That abort propagated up as
2865    //! "no candidate seeds passed outer startup validation (mixture/SAS flexible
2866    //! link)" and the whole fit failed. The downstream solver weight floor
2867    //! (`solver_hessian_weights_into`) is designed to clamp exactly those rows
2868    //! to keep the Newton system SPD, so the array build must tolerate a finite
2869    //! non-positive observed weight rather than hard-bail on it.
2870
2871    use super::optimize_external_design;
2872    use crate::estimate::external_options::ExternalOptimOptions;
2873    use gam_problem::{
2874        InverseLink, LikelihoodSpec, LinkComponent, MixtureLinkSpec, ResponseFamily, StandardLink,
2875    };
2876    use gam_terms::smooth::BlockwisePenalty;
2877    use ndarray::{Array1, Array2};
2878
2879    #[test]
2880    fn blended_logit_probit_fits_clean_logit_data() {
2881        // --- Synthetic clean logit data ---------------------------------
2882        // y_i ~ Bernoulli(logistic(eta_i)), eta_i = b0 + b1 * x_i.
2883        let n = 120usize;
2884        let b0 = -0.3_f64;
2885        let b1 = 1.7_f64;
2886        // Deterministic pseudo-random x and Bernoulli draws so the test is
2887        // reproducible without an RNG dependency.
2888        let mut x = Array1::<f64>::zeros(n);
2889        let mut y = Array1::<f64>::zeros(n);
2890        let mut true_p = Array1::<f64>::zeros(n);
2891        let mut seed = 0x9E3779B97F4A7C15u64;
2892        let mut next_unit = || {
2893            // SplitMix64 → uniform in [0, 1).
2894            seed = seed.wrapping_add(0x9E3779B97F4A7C15);
2895            let mut z = seed;
2896            z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
2897            z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
2898            z ^= z >> 31;
2899            (z >> 11) as f64 / (1u64 << 53) as f64
2900        };
2901        for i in 0..n {
2902            let xi = -2.5 + 5.0 * next_unit();
2903            let eta = b0 + b1 * xi;
2904            let p = 1.0 / (1.0 + (-eta).exp());
2905            x[i] = xi;
2906            true_p[i] = p;
2907            y[i] = if next_unit() < p { 1.0 } else { 0.0 };
2908        }
2909
2910        // Design with intercept + slope, both unpenalized (zero penalty block).
2911        let mut design = Array2::<f64>::zeros((n, 2));
2912        for i in 0..n {
2913            design[[i, 0]] = 1.0;
2914            design[[i, 1]] = x[i];
2915        }
2916        let w = Array1::<f64>::ones(n);
2917        let offset = Array1::<f64>::zeros(n);
2918        let penalty = BlockwisePenalty::new(0..2, Array2::<f64>::zeros((2, 2)));
2919
2920        // blended(logit, probit) with the documented ρ=0 (pure first-component)
2921        // seed: mixing weight 1 on logit, 0 on probit.
2922        let mix_spec = MixtureLinkSpec {
2923            components: vec![LinkComponent::Logit, LinkComponent::Probit],
2924            initial_rho: Array1::zeros(1),
2925        };
2926
2927        let opts = ExternalOptimOptions {
2928            family: LikelihoodSpec::new(
2929                ResponseFamily::Binomial,
2930                InverseLink::Standard(StandardLink::Logit),
2931            ),
2932            latent_cloglog: None,
2933            mixture_link: Some(mix_spec),
2934            optimize_mixture: true,
2935            sas_link: None,
2936            optimize_sas: false,
2937            compute_inference: false,
2938            skip_rho_posterior_inference: true,
2939            max_iter: 200,
2940            tol: 1e-9,
2941            nullspace_dims: vec![2],
2942            linear_constraints: None,
2943            firth_bias_reduction: None,
2944            penalty_shrinkage_floor: None,
2945            rho_prior: Default::default(),
2946            kronecker_penalty_system: None,
2947            kronecker_factored: None,
2948            persist_warm_start_disk: false,
2949        };
2950
2951        // The joint mixture solve must converge (no error) on data its own pure
2952        // logit component fits trivially.
2953        let result = optimize_external_design(
2954            y.view(),
2955            w.view(),
2956            design.clone(),
2957            offset.view(),
2958            vec![penalty],
2959            &opts,
2960        )
2961        .expect("blended(logit, probit) joint solve must converge on clean logit data");
2962
2963        // Reconstruct the fitted linear predictor η̂ = X·β̂ and correlate it
2964        // with the true probabilities. The mixture inverse link is monotone, so
2965        // a faithful fit makes corr(η̂, true_p) ≈ 1.
2966        assert_eq!(result.beta.len(), 2, "fitted β has intercept + slope");
2967        let mut eta_hat = Array1::<f64>::zeros(n);
2968        for i in 0..n {
2969            eta_hat[i] = design[[i, 0]] * result.beta[0] + design[[i, 1]] * result.beta[1];
2970        }
2971        let corr = pearson(&eta_hat, &true_p);
2972        assert!(
2973            corr > 0.95,
2974            "blended(logit, probit) on clean logit data must recover a near-logit \
2975             fit: corr(η̂, true_p)={corr:.4} (β̂={:?}, λ̂={:?})",
2976            result.beta.as_slice().unwrap(),
2977            result.lambdas.as_slice().unwrap(),
2978        );
2979        // The recovered slope must keep the sign and order of magnitude of the
2980        // generating coefficient (it is not penalized).
2981        assert!(
2982            result.beta[1] > 0.5,
2983            "recovered slope must stay strongly positive: β̂₁={}",
2984            result.beta[1]
2985        );
2986    }
2987
2988    fn pearson(a: &Array1<f64>, b: &Array1<f64>) -> f64 {
2989        let n = a.len() as f64;
2990        let ma = a.sum() / n;
2991        let mb = b.sum() / n;
2992        let mut cov = 0.0;
2993        let mut va = 0.0;
2994        let mut vb = 0.0;
2995        for i in 0..a.len() {
2996            let da = a[i] - ma;
2997            let db = b[i] - mb;
2998            cov += da * db;
2999            va += da * da;
3000            vb += db * db;
3001        }
3002        cov / (va.sqrt() * vb.sqrt())
3003    }
3004}