// gam_models/fit_orchestration/drivers/spatial_optimization.rs

1fn try_build_spatial_term_log_kappa_derivative(
2    data: ArrayView2<'_, f64>,
3    resolvedspec: &TermCollectionSpec,
4    design: &TermCollectionDesign,
5    term_idx: usize,
6) -> Result<
7    Option<(
8        Range<usize>,
9        usize,
10        Array2<f64>,
11        Array2<f64>,
12        Array2<f64>,
13        Array2<f64>,
14        Vec<Array2<f64>>,
15        Vec<Array2<f64>>,
16        Option<std::sync::Arc<gam_terms::basis::ImplicitDesignPsiDerivative>>,
17    )>,
18    EstimationError,
19> {
20    let Some(smooth_term) = design.smooth.terms.get(term_idx) else {
21        return Ok(None);
22    };
23    let Some(termspec) = resolvedspec.smooth_terms.get(term_idx) else {
24        return Ok(None);
25    };
26
27    let derivative_bundle = match &termspec.basis {
28        SmoothBasisSpec::ThinPlate {
29            feature_cols,
30            spec,
31            input_scales,
32        } => {
33            let mut x = select_columns(data, feature_cols).map_err(EstimationError::from)?;
34            let mut spec_local = spec.clone();
35            if let Some(s) = input_scales {
36                apply_input_standardization(&mut x, s);
37                spec_local.length_scale =
38                    compensate_length_scale_for_standardization(spec.length_scale, s);
39            }
40            build_thin_plate_basis_log_kappa_derivatives(x.view(), &spec_local)
41                .map_err(EstimationError::from)?
42        }
43        SmoothBasisSpec::Sphere { .. } => return Ok(None),
44        // Constant-curvature smooths expose κ as one signed, design-moving
45        // outer ψ-coordinate (#944 stage 3 final wiring). Unlike the Matérn /
46        // Duchon / TPS kernels — whose ψ-coordinate is `log κ = −log ℓ` — the
47        // constant-curvature ψ-coordinate is the **raw curvature κ itself**, so
48        // κ = 0 stays an interior point of the `S^d ← ℝ^d → H^d` family. The
49        // bundle therefore carries `∂·/∂κ` / `∂²·/∂κ²` directly, and the chart
50        // coordinates are consumed verbatim (no input standardization — the
51        // gauge `1 + κ‖x‖²` defines what κ means; see the basis builder).
52        SmoothBasisSpec::ConstantCurvature { feature_cols, spec } => {
53            let x = select_columns(data, feature_cols).map_err(EstimationError::from)?;
54            build_constant_curvature_basis_kappa_derivatives(x.view(), spec)
55                .map_err(EstimationError::from)?
56        }
57        // Measure-jet routes through the GROUPED dial builder
58        // (`try_build_spatial_term_log_kappa_aniso_derivativeinfos`):
59        // `spatial_term_uses_per_axis_psi` is true for every enrolled
60        // measure-jet term, so this isotropic path only sees unenrolled
61        // terms (`measure_jet_enrolls_psi` = false), which expose no ψ bundle.
62        SmoothBasisSpec::MeasureJet { .. } => return Ok(None),
63        SmoothBasisSpec::Matern {
64            feature_cols,
65            spec,
66            input_scales,
67        } => {
68            let mut x = select_columns(data, feature_cols).map_err(EstimationError::from)?;
69            let mut spec_local = spec.clone();
70            if let Some(s) = input_scales {
71                apply_input_standardization(&mut x, s);
72                spec_local.length_scale =
73                    compensate_length_scale_for_standardization(spec.length_scale, s);
74            }
75            build_matern_basis_log_kappa_derivatives(x.view(), &spec_local)
76                .map_err(EstimationError::from)?
77        }
78        SmoothBasisSpec::Duchon {
79            feature_cols,
80            spec,
81            input_scales,
82        } => {
83            let mut x = select_columns(data, feature_cols).map_err(EstimationError::from)?;
84            let mut spec_local = spec.clone();
85            if let Some(s) = input_scales {
86                apply_input_standardization(&mut x, s);
87                spec_local.length_scale =
88                    compensate_optional_length_scale_for_standardization(spec.length_scale, s);
89            }
90            let BasisMetadata::Duchon {
91                centers,
92                identifiability_transform,
93                operator_collocation_points,
94                radial_reparam,
95                ..
96            } = &smooth_term.metadata
97            else {
98                return Ok(None);
99            };
100            // #1355: replay the frozen data-metric reparam into the derivative
101            // spec so the ψ-derivative arms assemble in the rotated radial basis.
102            if spec_local.radial_reparam.is_none() {
103                spec_local.radial_reparam = radial_reparam.clone();
104            }
105            gam_terms::basis::build_duchon_basis_log_kappa_derivativeswith_collocationwithworkspace(
106                x.view(),
107                &spec_local,
108                centers.view(),
109                identifiability_transform.as_ref(),
110                operator_collocation_points
111                    .as_ref()
112                    .map(|points| points.view()),
113                &mut BasisWorkspace::default(),
114            )
115            .map_err(EstimationError::from)?
116        }
117        SmoothBasisSpec::BSpline1D { .. }
118        | SmoothBasisSpec::TensorBSpline { .. }
119        | SmoothBasisSpec::ByVariable { .. }
120        | SmoothBasisSpec::FactorSumToZero { .. }
121        | SmoothBasisSpec::BySmooth { .. }
122        | SmoothBasisSpec::FactorSmooth { .. }
123        | SmoothBasisSpec::Pca { .. } => {
124            return Ok(None);
125        }
126    };
127    let mut implicit_operator = derivative_bundle.implicit_operator;
128    let BasisPsiDerivativeResult {
129        design_derivative: mut local_x_psi,
130        penalties_derivative: mut local_s_psi,
131        implicit_operator: local_implicit_first_unused,
132    } = derivative_bundle.first;
133    let BasisPsiSecondDerivativeResult {
134        designsecond_derivative: mut local_x_psi_psi,
135        penaltiessecond_derivative: mut local_s_psi_psi,
136        implicit_operator: local_implicit_second_unused,
137    } = derivative_bundle.second;
138    assert!(local_implicit_first_unused.is_none());
139    assert!(local_implicit_second_unused.is_none());
140
141    if let Some(rotation) = smooth_term.joint_null_rotation.as_ref() {
142        let q = &rotation.rotation;
143        if let Some(op) = implicit_operator.take() {
144            implicit_operator = Some(op.append_full_transform(q).map_err(EstimationError::from)?);
145        } else {
146            if local_x_psi.ncols() != q.nrows() || local_x_psi_psi.ncols() != q.nrows() {
147                return Ok(None);
148            }
149            local_x_psi = fast_ab(&local_x_psi, q);
150            local_x_psi_psi = fast_ab(&local_x_psi_psi, q);
151        }
152        let rotate_penalty = |s_local: Array2<f64>| -> Option<Array2<f64>> {
153            if s_local.nrows() != q.nrows() || s_local.ncols() != q.nrows() {
154                return None;
155            }
156            let qt_s = gam_linalg::faer_ndarray::fast_atb(q, &s_local);
157            Some(gam_linalg::faer_ndarray::fast_ab(&qt_s, q))
158        };
159        let Some(rotated_s_psi) = local_s_psi
160            .into_iter()
161            .map(|s| rotate_penalty(s))
162            .collect::<Option<Vec<_>>>()
163        else {
164            return Ok(None);
165        };
166        local_s_psi = rotated_s_psi;
167        let Some(rotated_s_psi_psi) = local_s_psi_psi
168            .into_iter()
169            .map(|s| rotate_penalty(s))
170            .collect::<Option<Vec<_>>>()
171        else {
172            return Ok(None);
173        };
174        local_s_psi_psi = rotated_s_psi_psi;
175    }
176    let implicit_operator = implicit_operator.map(std::sync::Arc::new);
177
178    if let Some(ref op) = implicit_operator {
179        if op.p_out() != smooth_term.coeff_range.len() {
180            return Ok(None);
181        }
182    } else {
183        if local_x_psi.ncols() != smooth_term.coeff_range.len() {
184            return Ok(None);
185        }
186        if local_x_psi_psi.ncols() != smooth_term.coeff_range.len() {
187            return Ok(None);
188        }
189    }
190    if local_s_psi.is_empty() || local_s_psi.len() != local_s_psi_psi.len() {
191        return Ok(None);
192    }
193    if local_s_psi.iter().any(|s| {
194        s.nrows() != smooth_term.coeff_range.len() || s.ncols() != smooth_term.coeff_range.len()
195    }) {
196        return Ok(None);
197    }
198    if local_s_psi_psi.iter().any(|s| {
199        s.nrows() != smooth_term.coeff_range.len() || s.ncols() != smooth_term.coeff_range.len()
200    }) {
201        return Ok(None);
202    }
203
204    let p_total = design.design.ncols();
205    let smooth_start = p_total.saturating_sub(design.smooth.total_smooth_cols());
206    let global_range = (smooth_start + smooth_term.coeff_range.start)
207        ..(smooth_start + smooth_term.coeff_range.end);
208
209    Ok(Some((
210        global_range,
211        p_total,
212        local_x_psi,
213        local_s_psi.iter().fold(
214            Array2::<f64>::zeros((smooth_term.coeff_range.len(), smooth_term.coeff_range.len())),
215            |acc, m| acc + m,
216        ),
217        local_x_psi_psi,
218        local_s_psi_psi.iter().fold(
219            Array2::<f64>::zeros((smooth_term.coeff_range.len(), smooth_term.coeff_range.len())),
220            |acc, m| acc + m,
221        ),
222        local_s_psi,
223        local_s_psi_psi,
224        implicit_operator,
225    )))
226}
227
228fn try_build_spatial_log_kappa_hyper_dirs(
229    data: ArrayView2<'_, f64>,
230    resolvedspec: &TermCollectionSpec,
231    design: &TermCollectionDesign,
232    spatial_terms: &[usize],
233) -> Result<Option<Vec<DirectionalHyperParam>>, EstimationError> {
234    // Each spatial term contributes one continuous scale hyperparameter
235    //   psi = log(kappa) = -log(length_scale),
236    // while rho = log(lambda) still indexes the smoothing parameters of the
237    // three operator penalties. The joint outer vector is therefore
238    //   theta = (rho_0, ..., rho_{K-1}, psi_1, ..., psi_q)
239    // for q spatial terms participating in exact joint optimization.
240    let Some(info_list) =
241        try_build_spatial_log_kappa_derivativeinfo_list(data, resolvedspec, design, spatial_terms)?
242    else {
243        return Ok(None);
244    };
245    Ok(Some(spatial_log_kappa_hyper_dirs_frominfo_list(info_list)?))
246}
247
248pub(crate) fn try_build_latent_coord_hyper_dirs(
249    latent: std::sync::Arc<gam_terms::latent::LatentCoordValues>,
250    resolvedspec: &TermCollectionSpec,
251    design: &TermCollectionDesign,
252    latent_terms: &[gam_problem::types::SmoothTermIdx],
253    analytic_rho_count: usize,
254) -> Result<Option<Vec<DirectionalHyperParam>>, EstimationError> {
255    if latent_terms.is_empty() || latent.is_empty() {
256        return Ok(None);
257    }
258    if latent_terms.len() != 1 {
259        crate::bail_invalid_estim!(
260            "LatentCoord standard-fit hyper_dirs currently require exactly one latent smooth term"
261                .to_string(),
262        );
263    }
264    let term_idx = latent_terms[0];
265    let smooth_term = design.smooth.terms.get(term_idx.get()).ok_or_else(|| {
266        EstimationError::InvalidInput(format!(
267            "LatentCoord term index {term_idx} out of bounds for realized smooth design"
268        ))
269    })?;
270    let termspec = resolvedspec
271        .smooth_terms
272        .get(term_idx.get())
273        .ok_or_else(|| {
274            EstimationError::InvalidInput(format!(
275                "LatentCoord term index {term_idx} out of bounds for resolved smooth spec"
276            ))
277        })?;
278    let p_total = design.design.ncols();
279    let smooth_start = p_total.saturating_sub(design.smooth.total_smooth_cols());
280    let global_range = (smooth_start + smooth_term.coeff_range.start)
281        ..(smooth_start + smooth_term.coeff_range.end);
282
283    // Spline bases do not add a separate continuous basis-scale ψ coordinate
284    // here. When they are latent-coordinate terms, their ψ directions are the
285    // latent-coordinate axes below, using the same DirectionalHyperParam layout
286    // as Matérn and Duchon.
287    let operator = match (&termspec.basis, &smooth_term.metadata) {
288        (
289            SmoothBasisSpec::Matern { .. },
290            BasisMetadata::Matern {
291                centers,
292                length_scale,
293                nu,
294                include_intercept,
295                identifiability_transform,
296                ..
297            },
298        ) => gam_terms::basis::LatentCoordDesignDerivative::new_matern(
299            latent.clone(),
300            std::sync::Arc::new(centers.clone()),
301            *length_scale,
302            *nu,
303            *include_intercept,
304            identifiability_transform.clone(),
305        )
306        .map_err(EstimationError::from)?,
307        (
308            SmoothBasisSpec::Duchon { .. },
309            BasisMetadata::Duchon {
310                centers,
311                length_scale,
312                power,
313                nullspace_order,
314                identifiability_transform,
315                ..
316            },
317        ) => gam_terms::basis::LatentCoordDesignDerivative::new_duchon(
318            latent.clone(),
319            std::sync::Arc::new(centers.clone()),
320            *length_scale,
321            *power,
322            *nullspace_order,
323            identifiability_transform.clone(),
324        )
325        .map_err(EstimationError::from)?,
326        (
327            SmoothBasisSpec::Sphere { .. },
328            BasisMetadata::Sphere {
329                centers,
330                penalty_order,
331                method,
332                constraint_transform,
333                ..
334            },
335        ) if matches!(*method, gam_terms::basis::SphereMethod::Wahba) => {
336            gam_terms::basis::LatentCoordDesignDerivative::new_sphere(
337                latent.clone(),
338                std::sync::Arc::new(centers.clone()),
339                *penalty_order,
340                constraint_transform.clone(),
341            )
342            .map_err(EstimationError::from)?
343        }
344        (
345            SmoothBasisSpec::BSpline1D { spec, .. },
346            BasisMetadata::BSpline1D {
347                knots,
348                identifiability_transform,
349                periodic,
350                degree: meta_degree,
351                ..
352            },
353        ) => {
354            // Issue #340: use the metadata-recorded effective degree so the
355            // latent-design Jacobian matches what `build_bspline_basis_1d`
356            // actually built at fit time after auto-shrink.
357            let effective_degree = meta_degree.unwrap_or(spec.degree);
358            if let Some((domain_start, period, num_basis)) = periodic {
359                gam_terms::basis::LatentCoordDesignDerivative::new_periodic_bspline(
360                    latent.clone(),
361                    (*domain_start, *domain_start + *period),
362                    effective_degree,
363                    *num_basis,
364                    identifiability_transform.clone(),
365                )
366                .map_err(EstimationError::from)?
367            } else {
368                gam_terms::basis::LatentCoordDesignDerivative::new_tensor_bspline(
369                    latent.clone(),
370                    vec![knots.clone()],
371                    vec![effective_degree],
372                    identifiability_transform.clone(),
373                )
374                .map_err(EstimationError::from)?
375            }
376        }
377        (
378            SmoothBasisSpec::TensorBSpline { .. },
379            BasisMetadata::TensorBSpline {
380                knots,
381                degrees,
382                identifiability_transform,
383                ..
384            },
385        ) => gam_terms::basis::LatentCoordDesignDerivative::new_tensor_bspline(
386            latent.clone(),
387            knots.clone(),
388            degrees.clone(),
389            identifiability_transform.clone(),
390        )
391        .map_err(EstimationError::from)?,
392        (SmoothBasisSpec::Pca { .. }, BasisMetadata::Pca { basis_matrix, .. }) => {
393            gam_terms::basis::LatentCoordDesignDerivative::new_pca(
394                latent.clone(),
395                std::sync::Arc::new(basis_matrix.clone()),
396            )
397            .map_err(EstimationError::from)?
398        }
399        _ => return Ok(None),
400    };
401    if operator.p_out() != global_range.len() {
402        crate::bail_invalid_estim!(
403            "LatentCoord derivative width mismatch for term '{}': operator p={}, coeff range={}",
404            smooth_term.name,
405            operator.p_out(),
406            global_range.len()
407        );
408    }
409    let operator = std::sync::Arc::new(operator);
410    let mut hyper_dirs = Vec::with_capacity(operator.n_axes());
411    for flat_axis in 0..operator.n_axes() {
412        let dir = DirectionalHyperParam::new_compact(
413            gam_solve::estimate::reml::HyperDesignDerivative::from_latent_coord(
414                operator.clone(),
415                flat_axis,
416                global_range.clone(),
417                p_total,
418            ),
419            Vec::new(),
420            None,
421            None,
422        )?
423        .not_penalty_like();
424        hyper_dirs.push(dir);
425    }
426    let direct_dim = latent_coord_direct_hyper_count(latent.id_mode(), latent.latent_dim());
427    if analytic_rho_count + direct_dim > 0 {
428        let zero_x = gam_solve::estimate::reml::HyperDesignDerivative::from(Array2::<f64>::zeros((
429            design.design.nrows(),
430            p_total,
431        )));
432        for _ in 0..analytic_rho_count {
433            hyper_dirs.push(
434                DirectionalHyperParam::new_compact(zero_x.clone(), Vec::new(), None, None)?
435                    .not_penalty_like(),
436            );
437        }
438        for _ in 0..direct_dim {
439            hyper_dirs.push(
440                DirectionalHyperParam::new_compact(zero_x.clone(), Vec::new(), None, None)?
441                    .not_penalty_like(),
442            );
443        }
444    }
445    Ok(Some(hyper_dirs))
446}
447
448fn latent_coord_direct_hyper_count(
449    id_mode: &gam_terms::latent::LatentIdMode,
450    latent_dim: usize,
451) -> usize {
452    use gam_terms::latent::{AuxPriorStrength, LatentIdMode};
453    match id_mode {
454        LatentIdMode::AuxPrior { strength, .. } => match strength {
455            AuxPriorStrength::Auto => 1,
456            AuxPriorStrength::Fixed(_) => 0,
457        },
458        LatentIdMode::AuxPriorDimSelection { strength, .. } => {
459            latent_dim
460                + match strength {
461                    AuxPriorStrength::Auto => 1,
462                    AuxPriorStrength::Fixed(_) => 0,
463                }
464        }
465        LatentIdMode::DimSelection { .. } => latent_dim,
466        // A fixed-reference anchor carries at most the REML-selectable log-`μ`
467        // (one direct hyper when `Auto`, none when `Fixed`), like `AuxPrior`.
468        LatentIdMode::IsometryToReference { strength, .. } => match strength {
469            AuxPriorStrength::Auto => 1,
470            AuxPriorStrength::Fixed(_) => 0,
471        },
472        // The behavioral head appends one (1 + d) coefficient block per
473        // η-channel, plus the composed per-axis ARD log-precisions.
474        LatentIdMode::AuxOutcome { head, .. } => head.n_coeffs(latent_dim) + latent_dim,
475        LatentIdMode::None => 0,
476    }
477}
478
479fn latent_coord_initial_direct_hypers(
480    id_mode: &gam_terms::latent::LatentIdMode,
481    latent_dim: usize,
482) -> Result<Array1<f64>, EstimationError> {
483    use gam_terms::latent::{AuxPriorStrength, LatentIdMode};
484    let mut values = Vec::with_capacity(latent_coord_direct_hyper_count(id_mode, latent_dim));
485    match id_mode {
486        LatentIdMode::AuxPrior { strength, .. } => {
487            if matches!(strength, AuxPriorStrength::Auto) {
488                values.push(0.0);
489            }
490        }
491        LatentIdMode::AuxPriorDimSelection {
492            strength,
493            init_log_precision,
494            ..
495        } => {
496            if matches!(strength, AuxPriorStrength::Auto) {
497                values.push(0.0);
498            }
499            append_latent_ard_seed(&mut values, init_log_precision.as_ref(), latent_dim)?;
500        }
501        LatentIdMode::DimSelection { init_log_precision } => {
502            append_latent_ard_seed(&mut values, init_log_precision.as_ref(), latent_dim)?;
503        }
504        LatentIdMode::IsometryToReference { strength, .. } => {
505            if matches!(strength, AuxPriorStrength::Auto) {
506                values.push(0.0);
507            }
508        }
509        LatentIdMode::AuxOutcome {
510            head,
511            init_log_precision,
512        } => {
513            // Head coefficients seed at zero: intercept 0 ⇒ baseline rate, all
514            // loadings 0 ⇒ no behavioral anchoring at start (REML/Newton move
515            // them). One (1 + d) block per η-channel.
516            values.extend(std::iter::repeat_n(0.0, head.n_coeffs(latent_dim)));
517            append_latent_ard_seed(&mut values, init_log_precision.as_ref(), latent_dim)?;
518        }
519        LatentIdMode::None => {}
520    }
521    Ok(Array1::from_vec(values))
522}
523
524fn append_latent_ard_seed(
525    values: &mut Vec<f64>,
526    init: Option<&Array1<f64>>,
527    latent_dim: usize,
528) -> Result<(), EstimationError> {
529    if let Some(init) = init {
530        if init.len() != latent_dim {
531            crate::bail_invalid_estim!(
532                "latent dim_selection init_log_precision length mismatch: got {}, expected {}",
533                init.len(),
534                latent_dim
535            );
536        }
537        values.extend(init.iter().copied());
538    } else {
539        values.extend(std::iter::repeat_n(0.0, latent_dim));
540    }
541    Ok(())
542}
543
/// Additive contribution of one extra objective term to the outer objective:
/// a scalar cost and its gradient.
struct LatentIdObjectiveContribution {
    /// Scalar value to add to the objective.
    cost: f64,
    /// Gradient of `cost`; the producers in this file allocate it with the
    /// same length as the full `theta` vector, with untouched slots left at
    /// zero.
    gradient: Array1<f64>,
}
548
549fn latent_id_objective_contribution(
550    theta: &Array1<f64>,
551    rho_dim: usize,
552    analytic_rho_count: usize,
553    latent: &gam_terms::latent::LatentCoordValues,
554) -> Result<LatentIdObjectiveContribution, EstimationError> {
555    use gam_terms::latent::{AuxPriorStrength, LatentIdMode, aux_prior_targets};
556    let n_obs = latent.n_obs();
557    let latent_dim = latent.latent_dim();
558    let flat_len = latent.len();
559    let mut gradient = Array1::<f64>::zeros(theta.len());
560    let t_start = rho_dim;
561    let direct_start = t_start + flat_len + analytic_rho_count;
562    if theta.len() < direct_start {
563        crate::bail_invalid_estim!(
564            "latent-coordinate theta too short for id objective: got {}, need at least {}",
565            theta.len(),
566            direct_start
567        );
568    }
569    let t = latent.as_matrix();
570    let mut cost = 0.0;
571    let mut cursor = direct_start;
572
573    match latent.id_mode() {
574        LatentIdMode::AuxPrior {
575            u,
576            family,
577            strength,
578        }
579        | LatentIdMode::AuxPriorDimSelection {
580            u,
581            family,
582            strength,
583            ..
584        } => {
585            let (log_mu, mu) = match strength {
586                AuxPriorStrength::Fixed(mu) => (mu.ln(), *mu),
587                AuxPriorStrength::Auto => {
588                    let log_mu = theta[cursor];
589                    cursor += 1;
590                    (log_mu, log_mu.exp())
591                }
592            };
593            let targets = aux_prior_targets(t.view(), u.view(), *family)
594                .map_err(EstimationError::InvalidInput)?;
595            let residual = &t - &targets;
596            let q = residual.iter().map(|v| v * v).sum::<f64>();
597            // The single shared precision `mu` governs every one of the
598            // `n_obs · latent_dim` scalar latent coordinates, so the prior
599            // log-determinant normalizer `−0.5·log det₊(mu · I_K)` counts
600            // `K = n_obs · latent_dim`. (The per-axis ARD path below emits
601            // `−0.5·n_obs·ln(α)` for each of `latent_dim` axes; one shared `mu`
602            // must equal that sum.)
603            let k = (n_obs * latent_dim) as f64;
604            cost += 0.5 * mu * q - 0.5 * k * log_mu;
605
606            let projected_residual = aux_prior_targets(residual.view(), u.view(), *family)
607                .map_err(EstimationError::InvalidInput)?;
608            let grad_base = residual - projected_residual;
609            for n in 0..n_obs {
610                for axis in 0..latent_dim {
611                    gradient[t_start + n * latent_dim + axis] += mu * grad_base[[n, axis]];
612                }
613            }
614            if matches!(strength, AuxPriorStrength::Auto) {
615                gradient[direct_start] += 0.5 * mu * q - 0.5 * k;
616            }
617        }
618        LatentIdMode::IsometryToReference { reference, strength } => {
619            // Fixed-reference anchor `½ μ ‖t − reference‖²` with REML-selectable
620            // `μ`. Identical structure to `AuxPrior` except the target is a
621            // constant configuration (independent of `t`), so the latent
622            // gradient is the plain `μ · (t − reference)` with no projection
623            // term (`AuxPrior` subtracts the projected residual only because its
624            // target `ĥ(u)` depends on `t` through the internal ridge fit).
625            if reference.dim() != (n_obs, latent_dim) {
626                crate::bail_invalid_estim!(
627                    "IsometryToReference reference shape {:?} must equal (n_obs, latent_dim) = ({}, {})",
628                    reference.dim(),
629                    n_obs,
630                    latent_dim
631                );
632            }
633            let mu_slot = cursor;
634            let (log_mu, mu) = match strength {
635                AuxPriorStrength::Fixed(mu) => (mu.ln(), *mu),
636                AuxPriorStrength::Auto => {
637                    let log_mu = theta[cursor];
638                    cursor += 1;
639                    (log_mu, log_mu.exp())
640                }
641            };
642            let residual = &t - reference;
643            let q = residual.iter().map(|v| v * v).sum::<f64>();
644            // Shared precision `mu` over all `K = n_obs · latent_dim` scalar
645            // coordinates: the normalizer `−0.5·log det₊(mu · I_K)` counts `K`,
646            // matching the AuxPrior arm and the ARD path's per-axis sum.
647            let k = (n_obs * latent_dim) as f64;
648            cost += 0.5 * mu * q - 0.5 * k * log_mu;
649            for n in 0..n_obs {
650                for axis in 0..latent_dim {
651                    gradient[t_start + n * latent_dim + axis] += mu * residual[[n, axis]];
652                }
653            }
654            if matches!(strength, AuxPriorStrength::Auto) {
655                gradient[mu_slot] += 0.5 * mu * q - 0.5 * k;
656            }
657        }
658        LatentIdMode::AuxOutcome { head, .. } => {
659            // Behavioral head likelihood channel: the head's design columns are
660            // the live latent codes, so its NLL enters the SAME joint objective
661            // as the reconstruction term and REML balances the two channels.
662            // The head coefficients occupy `head.n_coeffs(d)` direct-hyper slots
663            // starting at `cursor`; their gradient drives the β-tier update and
664            // the head's latent-code gradient flows into the `t` block (the
665            // arrow-Schur cross-channel coupling).
666            let n_coeffs = head.n_coeffs(latent_dim);
667            let coeffs = theta
668                .slice(ndarray::s![cursor..cursor + n_coeffs])
669                .to_owned();
670            let (head_nll, grad_coeffs, grad_t) = head
671                .neg_loglik_and_grad(t.view(), coeffs.view())
672                .map_err(EstimationError::InvalidInput)?;
673            cost += head_nll;
674            for (offset, &g) in grad_coeffs.iter().enumerate() {
675                gradient[cursor + offset] += g;
676            }
677            for n in 0..n_obs {
678                for axis in 0..latent_dim {
679                    gradient[t_start + n * latent_dim + axis] += grad_t[[n, axis]];
680                }
681            }
682            cursor += n_coeffs;
683        }
684        LatentIdMode::DimSelection { .. } | LatentIdMode::None => {}
685    }
686
687    match latent.id_mode() {
688        LatentIdMode::AuxPriorDimSelection { .. }
689        | LatentIdMode::DimSelection { .. }
690        | LatentIdMode::AuxOutcome { .. } => {
691            for axis in 0..latent_dim {
692                let log_alpha = theta[cursor + axis];
693                let alpha = log_alpha.exp();
694                let mut q_axis = 0.0;
695                for n in 0..n_obs {
696                    let flat_idx = n * latent_dim + axis;
697                    let value = latent.as_flat()[flat_idx];
698                    q_axis += value * value;
699                    gradient[t_start + flat_idx] += alpha * value;
700                }
701                cost += 0.5 * alpha * q_axis - 0.5 * n_obs as f64 * log_alpha;
702                gradient[cursor + axis] += 0.5 * alpha * q_axis - 0.5 * n_obs as f64;
703            }
704            cursor += latent_dim;
705        }
706        LatentIdMode::AuxPrior { .. }
707        | LatentIdMode::IsometryToReference { .. }
708        | LatentIdMode::None => {}
709    }
710
711    if cursor != theta.len() {
712        crate::bail_invalid_estim!(
713            "latent-coordinate direct hyperparameter length mismatch: consumed {}, theta len {}",
714            cursor,
715            theta.len()
716        );
717    }
718    Ok(LatentIdObjectiveContribution { cost, gradient })
719}
720
721fn add_latent_id_objective_to_eval(
722    theta: &Array1<f64>,
723    rho_dim: usize,
724    analytic_rho_count: usize,
725    latent: &gam_terms::latent::LatentCoordValues,
726    eval: &mut (
727        f64,
728        Array1<f64>,
729        gam_problem::HessianResult,
730    ),
731) -> Result<(), EstimationError> {
732    let contribution =
733        latent_id_objective_contribution(theta, rho_dim, analytic_rho_count, latent)?;
734    eval.0 += contribution.cost;
735    if eval.1.len() != contribution.gradient.len() {
736        crate::bail_invalid_estim!(
737            "latent-coordinate REML gradient length mismatch: base={}, id={}",
738            eval.1.len(),
739            contribution.gradient.len()
740        );
741    }
742    eval.1 += &contribution.gradient;
743    if eval.2.is_analytic() {
744        eval.2 = gam_problem::HessianResult::Unavailable;
745    }
746    Ok(())
747}
748
749fn analytic_penalty_objective_contribution(
750    theta: &Array1<f64>,
751    rho_dim: usize,
752    latent: &gam_terms::latent::LatentCoordValues,
753    registry: &gam_terms::AnalyticPenaltyRegistry,
754) -> Result<LatentIdObjectiveContribution, EstimationError> {
755    let flat_len = latent.len();
756    let t_start = rho_dim;
757    let t_end = t_start + flat_len;
758    let rho_start = t_end;
759    let rho_end = rho_start + registry.total_rho_count();
760    if theta.len() < rho_end {
761        crate::bail_invalid_estim!(
762            "latent-coordinate theta too short for analytic penalties: got {}, need at least {}",
763            theta.len(),
764            rho_end
765        );
766    }
767    let target_t = theta.slice(s![t_start..t_end]);
768    let rho = theta.slice(s![rho_start..rho_end]);
769    let mut cost = 0.0_f64;
770    let mut gradient = Array1::<f64>::zeros(theta.len());
771    for (penalty, (rho_slice, tier, name)) in registry.penalties.iter().zip(registry.rho_layout()) {
772        let rho_local = rho.slice(s![rho_slice.clone()]);
773        match tier {
774            gam_terms::PenaltyTier::Psi => {
775                cost += penalty.value(target_t.view(), rho_local);
776                let grad = penalty.grad_target(target_t.view(), rho_local);
777                if grad.len() != flat_len {
778                    crate::bail_invalid_estim!(
779                        "analytic penalty {name:?} gradient length mismatch: got {}, expected {}",
780                        grad.len(),
781                        flat_len
782                    );
783                }
784                for i in 0..flat_len {
785                    gradient[t_start + i] += grad[i];
786                }
787                let grad_rho_local = penalty.grad_rho(target_t.view(), rho_local);
788                if grad_rho_local.len() != rho_slice.len() {
789                    crate::bail_invalid_estim!(
790                        "analytic penalty {name:?} rho-gradient length mismatch: got {}, expected {}",
791                        grad_rho_local.len(),
792                        rho_slice.len()
793                    );
794                }
795                for local_idx in 0..grad_rho_local.len() {
796                    gradient[rho_start + rho_slice.start + local_idx] += grad_rho_local[local_idx];
797                }
798            }
799            gam_terms::PenaltyTier::Beta => {}
800            gam_terms::PenaltyTier::Rho => {}
801        }
802    }
803    Ok(LatentIdObjectiveContribution { cost, gradient })
804}
805
806fn add_analytic_penalty_hessian_to_eval(
807    theta: &Array1<f64>,
808    rho_dim: usize,
809    latent: &gam_terms::latent::LatentCoordValues,
810    registry: &gam_terms::AnalyticPenaltyRegistry,
811    eval: &mut (
812        f64,
813        Array1<f64>,
814        gam_problem::HessianResult,
815    ),
816) -> Result<(), EstimationError> {
817    let flat_len = latent.len();
818    let t_start = rho_dim;
819    let t_end = t_start + flat_len;
820    let rho_start = t_end;
821    let rho_end = rho_start + registry.total_rho_count();
822    if theta.len() < rho_end {
823        crate::bail_invalid_estim!(
824            "latent-coordinate theta too short for analytic penalty Hessian: got {}, need at least {}",
825            theta.len(),
826            rho_end
827        );
828    }
829    let gam_problem::HessianResult::Analytic(hessian) = &mut eval.2 else {
830        if eval.2.is_analytic() {
831            eval.2 = gam_problem::HessianResult::Unavailable;
832        }
833        return Ok(());
834    };
835    if hessian.dim() != (theta.len(), theta.len()) {
836        crate::bail_invalid_estim!(
837            "analytic penalty Hessian target shape mismatch: got {}x{}, expected {}x{}",
838            hessian.nrows(),
839            hessian.ncols(),
840            theta.len(),
841            theta.len()
842        );
843    }
844    let target_t = theta.slice(s![t_start..t_end]);
845    let rho = theta.slice(s![rho_start..rho_end]);
846    for (penalty, (rho_slice, tier, _name)) in registry.penalties.iter().zip(registry.rho_layout())
847    {
848        let rho_local = rho.slice(s![rho_slice]);
849        if !matches!(tier, gam_terms::PenaltyTier::Psi) {
850            continue;
851        }
852        if let Some(diag) = penalty.hessian_diag(target_t.view(), rho_local) {
853            if diag.len() != flat_len {
854                crate::bail_invalid_estim!(
855                    "analytic penalty Hessian diagonal length mismatch: got {}, expected {}",
856                    diag.len(),
857                    flat_len
858                );
859            }
860            for i in 0..flat_len {
861                hessian[[t_start + i, t_start + i]] += diag[i];
862            }
863            continue;
864        }
865        let mut probe = Array1::<f64>::zeros(flat_len);
866        for col in 0..flat_len {
867            probe[col] = 1.0;
868            let hv = penalty.hvp(target_t.view(), rho_local, probe.view());
869            if hv.len() != flat_len {
870                crate::bail_invalid_estim!(
871                    "analytic penalty Hessian-vector length mismatch: got {}, expected {}",
872                    hv.len(),
873                    flat_len
874                );
875            }
876            for row in 0..flat_len {
877                hessian[[t_start + row, t_start + col]] += hv[row];
878            }
879            probe[col] = 0.0;
880        }
881    }
882    Ok(())
883}
884
885fn add_analytic_penalty_objective_to_eval(
886    theta: &Array1<f64>,
887    rho_dim: usize,
888    latent: &gam_terms::latent::LatentCoordValues,
889    registry: &gam_terms::AnalyticPenaltyRegistry,
890    eval: &mut (
891        f64,
892        Array1<f64>,
893        gam_problem::HessianResult,
894    ),
895) -> Result<(), EstimationError> {
896    let contribution = analytic_penalty_objective_contribution(theta, rho_dim, latent, registry)?;
897    eval.0 += contribution.cost;
898    if eval.1.len() != contribution.gradient.len() {
899        crate::bail_invalid_estim!(
900            "latent-coordinate REML gradient length mismatch: base={}, analytic_penalty={}",
901            eval.1.len(),
902            contribution.gradient.len()
903        );
904    }
905    eval.1 += &contribution.gradient;
906    add_analytic_penalty_hessian_to_eval(theta, rho_dim, latent, registry, eval)?;
907    Ok(())
908}
909
910fn spatial_log_kappa_hyper_dirs_frominfo_list(
911    info_list: Vec<SpatialPsiDerivative>,
912) -> Result<Vec<DirectionalHyperParam>, EstimationError> {
913    use gam_solve::estimate::reml::ImplicitDerivLevel;
914    use std::collections::HashMap;
915
916    let log_kappa_dim = info_list.len();
917    // Layout-only metadata (group_id per axis) is cheap to snapshot up front so
918    // the consumption loop below can MOVE the dense (n × p) derivative arrays
919    // out of each entry instead of cloning. At large scale (n≈3×10⁵, 16-axis
920    // CTN) the prior `.clone()` sites doubled peak working memory for the
921    // psi-derivative pass through several GiB.
922    let group_ids: Vec<Option<usize>> = info_list.iter().map(|e| e.aniso_group_id).collect();
923    let mut group_indices_map: HashMap<usize, Vec<usize>> = HashMap::new();
924    for (idx, gid) in group_ids.iter().enumerate() {
925        if let Some(g) = gid {
926            group_indices_map.entry(*g).or_default().push(idx);
927        }
928    }
929
930    let mut hyper_dirs = Vec::with_capacity(log_kappa_dim);
931    for (i, info) in info_list.into_iter().enumerate() {
932        let SpatialPsiDerivative {
933            penalty_index: _,
934            penalty_indices,
935            global_range,
936            total_p,
937            x_psi_local,
938            s_psi_components_local,
939            x_psi_psi_local,
940            s_psi_psi_components_local,
941            aniso_group_id,
942            aniso_cross_designs,
943            aniso_cross_penalty_provider,
944            implicit_operator,
945            implicit_axis,
946        } = info;
947
948        let mut xsecond = vec![None; log_kappa_dim];
949        // Diagonal second derivative (same axis).
950        xsecond[i] = Some(if let Some(ref op) = implicit_operator {
951            gam_solve::estimate::reml::HyperDesignDerivative::from_implicit(
952                op.clone(),
953                ImplicitDerivLevel::SecondDiag(implicit_axis),
954                global_range.clone(),
955                total_p,
956            )
957        } else {
958            gam_solve::estimate::reml::HyperDesignDerivative::from_embedded(
959                x_psi_psi_local,
960                global_range.clone(),
961                total_p,
962            )
963        });
964        // Cross second derivatives for axes in the same aniso group.
965        if let Some(cross_designs) = aniso_cross_designs {
966            // Use the base index of this aniso group in the original info_list.
967            // Entries for the same group are contiguous: the first index in the
968            // group gives the base, and axis b is at base+b.
969            if let Some(gid) = aniso_group_id {
970                let base = group_indices_map
971                    .get(&gid)
972                    .and_then(|v| v.first().copied())
973                    .unwrap_or(i);
974                for (b_axis, cross_mat) in cross_designs.into_iter() {
975                    let j = base + b_axis;
976                    if j < log_kappa_dim {
977                        xsecond[j] = Some(if let Some(ref op) = implicit_operator {
978                            gam_solve::estimate::reml::HyperDesignDerivative::from_implicit(
979                                op.clone(),
980                                ImplicitDerivLevel::SecondCross(implicit_axis, b_axis),
981                                global_range.clone(),
982                                total_p,
983                            )
984                        } else {
985                            gam_solve::estimate::reml::HyperDesignDerivative::from_embedded(
986                                cross_mat,
987                                global_range.clone(),
988                                total_p,
989                            )
990                        });
991                    }
992                }
993            }
994        }
995        let s_components = penalty_indices
996            .iter()
997            .copied()
998            .zip(s_psi_components_local.into_iter().map(|local| {
999                gam_solve::estimate::reml::HyperPenaltyDerivative::from_embedded(
1000                    local,
1001                    global_range.clone(),
1002                    total_p,
1003                )
1004            }))
1005            .collect::<Vec<_>>();
1006        let s2_components = penalty_indices
1007            .iter()
1008            .copied()
1009            .zip(s_psi_psi_components_local.into_iter().map(|local| {
1010                gam_solve::estimate::reml::HyperPenaltyDerivative::from_embedded(
1011                    local,
1012                    global_range.clone(),
1013                    total_p,
1014                )
1015            }))
1016            .collect::<Vec<_>>();
1017        let mut ssecond_components = vec![None; log_kappa_dim];
1018        ssecond_components[i] = Some(s2_components);
1019        let mut penaltysecond_partner_indices: Option<Vec<usize>> = None;
1020        let penaltysecond_component_provider =
1021            if let (Some(provider), Some(gid)) = (aniso_cross_penalty_provider, aniso_group_id) {
1022                let group_indices = group_indices_map.get(&gid).cloned().unwrap_or_default();
1023                let axis_in_group =
1024                    group_indices
1025                        .iter()
1026                        .position(|&idx| idx == i)
1027                        .ok_or_else(|| {
1028                            EstimationError::InvalidInput(format!(
1029                                "missing spatial hyper axis {} in anisotropy group {}",
1030                                i, gid
1031                            ))
1032                        })?;
1033                penaltysecond_partner_indices = Some(
1034                    group_indices
1035                        .iter()
1036                        .copied()
1037                        .filter(|&idx| idx != i)
1038                        .collect(),
1039                );
1040                let penalty_indices_inner = penalty_indices.clone();
1041                let global_range_inner = global_range.clone();
1042                let total_p_inner = total_p;
1043                let group_indices_inner = group_indices;
1044                Some(std::sync::Arc::new(
1045                    move |j: usize| -> Result<
1046                        Option<Vec<gam_solve::estimate::reml::PenaltyDerivativeComponent>>,
1047                        EstimationError,
1048                    > {
1049                        let Some(other_axis_in_group) =
1050                            group_indices_inner.iter().position(|&idx| idx == j)
1051                        else {
1052                            return Ok(None);
1053                        };
1054                        if other_axis_in_group == axis_in_group {
1055                            return Ok(None);
1056                        }
1057                        let cross_pens = provider(other_axis_in_group)?;
1058                        if cross_pens.is_empty() {
1059                            return Ok(None);
1060                        }
1061                        Ok(Some(
1062                            penalty_indices_inner
1063                                .iter()
1064                                .copied()
1065                                .zip(cross_pens.into_iter().map(|local| {
1066                                    gam_solve::estimate::reml::HyperPenaltyDerivative::from_embedded(
1067                                        local,
1068                                        global_range_inner.clone(),
1069                                        total_p_inner,
1070                                    )
1071                                }))
1072                                .map(|(penalty_index, matrix)| {
1073                                    gam_solve::estimate::reml::PenaltyDerivativeComponent {
1074                                        penalty_index,
1075                                        matrix,
1076                                    }
1077                                })
1078                                .collect(),
1079                        ))
1080                    },
1081                )
1082                    as std::sync::Arc<
1083                        dyn Fn(
1084                                usize,
1085                            ) -> Result<
1086                                Option<Vec<gam_solve::estimate::reml::PenaltyDerivativeComponent>>,
1087                                EstimationError,
1088                            > + Send
1089                            + Sync
1090                            + 'static,
1091                    >)
1092            } else {
1093                None
1094            };
1095        // First derivative: use implicit operator when available to avoid
1096        // storing dense (n x p) matrices for all D axes simultaneously.
1097        let x_first_hyper = if let Some(ref op) = implicit_operator {
1098            gam_solve::estimate::reml::HyperDesignDerivative::from_implicit(
1099                op.clone(),
1100                ImplicitDerivLevel::First(implicit_axis),
1101                global_range.clone(),
1102                total_p,
1103            )
1104        } else {
1105            gam_solve::estimate::reml::HyperDesignDerivative::from_embedded(
1106                x_psi_local,
1107                global_range.clone(),
1108                total_p,
1109            )
1110        };
1111        let mut dir = DirectionalHyperParam::new_compact(
1112            x_first_hyper,
1113            s_components,
1114            Some(xsecond),
1115            Some(ssecond_components),
1116        )?
1117        .not_penalty_like();
1118        if let Some(provider) = penaltysecond_component_provider {
1119            dir = dir.with_penaltysecond_component_provider(provider);
1120        }
1121        if let Some(partner_indices) = penaltysecond_partner_indices {
1122            dir = dir.with_penaltysecond_partner_indices(partner_indices);
1123        }
1124        hyper_dirs.push(dir);
1125    }
1126    Ok(hyper_dirs)
1127}
1128
1129/// Compute `dims_per_term` for a list of spatial term indices.
1130///
1131/// Returns a vector where entry i is the number of stored ψ values for
1132/// spatial term i: `d` for terms that enroll per-axis anisotropy in the
1133/// REML joint vector (`spatial_term_uses_per_axis_psi`), `1` otherwise.
1134pub(crate) fn spatial_dims_per_term(
1135    resolvedspec: &TermCollectionSpec,
1136    spatial_terms: &[usize],
1137) -> Vec<usize> {
1138    spatial_terms
1139        .iter()
1140        .map(|&term_idx| {
1141            if let Some(mj) = measure_jet_term_spec(resolvedspec, term_idx) {
1142                // Dial group, not per-axis anisotropy; layout owned by
1143                // `measure_jet_psi_dim`.
1144                measure_jet_psi_dim(mj)
1145            } else if spatial_term_uses_per_axis_psi(resolvedspec, term_idx) {
1146                get_spatial_feature_dim(resolvedspec, term_idx).unwrap_or(1)
1147            } else {
1148                1
1149            }
1150        })
1151        .collect()
1152}
1153
1154/// Check whether any spatial terms enroll per-axis anisotropic ψ in the joint
1155/// outer vector. Mirrors the hyper_dirs builder's enrollment predicate so the
1156/// outer θ-layout cannot drift from the inner evaluator's ψ count.
1157fn has_aniso_terms(resolvedspec: &TermCollectionSpec, spatial_terms: &[usize]) -> bool {
1158    spatial_terms
1159        .iter()
1160        .any(|&term_idx| spatial_term_uses_per_axis_psi(resolvedspec, term_idx))
1161}
1162
/// Expands to the `theta`-keyed memoization accessors for an exact-joint
/// design cache that stores `current_theta`, `last_cost`, and `last_eval`.
///
/// Invoke it inside an inherent `impl` of such a cache. Keeping the three
/// methods in one macro means every cache that expands it gets identical
/// lookup and store logic, so the copies cannot drift apart.
///
/// * `memoized_cost` — the cached cost when `theta` matches `current_theta`,
///   preferring the cost stored in `last_eval` over `last_cost`.
/// * `memoized_eval` — a clone of `last_eval` when `theta` matches
///   `current_theta`.
/// * `store_eval` — records a full evaluation and its cost. It does not touch
///   `current_theta`; the key must already have been set elsewhere.
macro_rules! impl_exact_joint_theta_memo {
    () => {
        fn memoized_cost(&self, theta: &Array1<f64>) -> Option<f64> {
            let cached_theta = self.current_theta.as_ref()?;
            if !theta_values_match(cached_theta, theta) {
                return None;
            }
            match self.last_eval.as_ref() {
                Some(full) => Some(full.0),
                None => self.last_cost,
            }
        }

        fn memoized_eval(
            &self,
            theta: &Array1<f64>,
        ) -> Option<(f64, Array1<f64>, gam_problem::HessianResult)> {
            match self.current_theta.as_ref() {
                Some(cached_theta) if theta_values_match(cached_theta, theta) => {
                    self.last_eval.clone()
                }
                _ => None,
            }
        }

        fn store_eval(&mut self, eval: (f64, Array1<f64>, gam_problem::HessianResult)) {
            let cost = eval.0;
            self.last_eval = Some(eval);
            self.last_cost = Some(cost);
        }
    };
}
1217
/// Per-fit cache for the single-block exact-joint spatial optimization: it
/// owns the incremental design realizer, remembers which θ the design is
/// currently realized at, memoizes the latest cost/eval, and caches the κ
/// hyper-directions per design revision.
struct SingleBlockExactJointDesignCache<'d> {
    /// Incremental realizer holding the frozen spec and design; `ensure_theta`
    /// drives it through `apply_log_kappa`.
    realizer: FrozenTermCollectionIncrementalRealizer<'d>,
    /// θ the design was last realized at by `ensure_theta`; `None` until the
    /// first realization.
    current_theta: Option<Array1<f64>>,
    // Memo key for `last_cost`/`last_eval`. Distinct from `current_theta` (which
    // tracks the θ the n×k design is REALIZED at): on the #1033 certified
    // Gaussian path `eval_full` evaluates a trial ψ WITHOUT re-realizing the
    // design (the tensor serves value+gradient n-free), so the eval θ and the
    // realized-design θ diverge. Keying the memo on a dedicated field keeps a
    // ψ-skip from ever mis-associating one ψ's cost/eval with another ψ's key.
    last_eval_theta: Option<Array1<f64>>,
    /// Cost recorded at `last_eval_theta` (set by both the cost-only and the
    /// full-eval store methods).
    last_cost: Option<f64>,
    /// Full `(cost, gradient, Hessian)` recorded at `last_eval_theta`; cleared
    /// by a cost-only store.
    last_eval: Option<(
        f64,
        Array1<f64>,
        gam_problem::HessianResult,
    )>,
    // #1033: ψ-invariant hyper-direction slab cache. The κ hyper_dirs (the n×k
    // ∂X/∂ψ design-derivative slabs + their k×k penalty derivatives) are a pure
    // function of (data, frozen spec, REALIZED column layout) — they do NOT
    // depend on the trial ψ once the design is fixed. On the certified Gaussian
    // n-free path `eval_full` evaluates trial ψ WITHOUT re-realizing the design,
    // so the realized layout (and hence the hyper_dirs) is identical across an
    // entire run of skip-path trials. Rebuilding them each trial re-runs the
    // basis ψ-derivative over all n rows + an O(n·k²) `fast_ab` rotation — the
    // last per-trial O(n) pass in the κ loop. Cache them keyed by the realizer
    // `design_revision`: a skip-path trial (revision unchanged) reuses the
    // build; a slow-path trial (revision advanced) rebuilds and re-keys.
    cached_hyper_dirs: Option<(u64, Vec<DirectionalHyperParam>)>,
    /// Term indices of the spatial terms handed to the realizer on every κ
    /// update and penalty query.
    spatial_terms: Vec<usize>,
    /// Offset in θ at which the ψ tail starts (`theta[rho_dim..]`).
    rho_dim: usize,
    /// Per-term ψ counts passed to
    /// `SpatialLogKappaCoords::from_theta_tail_with_dims` when decoding θ.
    dims_per_term: Vec<usize>,
}
1250
1251impl<'d> SingleBlockExactJointDesignCache<'d> {
1252    fn new(
1253        data: ArrayView2<'d, f64>,
1254        spec: TermCollectionSpec,
1255        design: TermCollectionDesign,
1256        spatial_terms: Vec<usize>,
1257        rho_dim: usize,
1258        dims_per_term: Vec<usize>,
1259    ) -> Result<Self, String> {
1260        Ok(Self {
1261            realizer: FrozenTermCollectionIncrementalRealizer::new(data, spec, design)?,
1262            current_theta: None,
1263            last_eval_theta: None,
1264            last_cost: None,
1265            last_eval: None,
1266            cached_hyper_dirs: None,
1267            spatial_terms,
1268            rho_dim,
1269            dims_per_term,
1270        })
1271    }
1272
1273    fn design_revision(&self) -> u64 {
1274        self.realizer.design_revision()
1275    }
1276
1277    /// Build the κ hyper-directions for the CURRENT realized design, reusing the
1278    /// `cached_hyper_dirs` slab when the realizer revision has not advanced since
1279    /// the last build (#1033). The slab is ψ-invariant at a fixed realized
1280    /// layout, so a skip-path trial (which does not re-realize the design) gets a
1281    /// bit-identical clone instead of re-running the per-row basis ψ-derivative +
1282    /// O(n·k²) rotation. A revision change (slow-path re-realization) rebuilds and
1283    /// re-keys. The clone is an O(n·k) memcpy — far cheaper than the O(n·k²)
1284    /// rebuild, and the conditioning pass it feeds is itself skipped on the
1285    /// certified path (see `prepare_eval_state`'s fast path).
1286    fn hyper_dirs_for_current_design(
1287        &mut self,
1288        data: ArrayView2<'_, f64>,
1289        kind: SpatialHyperKind,
1290    ) -> Result<Vec<DirectionalHyperParam>, EstimationError> {
1291        let revision = self.realizer.design_revision();
1292        if let Some((cached_rev, dirs)) = self.cached_hyper_dirs.as_ref()
1293            && *cached_rev == revision
1294        {
1295            return Ok(dirs.clone());
1296        }
1297        let dirs = try_build_spatial_log_kappa_hyper_dirs(
1298            data,
1299            self.realizer.spec(),
1300            self.realizer.design(),
1301            &self.spatial_terms,
1302        )?
1303        .ok_or_else(|| {
1304            EstimationError::InvalidInput(format!(
1305                "failed to build {} hyper_dirs at current {}",
1306                kind.adjective(),
1307                kind.coord_name(),
1308            ))
1309        })?;
1310        self.cached_hyper_dirs = Some((revision, dirs.clone()));
1311        Ok(dirs)
1312    }
1313
1314    fn nfree_tensor_gradient_hyper_dirs(
1315        &mut self,
1316        theta: &Array1<f64>,
1317    ) -> Result<Vec<DirectionalHyperParam>, EstimationError> {
1318        let psi = &theta.as_slice().ok_or_else(|| {
1319            EstimationError::InvalidInput(
1320                "nfree_tensor_gradient_hyper_dirs: theta is not contiguous".to_string(),
1321            )
1322        })?[self.rho_dim..];
1323        let (global_range, p_total, s_psi_components) = self
1324            .realizer
1325            .canonical_penalty_derivatives_at_psi(&self.spatial_terms, psi)
1326            .map_err(EstimationError::InvalidInput)?;
1327        let zero_x = gam_solve::estimate::reml::HyperDesignDerivative::zero(
1328            self.realizer.design().design.nrows(),
1329            p_total,
1330        );
1331        let components = s_psi_components
1332            .into_iter()
1333            .enumerate()
1334            .map(|(penalty_index, local)| {
1335                (
1336                    penalty_index,
1337                    gam_solve::estimate::reml::HyperPenaltyDerivative::from_embedded(
1338                        local,
1339                        global_range.clone(),
1340                        p_total,
1341                    ),
1342                )
1343            })
1344            .collect::<Vec<_>>();
1345        Ok(DirectionalHyperParam::new_compact(zero_x, components, None, None)?.not_penalty_like())
1346            .map(|dir| vec![dir])
1347    }
1348
1349    fn ensure_theta(&mut self, theta: &Array1<f64>) -> Result<(), String> {
1350        if self
1351            .current_theta
1352            .as_ref()
1353            .is_some_and(|cached| theta_values_match(cached, theta))
1354        {
1355            return Ok(());
1356        }
1357        let t_ensure = std::time::Instant::now();
1358        let log_kappa = SpatialLogKappaCoords::from_theta_tail_with_dims(
1359            theta,
1360            self.rho_dim,
1361            self.dims_per_term.clone(),
1362        );
1363        self.realizer
1364            .apply_log_kappa(&log_kappa, &self.spatial_terms)?;
1365        log::info!(
1366            "[STAGE] ensure_theta (apply_log_kappa, {} terms): {:.3}s",
1367            self.spatial_terms.len(),
1368            t_ensure.elapsed().as_secs_f64(),
1369        );
1370        self.current_theta = Some(theta.clone());
1371        self.last_eval_theta = None;
1372        self.last_cost = None;
1373        self.last_eval = None;
1374        Ok(())
1375    }
1376
1377    // Memo methods keyed on `last_eval_theta` (NOT `current_theta`): the #1033
1378    // certified Gaussian path evaluates a trial ψ without re-realizing the
1379    // design, so the eval θ and the realized-design θ can differ. Keying the
1380    // memo on the eval θ keeps a ψ-skip from mis-associating one ψ's result
1381    // with another ψ's key. The other exact-joint caches still use the shared
1382    // `impl_exact_joint_theta_memo!` macro (they always realize before eval).
1383    fn memoized_cost(&self, theta: &Array1<f64>) -> Option<f64> {
1384        if self
1385            .last_eval_theta
1386            .as_ref()
1387            .is_some_and(|cached| theta_values_match(cached, theta))
1388        {
1389            self.last_eval
1390                .as_ref()
1391                .map(|cached| cached.0)
1392                .or(self.last_cost)
1393        } else {
1394            None
1395        }
1396    }
1397
1398    fn memoized_eval(
1399        &self,
1400        theta: &Array1<f64>,
1401    ) -> Option<(
1402        f64,
1403        Array1<f64>,
1404        gam_problem::HessianResult,
1405    )> {
1406        if self
1407            .last_eval_theta
1408            .as_ref()
1409            .is_some_and(|cached| theta_values_match(cached, theta))
1410        {
1411            self.last_eval.clone()
1412        } else {
1413            None
1414        }
1415    }
1416
1417    /// Record an eval result keyed to the θ it was computed at. Used in place of
1418    /// the macro's `store_eval` so the memo key reflects the EVAL θ even when the
1419    /// design was not re-realized at that θ (#1033 certified skip).
1420    fn store_eval_at(
1421        &mut self,
1422        theta: &Array1<f64>,
1423        eval: (
1424            f64,
1425            Array1<f64>,
1426            gam_problem::HessianResult,
1427        ),
1428    ) {
1429        self.last_eval_theta = Some(theta.clone());
1430        self.last_cost = Some(eval.0);
1431        self.last_eval = Some(eval);
1432    }
1433
1434    /// Record a cost-only result keyed to the θ it was computed at, so
1435    /// `memoized_cost` keys on the EVAL θ (matching `store_eval_at`).
1436    fn store_cost_at(&mut self, theta: &Array1<f64>, cost: f64) {
1437        self.last_eval_theta = Some(theta.clone());
1438        self.last_cost = Some(cost);
1439        // A cost-only probe carries no gradient/Hessian, so drop any prior
1440        // full eval: `memoized_cost` prefers `last_eval.0`, and a stale
1441        // `last_eval` from a different θ must never answer for this θ.
1442        self.last_eval = None;
1443    }
1444
1445    fn spec(&self) -> &TermCollectionSpec {
1446        self.realizer.spec()
1447    }
1448
1449    fn design(&self) -> &TermCollectionDesign {
1450        self.realizer.design()
1451    }
1452
1453    /// True when the single spatial term's frozen geometry admits an EXACT,
1454    /// n-free penalty re-key at a new length-scale (#1033). The κ-loop fast path
1455    /// gates its design-realization skip on this (replacing the old certified
1456    /// `psi_penalty_tensor_covers` gate): the skip leaves `reset_surface`
1457    /// un-run, so it is sound only when `S(ψ_new)` can be rebuilt n-free.
1458    fn supports_nfree_penalty_rekey(&self) -> bool {
1459        self.realizer
1460            .supports_nfree_penalty_rekey(&self.spatial_terms)
1461    }
1462
1463    fn supports_nfree_gradient_only_routing(&self) -> bool {
1464        self.realizer
1465            .supports_nfree_gradient_only_routing(&self.spatial_terms)
1466    }
1467
1468    /// Build the EXACT canonical penalty surface `S(ψ)` at the length-scale
1469    /// implied by `theta`'s ψ tail, entirely n-free (#1033). Maps ψ→length-scale
1470    /// with the IDENTICAL `spatial_term_psi_to_length_scale_and_aniso` the slow
1471    /// path uses, reuses the frozen basis geometry, and runs the SAME
1472    /// `canonicalize_penalty_specs` pipeline `reset_surface` runs — so the
1473    /// returned canonical list is the one the kept reference surface must be
1474    /// re-keyed with on the design-revision fast path. The caller (which holds
1475    /// `cache`) computes this and hands the owned result to the evaluator via
1476    /// `stage_fast_path_penalty`, avoiding a `&mut cache` borrow alias.
1477    fn canonical_penalties_at(
1478        &mut self,
1479        theta: &Array1<f64>,
1480    ) -> Result<(Vec<gam_terms::construction::CanonicalPenalty>, Vec<usize>), String> {
1481        let psi = &theta
1482            .as_slice()
1483            .ok_or_else(|| "canonical_penalties_at: theta is not contiguous".to_string())?
1484            [self.rho_dim..];
1485        self.realizer
1486            .canonical_penalties_at_psi(&self.spatial_terms, psi)
1487    }
1488}
1489
/// Cache state for the single-block latent-coordinate optimization: owned
/// data/spec/design, the most recent θ with its derived latent values and
/// hyper-directions, the memoized cost/eval, and the latent term's fixed
/// configuration.
///
/// NOTE(review): only the start of the constructor is visible next to this
/// definition, so most field notes below are limited to what the names and
/// types state; confirm details against the `impl`.
struct SingleBlockLatentCoordDesignCache {
    // Owned inputs (unlike the exact-joint cache, which borrows its data).
    data: Array2<f64>,
    spec: TermCollectionSpec,
    design: TermCollectionDesign,
    // State tied to the most recent θ — presumably refreshed together; verify
    // in the impl.
    current_theta: Option<Array1<f64>>,
    current_latent: Option<std::sync::Arc<gam_terms::latent::LatentCoordValues>>,
    current_hyper_dirs: Option<Vec<gam_solve::estimate::reml::DirectionalHyperParam>>,
    current_design_cache_id: Option<u64>,
    latent_design_cache: gam_solve::latent_cache::LatentDesignCache,
    // Memoized cost and full `(cost, gradient, Hessian)` evaluation.
    last_cost: Option<f64>,
    last_eval: Option<(
        f64,
        Array1<f64>,
        gam_problem::HessianResult,
    )>,
    // Index of the latent-coordinate term in `spec.smooth_terms`; the
    // constructor rejects an out-of-range index.
    term_index: gam_problem::types::SmoothTermIdx,
    // Feature columns of the latent term; the constructor requires their
    // count to equal the latent dimension.
    feature_cols: Vec<usize>,
    rho_dim: usize,
    n_obs: usize,
    latent_dim: usize,
    id_mode: gam_terms::latent::LatentIdMode,
    manifold: gam_terms::latent::LatentManifold,
    retraction_registry: gam_solve::latent_cache::LatentRetractionRegistry,
    latent_id: u64,
    // Optional analytic penalty registry and the number of ρ entries it adds
    // (presumably `registry.total_rho_count()` — TODO confirm).
    analytic_penalties: Option<std::sync::Arc<gam_terms::AnalyticPenaltyRegistry>>,
    analytic_rho_count: usize,
    design_revision: u64,
    // Stamp the outer-iter the cached cost/eval was computed under; analytic
    // penalty weight schedules advance with this counter, so a stale stamp
    // invalidates the memo even at unchanged θ.
    last_outer_iter: Option<u64>,
}
1522
1523impl SingleBlockLatentCoordDesignCache {
1524    fn new(
1525        data: Array2<f64>,
1526        spec: TermCollectionSpec,
1527        design: TermCollectionDesign,
1528        latent: &StandardLatentCoordConfig,
1529        rho_dim: usize,
1530    ) -> Result<Self, String> {
1531        if latent.term_index.get() >= spec.smooth_terms.len() {
1532            return Err(SmoothError::dimension_mismatch(format!(
1533                "latent-coordinate term index {} out of bounds for {} smooth terms",
1534                latent.term_index,
1535                spec.smooth_terms.len()
1536            ))
1537            .into());
1538        }
1539        if latent.feature_cols.len() != latent.values.latent_dim() {
1540            return Err(SmoothError::dimension_mismatch(format!(
1541                "latent-coordinate feature width mismatch: feature_cols={}, latent_dim={}",
1542                latent.feature_cols.len(),
1543                latent.values.latent_dim()
1544            ))
1545            .into());
1546        }
1547        if latent.values.n_obs() != data.nrows() {
1548            return Err(SmoothError::dimension_mismatch(format!(
1549                "latent-coordinate row mismatch: latent n={}, data n={}",
1550                latent.values.n_obs(),
1551                data.nrows()
1552            ))
1553            .into());
1554        }
1555        let analytic_rho_count = latent
1556            .analytic_penalties
1557            .as_ref()
1558            .map_or(0, |registry| registry.total_rho_count());
1559        Ok(Self {
1560            data,
1561            spec,
1562            design,
1563            current_theta: None,
1564            current_latent: None,
1565            current_hyper_dirs: None,
1566            current_design_cache_id: None,
1567            latent_design_cache: gam_solve::latent_cache::LatentDesignCache::default(),
1568            last_cost: None,
1569            last_eval: None,
1570            term_index: latent.term_index,
1571            feature_cols: latent.feature_cols.clone(),
1572            rho_dim,
1573            n_obs: latent.values.n_obs(),
1574            latent_dim: latent.values.latent_dim(),
1575            id_mode: latent.values.id_mode().clone(),
1576            manifold: latent.values.manifold().clone(),
1577            retraction_registry: latent.values.retraction_registry().clone(),
1578            latent_id: latent.values.latent_id(),
1579            analytic_penalties: latent.analytic_penalties.clone(),
1580            analytic_rho_count,
1581            design_revision: 0,
1582            last_outer_iter: None,
1583        })
1584    }
1585
    /// Current value of the design revision counter (starts at 0 and is
    /// bumped by `ensure_theta` when the realized design changes).
    fn design_revision(&self) -> u64 {
        self.design_revision
    }
1589
    /// Borrow the most recently realized design (the one passed to `new`
    /// until `ensure_theta` replaces it).
    fn design(&self) -> &TermCollectionDesign {
        &self.design
    }
1593
1594    fn latent(&self) -> Result<std::sync::Arc<gam_terms::latent::LatentCoordValues>, String> {
1595        self.current_latent
1596            .as_ref()
1597            .cloned()
1598            .ok_or_else(|| "latent-coordinate cache has not been realized".to_string())
1599    }
1600
1601    fn analytic_penalties(&self) -> Option<std::sync::Arc<gam_terms::AnalyticPenaltyRegistry>> {
1602        self.analytic_penalties.clone()
1603    }
1604
    /// Number of analytic-penalty ρ coordinates recorded at construction
    /// (0 when no analytic penalty registry was configured).
    fn analytic_penalty_rho_count(&self) -> usize {
        self.analytic_rho_count
    }
1608
1609    fn hyper_dirs(&self) -> Result<Vec<gam_solve::estimate::reml::DirectionalHyperParam>, String> {
1610        self.current_hyper_dirs
1611            .as_ref()
1612            .cloned()
1613            .ok_or_else(|| "latent-coordinate hyper_dirs cache has not been realized".to_string())
1614    }
1615
1616    fn latent_basis_kind(&self) -> Result<gam_solve::latent_cache::LatentBasisKind, String> {
1617        let smooth_term = self
1618            .design
1619            .smooth
1620            .terms
1621            .get(self.term_index.get())
1622            .ok_or_else(|| {
1623                SmoothError::dimension_mismatch(format!(
1624                    "LatentCoord term index {} out of bounds for realized smooth design",
1625                    self.term_index
1626                ))
1627            })?;
1628        let termspec = self
1629            .spec
1630            .smooth_terms
1631            .get(self.term_index.get())
1632            .ok_or_else(|| {
1633                SmoothError::dimension_mismatch(format!(
1634                    "LatentCoord term index {} out of bounds for resolved smooth spec",
1635                    self.term_index
1636                ))
1637            })?;
1638        match (&termspec.basis, &smooth_term.metadata) {
1639            (
1640                SmoothBasisSpec::Matern { .. },
1641                BasisMetadata::Matern {
1642                    centers,
1643                    length_scale,
1644                    nu,
1645                    aniso_log_scales,
1646                    ..
1647                },
1648            ) => Ok(gam_solve::latent_cache::LatentBasisKind::Matern {
1649                centers: centers.clone(),
1650                length_scale: *length_scale,
1651                nu: *nu,
1652                aniso_log_scales: aniso_log_scales
1653                    .clone()
1654                    .unwrap_or_else(|| vec![0.0; centers.ncols()]),
1655                chunk_size: gam_terms::basis::auto_streaming_chunk_size_for_dense(
1656                    self.n_obs,
1657                    centers.nrows(),
1658                ),
1659            }),
1660            (
1661                SmoothBasisSpec::Duchon { .. },
1662                BasisMetadata::Duchon {
1663                    centers,
1664                    length_scale,
1665                    power,
1666                    nullspace_order,
1667                    aniso_log_scales,
1668                    ..
1669                },
1670            ) => Ok(gam_solve::latent_cache::LatentBasisKind::Duchon {
1671                centers: centers.clone(),
1672                length_scale: *length_scale,
1673                power: *power,
1674                nullspace_order: *nullspace_order,
1675                aniso_log_scales: aniso_log_scales
1676                    .clone()
1677                    .unwrap_or_else(|| vec![0.0; centers.ncols()]),
1678            }),
1679            (
1680                SmoothBasisSpec::Sphere { .. },
1681                BasisMetadata::Sphere {
1682                    centers,
1683                    penalty_order,
1684                    method,
1685                    ..
1686                },
1687            ) if matches!(*method, gam_terms::basis::SphereMethod::Wahba) => {
1688                Ok(gam_solve::latent_cache::LatentBasisKind::Sphere {
1689                    centers: centers.clone(),
1690                    penalty_order: *penalty_order,
1691                    chunk_size: gam_terms::basis::auto_streaming_chunk_size_for_dense(
1692                        self.n_obs,
1693                        centers.nrows(),
1694                    ),
1695                })
1696            }
1697            (
1698                SmoothBasisSpec::BSpline1D { spec, .. },
1699                BasisMetadata::BSpline1D {
1700                    knots,
1701                    periodic,
1702                    degree: meta_degree,
1703                    ..
1704                },
1705            ) => {
1706                // Issue #340: prefer the metadata-recorded effective degree
1707                // (which reflects fit-time auto-shrink) over the upstream
1708                // user-requested `spec.degree`.
1709                let effective_degree = meta_degree.unwrap_or(spec.degree);
1710                if let Some((domain_start, period, num_basis)) = periodic {
1711                    Ok(
1712                        gam_solve::latent_cache::LatentBasisKind::PeriodicBspline {
1713                            domain_start: *domain_start,
1714                            period: *period,
1715                            degree: effective_degree,
1716                            num_basis: *num_basis,
1717                            chunk_size: gam_terms::basis::auto_streaming_chunk_size_for_dense(
1718                                self.n_obs, *num_basis,
1719                            ),
1720                        },
1721                    )
1722                } else {
1723                    let num_basis_est = knots.len().saturating_sub(effective_degree + 1);
1724                    Ok(
1725                        gam_solve::latent_cache::LatentBasisKind::TensorBspline {
1726                            knots: vec![knots.clone()],
1727                            degrees: vec![effective_degree],
1728                            chunk_size: gam_terms::basis::auto_streaming_chunk_size_for_dense(
1729                                self.n_obs,
1730                                num_basis_est,
1731                            ),
1732                        },
1733                    )
1734                }
1735            }
1736            (
1737                SmoothBasisSpec::TensorBSpline { .. },
1738                BasisMetadata::TensorBSpline { knots, degrees, .. },
1739            ) => Ok(
1740                gam_solve::latent_cache::LatentBasisKind::TensorBspline {
1741                    knots: knots.clone(),
1742                    degrees: degrees.clone(),
1743                    chunk_size: None,
1744                },
1745            ),
1746            (
1747                SmoothBasisSpec::Pca { .. },
1748                BasisMetadata::Pca {
1749                    basis_matrix,
1750                    centered,
1751                    smooth_penalty,
1752                    center_mean,
1753                    pca_basis_path,
1754                    chunk_size,
1755                    ..
1756                },
1757            ) => {
1758                let center_mean_fingerprint = if *centered && pca_basis_path.is_none() {
1759                    let mean = center_mean.as_ref().ok_or_else(|| {
1760                        SmoothError::invalid_config(
1761                            "latent-coordinate Pca cache key requires center_mean when centered",
1762                        )
1763                    })?;
1764                    Some(gam_solve::latent_cache::pca_center_mean_fingerprint(
1765                        mean,
1766                    ))
1767                } else {
1768                    None
1769                };
1770                Ok(gam_solve::latent_cache::LatentBasisKind::Pca {
1771                    basis_matrix: basis_matrix.clone(),
1772                    centered: *centered,
1773                    center_mean_fingerprint,
1774                    smooth_penalty: *smooth_penalty,
1775                    pca_basis_path: pca_basis_path.clone(),
1776                    chunk_size: *chunk_size,
1777                })
1778            }
1779            _ => Err(SmoothError::invalid_config(
1780                "latent-coordinate design cache could not key the realized latent smooth basis"
1781                    .to_string(),
1782            )
1783            .into()),
1784        }
1785    }
1786
    /// Make the cache's realized state (`data`, `design`, hyper directions,
    /// latent values) correspond to `theta`.
    ///
    /// A θ matching the currently realized one is a no-op. Otherwise the latent
    /// block `theta[rho_dim .. rho_dim + n_obs * latent_dim]` is decoded,
    /// written into the `feature_cols` columns of `data`, and the design is
    /// obtained from `latent_design_cache` (rebuilt on a miss). On success the
    /// memoized cost/eval and their outer-iteration stamp are cleared.
    ///
    /// # Errors
    /// Fails when `theta` has the wrong length, when the basis cannot be keyed,
    /// when the design rebuild or hyper-direction construction fails, or when
    /// the realized design's column count differs from the current one.
    fn ensure_theta(&mut self, theta: &Array1<f64>) -> Result<(), String> {
        // Fast path: already realized at this θ.
        if self
            .current_theta
            .as_ref()
            .is_some_and(|cached| theta_values_match(cached, theta))
        {
            return Ok(());
        }
        // θ must hold the ρ block, the flattened latent block, the analytic
        // ρ coordinates and the direct hyper-parameters — nothing more or less.
        let latent_flat_len = self.n_obs * self.latent_dim;
        let direct_hyper_count = latent_coord_direct_hyper_count(&self.id_mode, self.latent_dim);
        let expected =
            self.rho_dim + latent_flat_len + self.analytic_rho_count + direct_hyper_count;
        if theta.len() != expected {
            return Err(SmoothError::dimension_mismatch(format!(
                "latent-coordinate theta length mismatch: got {}, expected {} (rho_dim={}, n={}, d={}, analytic_rhos={}, direct_hypers={})",
                theta.len(),
                expected,
                self.rho_dim,
                self.n_obs,
                self.latent_dim,
                self.analytic_rho_count,
                direct_hyper_count
            ))
            .into());
        }
        // Decode the latent block that immediately follows the ρ block.
        let flat = theta
            .slice(s![self.rho_dim..self.rho_dim + latent_flat_len])
            .to_owned();
        let latent = std::sync::Arc::new(
            gam_terms::latent::LatentCoordValues::from_flat_with_manifold_and_retraction_and_id(
                flat,
                self.n_obs,
                self.latent_dim,
                self.id_mode.clone(),
                self.manifold.clone(),
                self.retraction_registry.clone(),
                self.latent_id,
            ),
        );
        // With no previously realized latent values this counts as a change.
        let latent_values_changed = self
            .current_latent
            .as_ref()
            .map(|cached| !latent_values_match(cached.as_flat(), latent.as_flat()))
            .unwrap_or(true);
        if latent_values_changed {
            // New latent values: drop every cached design and advance the
            // revision up front.
            self.latent_design_cache.invalidate_all();
            self.current_design_cache_id = None;
            self.design_revision = self.design_revision.wrapping_add(1);
        }
        // Write the latent coordinates into the working data matrix; the flat
        // buffer is indexed observation-major (`n * latent_dim + axis`).
        for n in 0..self.n_obs {
            for axis in 0..self.latent_dim {
                let col = self.feature_cols[axis];
                self.data[[n, col]] = latent.as_flat()[n * self.latent_dim + axis];
            }
        }

        let basis_kind = self.latent_basis_kind()?;
        // Locals captured by the rebuild closure below, so the closure does not
        // borrow `self` while `latent_design_cache` is mutably borrowed.
        let rebuilt_width = self.design.design.ncols();
        let spec = self.spec.clone();
        let term_index = self.term_index;
        let analytic_rho_count = self.analytic_rho_count;
        let data = self.data.view();
        let design_context_digest =
            gam_solve::latent_cache::latent_design_context_cache_digest(
                data,
                &spec,
                term_index,
                analytic_rho_count,
                &self.feature_cols,
            )
            .map_err(|e| e.to_string())?;
        let lookup = self
            .latent_design_cache
            .lookup_or_compute(latent.clone(), basis_kind, design_context_digest, || {
                let rebuilt = build_term_collection_design(data, &spec).map_err(|e| {
                    EstimationError::InvalidInput(format!(
                        "failed to rebuild latent-coordinate design: {e}"
                    ))
                })?;
                // The rebuilt design must keep the column count of the design
                // it replaces.
                if rebuilt.design.ncols() != rebuilt_width {
                    crate::bail_invalid_estim!(
                        "latent-coordinate design topology changed: rebuilt p={}, cached p={}",
                        rebuilt.design.ncols(),
                        rebuilt_width
                    );
                }
                let hyper_dirs = try_build_latent_coord_hyper_dirs(
                    latent.clone(),
                    &spec,
                    &rebuilt,
                    &[term_index],
                    analytic_rho_count,
                )?
                .ok_or_else(|| {
                    EstimationError::InvalidInput(
                        "failed to build latent-coordinate hyper_dirs".to_string(),
                    )
                })?;
                Ok(gam_solve::latent_cache::ComputedLatentDesign {
                    design: rebuilt,
                    hyper_dirs,
                })
            })
            .map_err(|e| e.to_string())?;
        // Same width check on whatever the lookup returned (the check inside
        // the closure only runs when the closure itself is invoked).
        if lookup.cached.design.design.ncols() != self.design.design.ncols() {
            return Err(SmoothError::dimension_mismatch(format!(
                "latent-coordinate design topology changed: rebuilt p={}, cached p={}",
                lookup.cached.design.design.ncols(),
                self.design.design.ncols()
            ))
            .into());
        }
        // Commit the realized state and drop the memo of the previous θ.
        self.design = lookup.cached.design.clone();
        self.current_hyper_dirs = Some(lookup.cached.hyper_dirs.clone());
        self.current_latent = Some(latent);
        self.current_theta = Some(theta.clone());
        self.last_cost = None;
        self.last_eval = None;
        self.last_outer_iter = None;
        // Latent values unchanged but a different cache entry is now in use:
        // the design still changed, so advance the revision (the changed-values
        // case was already counted above).
        if !latent_values_changed && self.current_design_cache_id != Some(lookup.entry_id) {
            self.design_revision = self.design_revision.wrapping_add(1);
        }
        self.current_design_cache_id = Some(lookup.entry_id);
        Ok(())
    }
1912
1913    fn memoized_cost(&self, theta: &Array1<f64>) -> Option<f64> {
1914        if self
1915            .current_theta
1916            .as_ref()
1917            .is_some_and(|cached| theta_values_match(cached, theta))
1918            && self.last_outer_iter
1919                == Some(gam_solve::estimate::reml::outer_eval::current_outer_iter())
1920        {
1921            self.last_eval
1922                .as_ref()
1923                .map(|cached| cached.0)
1924                .or(self.last_cost)
1925        } else {
1926            None
1927        }
1928    }
1929
1930    fn memoized_eval(
1931        &self,
1932        theta: &Array1<f64>,
1933    ) -> Option<(
1934        f64,
1935        Array1<f64>,
1936        gam_problem::HessianResult,
1937    )> {
1938        if self
1939            .current_theta
1940            .as_ref()
1941            .is_some_and(|cached| theta_values_match(cached, theta))
1942            && self.last_outer_iter
1943                == Some(gam_solve::estimate::reml::outer_eval::current_outer_iter())
1944        {
1945            self.last_eval.clone()
1946        } else {
1947            None
1948        }
1949    }
1950
1951    fn store_eval(
1952        &mut self,
1953        eval: (
1954            f64,
1955            Array1<f64>,
1956            gam_problem::HessianResult,
1957        ),
1958    ) {
1959        self.last_cost = Some(eval.0);
1960        self.last_eval = Some(eval);
1961        self.last_outer_iter =
1962            Some(gam_solve::estimate::reml::outer_eval::current_outer_iter());
1963    }
1964
1965    fn store_cost(&mut self, cost: f64) {
1966        self.last_cost = Some(cost);
1967        self.last_outer_iter =
1968            Some(gam_solve::estimate::reml::outer_eval::current_outer_iter());
1969    }
1970
1971    fn reset(&mut self) {
1972        self.current_theta = None;
1973        self.current_latent = None;
1974        self.current_hyper_dirs = None;
1975        self.current_design_cache_id = None;
1976        self.latent_design_cache.invalidate();
1977        self.last_cost = None;
1978        self.last_eval = None;
1979        self.last_outer_iter = None;
1980    }
1981}
1982
1983/// #1464: the fixed-κ profiled-REML score `V_p(κ)` for a single constant-curvature
1984/// term — pin κ on the term, fit with κ-optimisation DISABLED so only the
1985/// smoothing parameters ρ are profiled, and return the resulting REML/LAML
1986/// negative-log-evidence (the value the outer loop minimises). This is exactly
1987/// the criterion the `curvature_inference_forspec` CI oracle evaluates; factoring
1988/// it here lets the production joint-fit path reuse the SAME sign-correct profiled
1989/// criterion to pick the κ-sign basin before the joint [ρ, ψ] solve, instead of
1990/// letting the joint optimiser descend from a single κ seed into the spurious +κ
1991/// collapsed-kernel corner (the headline #1464 sign-blindness).
1992///
1993/// `pub` so a regression test can evaluate the EXACT production criterion at two
1994/// pinned κ (e.g. +κ vs −κ on a hyperbolic dataset) and settle solver-vs-criterion:
1995/// if `V_p(+κ) < V_p(−κ)` for hyperbolic data, the criterion itself prefers the
1996/// collapsed +κ corner and the bug is in the constant-curvature REML/Occam term,
1997/// not the optimiser.
1998pub fn fixed_kappa_profiled_reml_score(
1999    data: ArrayView2<'_, f64>,
2000    y: ArrayView1<'_, f64>,
2001    weights: ArrayView1<'_, f64>,
2002    offset: ArrayView1<'_, f64>,
2003    resolvedspec: &TermCollectionSpec,
2004    term_idx: usize,
2005    kappa: f64,
2006    family: LikelihoodSpec,
2007    options: &FitOptions,
2008) -> Result<f64, EstimationError> {
2009    if !kappa.is_finite() {
2010        crate::bail_invalid_estim!("fixed-κ profiled score probed a non-finite κ = {kappa}");
2011    }
2012    // Resolve the constant-curvature term's feature columns and base spec so the
2013    // criterion is probed on the production constant-curvature design.
2014    let (feature_cols, mut probe_basis) = match resolvedspec
2015        .smooth_terms
2016        .get(term_idx)
2017        .map(|t| &t.basis)
2018    {
2019        Some(SmoothBasisSpec::ConstantCurvature {
2020            feature_cols, spec, ..
2021        }) => (feature_cols.clone(), spec.clone()),
2022        _ => {
2023            crate::bail_invalid_estim!(
2024                "fixed-κ profiled score: term {term_idx} is not a constant-curvature smooth"
2025            )
2026        }
2027    };
2028    probe_basis.kappa = kappa;
2029
2030    // #1464: the curvature κ criterion the CI/flatness oracle walks (and the
2031    // `constant_curvature_profiled_reml_scores` export reports) is the HONEST
2032    // fixed-κ profiled REML of the realized constant-curvature design —
2033    // `dof·log(rss/dof) + log|H| − log|λS|₊` profiled over λ on `[1|K_κ·z]`
2034    // (`constant_curvature_honest_profiled_reml_score`). NOT the production
2035    // full-fit `reml_score`: that score heavily SMOOTHS this RKHS kernel, and under
2036    // heavy smoothing the +κ chart's geodesic-distance compression makes the
2037    // collapsed kernel a uniformly better fit of the over-smoothed target for ANY
2038    // data, so it is MONOTONE toward the +chart bound regardless of the true
2039    // curvature sign (the #1464 sign-blindness — `bug_hunt_1464_criterion_vs_solver`
2040    // shows V_p(+2) < V_p(0) < V_p(−2) on hyperbolic data with the raw score). The
2041    // honest profiled REML keeps the curvature-shape signal in the data fit, so its
2042    // argmin tracks the planted sign, and as a proper profiled-REML deviance the
2043    // CI/flatness LR thresholds stay χ²-calibrated; on constant-mean data it is
2044    // ~flat in κ, giving the flatness test correct size. Gaussian-identity is the
2045    // only family the curvature-as-estimand path serves; a weighted response, a
2046    // non-zero offset, or a non-Gaussian link routes to the production fixed-κ fit
2047    // (those configurations are not exercised by curvature inference, and the
2048    // fallback keeps their behaviour byte-identical).
2049    let is_unweighted = weights.iter().all(|&w| (w - 1.0).abs() <= 1e-12);
2050    let is_zero_offset = offset.iter().all(|&o| o.abs() <= 1e-12);
2051    if family == LikelihoodSpec::gaussian_identity() && is_unweighted && is_zero_offset {
2052        let x_term = select_columns(data, &feature_cols).map_err(EstimationError::from)?;
2053        let score =
2054            gam_terms::basis::constant_curvature_honest_profiled_reml_score(x_term.view(), y, &probe_basis)
2055                .map_err(|e| {
2056                    EstimationError::InvalidInput(format!(
2057                        "fixed-κ honest profiled-REML score at κ={kappa} failed: {e}"
2058                    ))
2059                })?;
2060        if !score.is_finite() {
2061            crate::bail_invalid_estim!(
2062                "fixed-κ honest profiled-REML score at κ={kappa} is non-finite"
2063            );
2064        }
2065        return Ok(score);
2066    }
2067
2068    // Fallback (weighted / offset / non-Gaussian): the production fixed-κ fit.
2069    let mut probe_spec = resolvedspec.clone();
2070    match probe_spec.smooth_terms.get_mut(term_idx).map(|t| &mut t.basis) {
2071        Some(SmoothBasisSpec::ConstantCurvature { spec, .. }) => spec.kappa = kappa,
2072        _ => {
2073            crate::bail_invalid_estim!(
2074                "fixed-κ profiled score: term {term_idx} is not a constant-curvature smooth"
2075            )
2076        }
2077    }
2078    let fixed_kappa_options = SpatialLengthScaleOptimizationOptions {
2079        enabled: false,
2080        ..SpatialLengthScaleOptimizationOptions::default()
2081    };
2082    let fit = fit_term_collectionwith_spatial_length_scale_optimization(
2083        data,
2084        y.to_owned(),
2085        weights.to_owned(),
2086        offset.to_owned(),
2087        &probe_spec,
2088        family,
2089        options,
2090        &fixed_kappa_options,
2091    )?;
2092    let score = fit_score(&fit.fit);
2093    if !score.is_finite() {
2094        crate::bail_invalid_estim!("fixed-κ profiled fit at κ={kappa} returned a non-finite score");
2095    }
2096    Ok(score)
2097}
2098
2099/// #1464: estimate κ̂ for a constant-curvature term as the argmin of the κ-FAIR
2100/// sign-resolving criterion over a fine grid spanning the whole chart window.
2101///
2102/// WHY THIS IS THE ESTIMATE, NOT JUST A SEED. The production profiled-REML
2103/// criterion (`fixed_kappa_profiled_reml_score`, and equivalently the joint
2104/// [ρ, ψ] solver's REML objective) is *sign-blind* in κ on a generic
2105/// center-peaked radial signal: the +κ chart compresses geodesic distances, so
2106/// the geodesic-exponential kernel becomes a uniformly better interpolator of
2107/// ANY radial peak regardless of the true curvature sign. Its V_p therefore
2108/// decreases monotonically toward the +chart bound for BOTH spherical and
2109/// hyperbolic truth (verified: `vp_grid_identifies_planted_kappa_sign` puts the
2110/// raw-V_p argmin at the +bound even for κ⋆ = −2). Seeding the joint solver in
2111/// the correct basin and pinning the window to one sign half-axis is not enough:
2112/// the raw REML is still monotone toward 0 inside that half-axis, so the solver
2113/// rails κ̂ to the 0 boundary (the observed hyperbolic κ̂ = 0). The cure is to
2114/// stop using the sign-blind criterion to *choose* κ at all for these terms and
2115/// instead use the κ-fair criterion
2116/// [`gam_terms::basis::constant_curvature_kappa_fair_sign_score`], whose generic
2117/// radial-peak-fitting power is subtracted out so only the genuine
2118/// curvature-shape signal remains — its argmin is sign-AND-magnitude correct
2119/// (spherical κ̂ > 0, hyperbolic κ̂ < 0, materially distinguished).
2120///
2121/// Returns `None` when the term carries no usable κ window or every probe fit
2122/// fails (caller falls back to the spec's κ seed / joint solve).
2123fn constant_curvature_kappa_fair_argmin(
2124    data: ArrayView2<'_, f64>,
2125    y: ArrayView1<'_, f64>,
2126    resolvedspec: &TermCollectionSpec,
2127    term_idx: usize,
2128) -> Option<f64> {
2129    let (kappa_min, kappa_max) = constant_curvature_kappa_bounds(data, resolvedspec, term_idx);
2130    if !(kappa_min.is_finite() && kappa_max.is_finite() && kappa_max > kappa_min) {
2131        return None;
2132    }
2133    let (feature_cols, base_spec) = match resolvedspec.smooth_terms.get(term_idx).map(|t| &t.basis) {
2134        Some(SmoothBasisSpec::ConstantCurvature {
2135            feature_cols, spec, ..
2136        }) => (feature_cols, spec.clone()),
2137        _ => return None,
2138    };
2139    let x_term = match select_columns(data, feature_cols) {
2140        Ok(x) => x,
2141        Err(e) => {
2142            log::info!("[spatial-kappa] #1464 κ-fair argmin column select failed ({e}); skipping");
2143            return None;
2144        }
2145    };
2146    // Dense symmetric grid over the full chart window. 24 steps resolves the
2147    // κ-fair criterion's interior optimum well within the contract tolerances
2148    // (the criterion is smooth and single-welled in κ on curved truth); the
2149    // argmin's SIGN — the headline #1464 requirement — is robust to the grid
2150    // resolution. κ = 0 is included so genuinely flat truth can be selected.
2151    const GRID_STEPS: usize = 24;
2152    let mut best: Option<(f64, f64)> = None; // (κ-fair score, kappa)
2153    for i in 0..=GRID_STEPS {
2154        let t = i as f64 / GRID_STEPS as f64;
2155        let kappa = kappa_min + (kappa_max - kappa_min) * t;
2156        let mut probe_spec = base_spec.clone();
2157        probe_spec.kappa = kappa;
2158        match gam_terms::basis::constant_curvature_kappa_fair_sign_score(x_term.view(), y, &probe_spec) {
2159            Ok(score) => {
2160                if best.as_ref().is_none_or(|(b, _)| score < *b) {
2161                    best = Some((score, kappa));
2162                }
2163            }
2164            Err(e) => {
2165                log::info!(
2166                    "[spatial-kappa] #1464 κ-fair argmin probe at κ={kappa:.4} failed ({e}); skipping"
2167                );
2168            }
2169        }
2170    }
2171    best.map(|(score, kappa)| {
2172        log::info!(
2173            "[spatial-kappa] #1464 κ-fair argmin κ̂={kappa:.4} (κ-fair score={score:.6e}) for term {term_idx}"
2174        );
2175        kappa
2176    })
2177}
2178
/// #1464: choose the κ-sign basin for the joint spatial fit by scoring the
/// κ-fair sign-resolving criterion
/// (`gam_terms::basis::constant_curvature_kappa_fair_sign_score`) on a small
/// grid of probes spanning both chart signs, and return the argmin κ. The
/// joint [ρ, ψ] optimiser is then seeded at this κ so it polishes inside the
/// correct basin rather than descending from a single (near-zero) κ seed into
/// the spurious +κ collapsed-kernel corner. Returns `None` when the term
/// carries no usable κ window or every probe fit fails (caller falls back to
/// the spec's κ seed).
2186fn select_constant_curvature_kappa_sign_seed(
2187    data: ArrayView2<'_, f64>,
2188    y: ArrayView1<'_, f64>,
2189    resolvedspec: &TermCollectionSpec,
2190    term_idx: usize,
2191) -> Option<f64> {
2192    let (kappa_min, kappa_max) = constant_curvature_kappa_bounds(data, resolvedspec, term_idx);
2193    if !(kappa_min.is_finite() && kappa_max.is_finite() && kappa_max > kappa_min) {
2194        return None;
2195    }
2196    // Resolve this term's chart-coordinate columns and its base spec so the
2197    // sign-basin scan can score each probe κ with the κ-FAIR criterion directly
2198    // on the production constant-curvature basis (#1464). The κ-fair score
2199    // subtracts the design's generic radial-peak-fitting power (measured on a
2200    // bank of κ-independent Euclidean-radial reference signals) from the data's
2201    // profiled REML, so the +κ chart's distance-compression interpolation
2202    // advantage — which lifts BOTH the data and the reference fits equally —
2203    // cancels, leaving only the genuine curvature-shape signal. This is what
2204    // makes the SIGN identifiable: the raw `fixed_kappa_profiled_reml_score`
2205    // (still used for the magnitude/CI) is sign-blind on a generic radial signal
2206    // and rails to the +chart bound for both spherical and hyperbolic truth.
2207    let (feature_cols, base_spec) = match resolvedspec.smooth_terms.get(term_idx).map(|t| &t.basis) {
2208        Some(SmoothBasisSpec::ConstantCurvature {
2209            feature_cols, spec, ..
2210        }) => (feature_cols, spec.clone()),
2211        _ => return None,
2212    };
2213    let x_term = match select_columns(data, feature_cols) {
2214        Ok(x) => x,
2215        Err(e) => {
2216            log::info!("[spatial-kappa] #1464 sign-basin scan column select failed ({e}); skipping");
2217            return None;
2218        }
2219    };
2220    // Five probes spanning both signs: the two interior corners (half the chart
2221    // bound on each side, away from the saturating boundary), flat (κ = 0), and
2222    // the chart bounds.
2223    let probes = [
2224        kappa_min,
2225        0.5 * kappa_min,
2226        0.0,
2227        0.5 * kappa_max,
2228        kappa_max,
2229    ];
2230    let mut best: Option<(f64, f64)> = None; // (κ-fair score, kappa)
2231    for &kappa in &probes {
2232        let mut probe_spec = base_spec.clone();
2233        probe_spec.kappa = kappa;
2234        match gam_terms::basis::constant_curvature_kappa_fair_sign_score(
2235            x_term.view(),
2236            y,
2237            &probe_spec,
2238        ) {
2239            Ok(score) => {
2240                if best.as_ref().is_none_or(|(b, _)| score < *b) {
2241                    best = Some((score, kappa));
2242                }
2243            }
2244            Err(e) => {
2245                log::info!(
2246                    "[spatial-kappa] #1464 sign-basin probe at κ={kappa:.4} failed ({e}); skipping"
2247                );
2248            }
2249        }
2250    }
2251    best.map(|(score, kappa)| {
2252        log::info!(
2253            "[spatial-kappa] #1464 κ-fair sign-basin scan selected κ_seed={kappa:.4} \
2254             (κ-fair score={score:.6e}) for term {term_idx}"
2255        );
2256        kappa
2257    })
2258}
2259
/// Number of length-scale restarts in the #1074 GP-range multi-start pre-scan.
///
/// The pre-scan places this many equally spaced points in `ψ = log κ` across
/// each term's data-derived κ window, with both window endpoints included
/// (grid point `g` sits at fraction `g / (SPATIAL_RANGE_PRESCAN_GRID - 1)`).
const SPATIAL_RANGE_PRESCAN_GRID: usize = 7;

// The grid fraction divides by `SPATIAL_RANGE_PRESCAN_GRID - 1`. A value of 1
// would make that `0 / 0 = NaN` and silently turn the pre-scan into a no-op,
// so reject anything below 2 at compile time.
const _: () = assert!(
    SPATIAL_RANGE_PRESCAN_GRID >= 2,
    "SPATIAL_RANGE_PRESCAN_GRID must be at least 2 so both κ-window endpoints are probed"
);
2263
2264/// #1074 — kernel-range multi-start for isotropic Matérn/Duchon GP smooths.
2265///
2266/// The joint `[ρ, ψ]` REML objective of an isotropic radial GP smooth is
2267/// genuinely MULTIMODAL in the kernel range `ψ = log κ`: a long-range (stiff)
2268/// basin and a short-range (flexible) basin can each be a local optimum,
2269/// separated by a barrier the local ARC/BFGS joint solver cannot cross. From the
2270/// single data-window-midpoint seed the local solver descends into whichever
2271/// basin holds the seed. For the ROUGHEST kernels (Matérn ν=3/2) the midpoint is
2272/// the long-range basin, which over-smooths the domain boundary and leaves the
2273/// global short-range optimum unreached — the observed `matern_varying_nu` ν=3/2
2274/// failure recovered the interior fine but the edges 4× worse than the interior,
2275/// at a REML score ~16 nats WORSE than the reachable short-range optimum (the
2276/// criterion is correct; only the optimizer was stuck).
2277///
2278/// The cure is the textbook remedy for a multimodal length-scale likelihood: a
2279/// coarse grid restart. For each isotropic spatial term we evaluate the profiled
2280/// REML (ρ optimised at fixed κ — exactly [`fit_term_collection_forspec`]) at a
2281/// log-κ grid spanning the term's data-derived window, and adopt the best-scoring
2282/// length scale as the seed handed to the joint solver, which then polishes
2283/// locally inside the global basin. Coordinate descent across terms (each scanned
2284/// with the others held at their running best) keeps the cost linear in the term
2285/// count. Only a STRICT REML improvement over the incumbent seed is adopted, so
2286/// the pre-scan can never regress a fit that the midpoint seed already solved
2287/// well — it only rescues the ones stuck in the wrong basin.
2288///
2289/// Gated to the isotropic, non-constant-curvature case: anisotropic ψ-per-axis
2290/// terms and constant-curvature `curv()` terms carry their own dedicated seeding
2291/// (the η-aware constructors and the #1464 κ-fair sign-basin scan respectively),
2292/// so they are left untouched. Returns the `(term_idx, length_scale)` overrides
2293/// that strictly improve REML; an empty vector means the incumbent seed already
2294/// sits in the best-scoring basin and nothing downstream changes.
2295fn prescan_isotropic_spatial_range_seed(
2296    data: ArrayView2<'_, f64>,
2297    y: ArrayView1<'_, f64>,
2298    weights: ArrayView1<'_, f64>,
2299    offset: ArrayView1<'_, f64>,
2300    resolvedspec: &TermCollectionSpec,
2301    baseline_score: f64,
2302    family: &LikelihoodSpec,
2303    options: &FitOptions,
2304    kappa_options: &SpatialLengthScaleOptimizationOptions,
2305    spatial_terms: &[usize],
2306) -> Result<Vec<(usize, f64)>, EstimationError> {
2307    // Anisotropic and constant-curvature terms have their own seeding paths.
2308    if has_aniso_terms(resolvedspec, spatial_terms)
2309        || !constant_curvature_term_indices(resolvedspec).is_empty()
2310    {
2311        return Ok(Vec::new());
2312    }
2313    let dims = spatial_dims_per_term(resolvedspec, spatial_terms);
2314    // The grid coordinate-descends term by term; `working` carries the chosen
2315    // length scales of already-scanned terms so a later term is scored against
2316    // the improved earlier ones, not the stale midpoint seed.
2317    let mut working = resolvedspec.clone();
2318    let mut best_score = if baseline_score.is_finite() {
2319        baseline_score
2320    } else {
2321        f64::INFINITY
2322    };
2323    let mut overrides: Vec<(usize, f64)> = Vec::new();
2324    for (slot, &term_idx) in spatial_terms.iter().enumerate() {
2325        // Isotropic terms contribute a single ψ; a per-axis (anisotropic) slot
2326        // is excluded by the gate above, but stay defensive.
2327        if dims.get(slot).copied().unwrap_or(1) != 1 {
2328            continue;
2329        }
2330        // Only terms that actually carry a free length scale (Matérn / hybrid
2331        // Duchon). Pure Duchon / TPS without a length scale are skipped.
2332        if get_spatial_length_scale(&working, term_idx).is_none() {
2333            continue;
2334        }
2335        let (psi_lo, psi_hi) = spatial_term_psi_bounds(data, &working, term_idx, kappa_options);
2336        if !(psi_lo.is_finite() && psi_hi.is_finite()) || psi_hi <= psi_lo {
2337            continue;
2338        }
2339        let mut term_best: Option<f64> = None;
2340        for g in 0..SPATIAL_RANGE_PRESCAN_GRID {
2341            let frac = g as f64 / (SPATIAL_RANGE_PRESCAN_GRID - 1) as f64;
2342            let psi = psi_lo + (psi_hi - psi_lo) * frac;
2343            // `apply_log_kappa_to_term` converts the optimizer's ψ to the spec
2344            // length scale as `ℓ = exp(-ψ)`; mirror it so the grid lives in the
2345            // SAME coordinate the joint solver and the ψ window use.
2346            let ls = (-psi).exp();
2347            if !ls.is_finite() || ls <= 0.0 {
2348                continue;
2349            }
2350            let mut probe = working.clone();
2351            if set_spatial_length_scale(&mut probe, term_idx, ls).is_err() {
2352                continue;
2353            }
2354            let fit = match fit_term_collection_forspec(
2355                data,
2356                y,
2357                weights,
2358                offset,
2359                &probe,
2360                family.clone(),
2361                options,
2362            ) {
2363                Ok(fit) => fit,
2364                // A grid point can hit an infeasible kernel geometry (rank
2365                // collapse at an extreme range); skip it, don't abort the scan.
2366                Err(_) => continue,
2367            };
2368            let score = fit_score(&fit.fit);
2369            // Strict improvement only — guards against adopting a numerically
2370            // equal basin and against ever regressing the incumbent seed.
2371            if score.is_finite() && score < best_score - 1e-7 * best_score.abs().max(1.0) {
2372                best_score = score;
2373                term_best = Some(ls);
2374            }
2375        }
2376        if let Some(ls) = term_best {
2377            set_spatial_length_scale(&mut working, term_idx, ls)?;
2378            overrides.push((term_idx, ls));
2379            log::info!(
2380                "[spatial-kappa] #1074 range pre-scan: term {term_idx} re-seeded at \
2381                 length_scale={ls:.5} (profiled REML {best_score:.5}, was {baseline_score:.5})"
2382            );
2383        }
2384    }
2385    Ok(overrides)
2386}
2387
2388fn try_exact_joint_spatial_length_scale_optimization(
2389    data: ArrayView2<'_, f64>,
2390    y: ArrayView1<'_, f64>,
2391    weights: ArrayView1<'_, f64>,
2392    offset: ArrayView1<'_, f64>,
2393    resolvedspec: &TermCollectionSpec,
2394    best: &FittedTermCollection,
2395    family: LikelihoodSpec,
2396    options: &FitOptions,
2397    kappa_options: &SpatialLengthScaleOptimizationOptions,
2398    spatial_terms: &[usize],
2399) -> Result<Option<FittedTermCollectionWithSpec>, EstimationError> {
2400    if spatial_terms.is_empty() {
2401        return Ok(None);
2402    }
2403    // Fail loud on nonsensical κ options rather than letting them propagate
2404    // silent NaNs (e.g. inverted min/max inverts the BFGS window, negative
2405    // scales produce NaN logs). This is the first function on every outer-κ
2406    // path; downstream paths assume validated options.
2407    kappa_options
2408        .validate()
2409        .map_err(EstimationError::InvalidInput)?;
2410
2411    // #1464 constant-curvature κ̂ via the κ-FAIR criterion (NOT the joint REML).
2412    //
2413    // The joint [ρ, ψ] solver below minimises the production profiled REML, which
2414    // is SIGN-BLIND in κ on a generic radial signal: the +κ chart compresses
2415    // geodesic distances, making the geodesic-exponential kernel a uniformly
2416    // better interpolator of any radial peak regardless of the true curvature
2417    // sign, so its objective is monotone toward the +chart bound for BOTH
2418    // spherical and hyperbolic truth. Seeding + one-sided window pinning is not
2419    // enough — inside the correct half-axis the raw REML is still monotone toward
2420    // 0, so the solver rails κ̂ to the 0 boundary (the observed hyperbolic
2421    // κ̂ = 0). When EVERY spatial term in this solve is a constant-curvature term,
2422    // we therefore choose κ̂ directly from the κ-fair criterion's fine-grid argmin
2423    // (`constant_curvature_kappa_fair_argmin`), which subtracts the design's
2424    // generic radial-peak-fitting power and so is sign-AND-magnitude correct
2425    // (spherical κ̂ > 0, hyperbolic κ̂ < 0), then profile ONLY ρ at that fixed κ.
2426    // This is gated on a pure-CC spatial problem (the `curv()` use case); mixed
2427    // CC + Matérn/Duchon/sphere solves fall through to the unchanged joint path,
2428    // so no non-CC fit is affected. The frozen-baseline harvest is used so the κ̂
2429    // is persisted in the returned spec and read back by `model.curvature()`.
2430    let cc_term_set = constant_curvature_term_indices(resolvedspec);
2431    let all_spatial_are_cc =
2432        !cc_term_set.is_empty() && spatial_terms.iter().all(|t| cc_term_set.contains(t));
2433    if all_spatial_are_cc {
2434        let mut fixed_kappa_spec = resolvedspec.clone();
2435        let mut any_kappa_chosen = false;
2436        for &term_idx in spatial_terms {
2437            // Only OVERRIDE κ with the κ-fair argmin when it selects a NEGATIVE
2438            // (hyperbolic) curvature. This is the one regime the sign-blind joint
2439            // REML cannot reach: its objective is monotone toward +κ, so seeding +
2440            // one-sided pinning still rails κ̂ to the 0 boundary (hyperbolic
2441            // recovered as flat). For a positive κ-fair argmin the joint solver
2442            // ALREADY rails to the (correct) +chart bound, and its jointly-
2443            // optimised [ρ, κ] gives a strictly better realized fit than fixing κ
2444            // and profiling ρ alone — so we leave the spherical/positive case to
2445            // the unchanged joint path below, preserving its recovery R². A κ-fair
2446            // argmin of exactly 0 (genuinely flat) likewise falls through.
2447            if let Some(kappa_hat) =
2448                constant_curvature_kappa_fair_argmin(data, y, resolvedspec, term_idx)
2449                    .filter(|&k| k < 0.0)
2450            {
2451                if let Some(SmoothBasisSpec::ConstantCurvature { spec: cc, .. }) = fixed_kappa_spec
2452                    .smooth_terms
2453                    .get_mut(term_idx)
2454                    .map(|t| &mut t.basis)
2455                {
2456                    cc.kappa = kappa_hat;
2457                    any_kappa_chosen = true;
2458                    log::info!(
2459                        "[spatial-kappa] #1464 term {term_idx}: fixed κ̂ = {kappa_hat:.4} from κ-fair argmin (hyperbolic basin; profiling ρ only)"
2460                    );
2461                }
2462            }
2463        }
2464        if any_kappa_chosen {
2465            // Profiled-ρ fit at the κ-fair κ̂, then a fresh REML-seeded harvest so
2466            // the returned spec carries the κ̂ for read-back, exactly as the
2467            // frozen-baseline path does for its geometry.
2468            let baseline_score = fit_score(&best.fit);
2469            let fitted = fit_term_collection_forspec(
2470                data,
2471                y,
2472                weights,
2473                offset,
2474                &fixed_kappa_spec,
2475                family.clone(),
2476                options,
2477            )?;
2478            let frozen_spec =
2479                freeze_term_collection_from_design(&fixed_kappa_spec, &fitted.design)?;
2480            let mut fit = fitted.fit;
2481            // Stamp the κ = 0 baseline REML score, exactly as
2482            // `fit_frozen_baseline_geometry` does for its chosen geometry. The
2483            // outer `require_successful_spatial_optimization_result` guard exists
2484            // to reject genuine optimizer DIVERGENCE (a κ that the production REML
2485            // it minimises says is worse than the seed). It does NOT apply here:
2486            // κ̂ is deliberately chosen by the κ-FAIR criterion precisely because
2487            // the production REML is sign-blind in κ and would always score a
2488            // genuinely-curved κ̂ as "worse" than flat. Reporting the baseline
2489            // score keeps the principled κ̂ from being spuriously rejected, while
2490            // the fitted β / λ are the real ρ-profiled fit AT κ̂. (The CI/flatness
2491            // statistics downstream re-profile V_p around κ̂ on their own.)
2492            fit.reml_score = baseline_score;
2493            return Ok(Some(FittedTermCollectionWithSpec {
2494                fit,
2495                design: fitted.design,
2496                resolvedspec: frozen_spec,
2497                adaptive_diagnostics: fitted.adaptive_diagnostics,
2498                kappa_timing: None,
2499            }));
2500        }
2501    }
2502
2503    if try_build_spatial_log_kappa_hyper_dirs(data, resolvedspec, &best.design, spatial_terms)?
2504        .is_none()
2505    {
2506        if !constant_curvature_term_indices(resolvedspec).is_empty() {
2507            log::info!(
2508                "[#1464-trace] try_exact_joint RETURNED None (hyper_dirs unavailable); \
2509                 κ̂ comes from a NON-joint path"
2510            );
2511        }
2512        return Ok(None);
2513    }
2514    if !constant_curvature_term_indices(resolvedspec).is_empty() {
2515        log::info!(
2516            "[#1464-trace] try_exact_joint ENTERED for {} spatial term(s); CC present",
2517            spatial_terms.len()
2518        );
2519    }
2520
2521    const JOINT_RHO_BOUND: f64 = 12.0;
2522    let rho_dim = best.fit.lambdas.len();
2523
2524    // #1464: a constant-curvature `curv()` term's geodesic-exponential kernel
2525    // COLLAPSES toward the constant function as κ grows positive (sphere
2526    // distances compress), so its global REML optimum at the +κ side is a LARGE
2527    // smoothing λ — often ρ > +JOINT_RHO_BOUND. With the symmetric ±12 box the
2528    // joint [ρ,ψ] optimizer is structurally clamped into the shallow
2529    // under-smoothing basin whose spuriously-low deviance rails κ̂ to the +chart
2530    // bound for any curved data (hyperbolic truth mis-recovered as spherical).
2531    // When a constant-curvature term is present, widen ONLY the over-smoothing
2532    // (upper) ρ bound to the standard `RHO_BOUND`, leaving the lower bound at
2533    // −JOINT_RHO_BOUND so an overfit origin is never reachable — the same
2534    // asymmetric-bound rationale the standard scalar-ρ path uses for the
2535    // gam#1266 high-λ basin. Every other spatial/Matérn/Duchon/sphere joint fit
2536    // keeps the historical ±12 box byte-for-byte.
2537    let has_constant_curvature_term = !constant_curvature_term_indices(resolvedspec).is_empty();
2538    let rho_upper_bound = if has_constant_curvature_term {
2539        gam_solve::estimate::RHO_BOUND
2540    } else {
2541        JOINT_RHO_BOUND
2542    };
2543
2544    // Compute per-term dimensionality for anisotropic terms.
2545    let dims_per_term = spatial_dims_per_term(resolvedspec, spatial_terms);
2546    let use_aniso = has_aniso_terms(resolvedspec, spatial_terms);
2547
2548    // Build initial ψ values and bounds, using aniso-aware constructors
2549    // when any term has d > 1 axes. Bounds are tied to each term's center
2550    // geometry (r_min, r_max) so κ cannot saturate at an upper bound that
2551    // has no relationship to the data's distance scale.
2552    let log_kappa0 = if use_aniso {
2553        SpatialLogKappaCoords::from_length_scales_aniso(resolvedspec, spatial_terms, kappa_options)
2554    } else {
2555        SpatialLogKappaCoords::from_length_scales(resolvedspec, spatial_terms, kappa_options)
2556    };
2557    // If the user/spec did not set a length_scale, re-seed ψ at the midpoint
2558    // of the data-derived window instead of the arbitrary options fallback.
2559    let mut log_kappa0 =
2560        log_kappa0.reseed_from_data(data, resolvedspec, spatial_terms, kappa_options);
2561    // #1464: for each constant-curvature term, pick the κ-sign basin from the
2562    // sign-correct fixed-κ profiled-REML criterion (κ-sign PINNED during each
2563    // ρ-profile) and seed the joint solver THERE, instead of letting the joint
2564    // [ρ, ψ] optimiser descend from a single near-zero κ seed into the spurious
2565    // +κ collapsed-kernel corner that rails κ̂ to the +chart bound regardless of
2566    // the true sign. CC-gated: non-CC spatial/Matérn/Duchon/sphere joint fits
2567    // never enter this loop, so their seed is byte-identical to before. The κ-opt
2568    // OFF profiled fits are the SAME criterion `curvature_inference_forspec`
2569    // already trusts for the CI, so this reuses a verified sign-correct oracle.
2570    // Records `(slot, selected_kappa_seed)` for each constant-curvature term so
2571    // the joint ψ bounds can be HARD-PINNED to the selected sign's half-axis
2572    // below: the joint ARC genuinely prefers the collapsed +κ corner (its
2573    // production REML there is lower than the correct basin), so a seed alone is
2574    // not enough — without a one-sided bound the optimiser walks back across
2575    // κ = 0 to the spurious corner (the observed #1464 bit-identical railing).
2576    let mut cc_sign_seeds: Vec<(usize, f64)> = Vec::new();
2577    if has_constant_curvature_term {
2578        for (slot, &term_idx) in spatial_terms.iter().enumerate() {
2579            if constant_curvature_term_spec(resolvedspec, term_idx).is_none() {
2580                continue;
2581            }
2582            let scan = select_constant_curvature_kappa_sign_seed(
2583                data,
2584                y,
2585                resolvedspec,
2586                term_idx,
2587            );
2588            // #1464 diagnostic: what the κ-fair sign-basin scan picked for this CC
2589            // term, before any joint solve. If this prints a negative κ for the
2590            // hyperbolic dataset but the final κ̂ is +1.08, the bug is downstream of
2591            // the scan (solver railing or readback), not the scan.
2592            match scan {
2593                Some(kappa_seed) => {
2594                    log::info!(
2595                        "[#1464-trace] term {term_idx}: κ-fair sign-basin scan picked κ_seed = {kappa_seed}"
2596                    );
2597                    log_kappa0.set_scalar_slot(slot, kappa_seed);
2598                    cc_sign_seeds.push((slot, kappa_seed));
2599                }
2600                None => {
2601                    log::info!(
2602                        "[#1464-trace] term {term_idx}: fixed-κ sign-basin scan returned NONE (no seed applied)"
2603                    );
2604                }
2605            }
2606        }
2607    }
2608    let log_kappa_lower = if use_aniso {
2609        SpatialLogKappaCoords::lower_bounds_aniso_from_data(
2610            data,
2611            resolvedspec,
2612            spatial_terms,
2613            &dims_per_term,
2614            kappa_options,
2615        )
2616    } else {
2617        SpatialLogKappaCoords::lower_bounds_from_data(
2618            data,
2619            resolvedspec,
2620            spatial_terms,
2621            kappa_options,
2622        )
2623    };
2624    let log_kappa_upper = if use_aniso {
2625        SpatialLogKappaCoords::upper_bounds_aniso_from_data(
2626            data,
2627            resolvedspec,
2628            spatial_terms,
2629            &dims_per_term,
2630            kappa_options,
2631        )
2632    } else {
2633        SpatialLogKappaCoords::upper_bounds_from_data(
2634            data,
2635            resolvedspec,
2636            spatial_terms,
2637            kappa_options,
2638        )
2639    };
2640    // #1464 hard κ-PIN: for each constant-curvature term whose κ-FAIR sign-basin
2641    // scan chose a definite sign, FREEZE the joint ψ coordinate at the scanned
2642    // κ value (both bounds = κ_seed) rather than only closing the far half-axis at
2643    // κ = 0. Why the full freeze and not the half-axis pin: the joint solver
2644    // refines κ against the production profiled-REML `fit_score`, and that raw
2645    // criterion is SIGN-BLIND — on a generic radial signal its data-fit term
2646    // decreases MONOTONICALLY toward +κ for BOTH spherical and hyperbolic truth
2647    // (the +κ chart compresses geodesic distances → a uniformly better radial
2648    // interpolator; verified by `bug_hunt_1464_criterion_vs_solver`, V_p(+2) <
2649    // V_p(0) < V_p(−2) for hyperbolic data). So a half-axis window [κ_min, 0] does
2650    // NOT stop the rail: the solver walks κ to the 0-edge (κ̂ → 0, the observed
2651    // hyperbolic-recovered-as-spherical failure). Only the κ-FAIR scan
2652    // (`constant_curvature_kappa_fair_sign_score`, which subtracts the design's
2653    // generic radial-peak-fitting power) is sign-identifying, and since the κ
2654    // MAGNITUDE is unidentified here (V_p is monotone — it rails to whichever
2655    // bound the window exposes), the scan's argmin is the authoritative κ̂. Freeze
2656    // there and let the joint solve optimize only ρ (and any non-CC ψ) at that κ.
2657    // This is byte-identical to the prior behaviour for SPHERICAL data — the
2658    // half-axis pin already railed κ̂ to κ_max = the scan value — and only changes
2659    // the negative-sign cases, which previously railed to 0. A scan result of
2660    // exactly κ = 0 (genuinely flat) leaves the window untouched. CC-gated —
2661    // non-CC terms are never in `cc_sign_seeds`, so every other
2662    // spatial/Matérn/Duchon/sphere joint window is byte-identical to before.
2663    let mut log_kappa_lower = log_kappa_lower;
2664    let mut log_kappa_upper = log_kappa_upper;
2665    for &(slot, kappa_seed) in &cc_sign_seeds {
2666        if kappa_seed != 0.0 {
2667            log_kappa_lower.set_scalar_slot(slot, kappa_seed);
2668            log_kappa_upper.set_scalar_slot(slot, kappa_seed);
2669        }
2670        log::info!(
2671            "[#1464-trace] slot {slot}: FROZE joint ψ coordinate at κ_seed={kappa_seed} \
2672             (window [{}, {}]); raw fit_score is sign-blind so the κ-fair scan is authoritative",
2673            log_kappa_lower.as_array()[log_kappa_lower.dims_per_term()[..slot].iter().sum::<usize>()],
2674            log_kappa_upper.as_array()[log_kappa_upper.dims_per_term()[..slot].iter().sum::<usize>()],
2675        );
2676    }
2677    // Project seed onto data-derived bounds; spec.length_scale is a hint,
2678    // not a hard constraint. BFGS requires theta0 ∈ [lower, upper].
2679    let log_kappa0 = log_kappa0.clamp_to_bounds(&log_kappa_lower, &log_kappa_upper);
2680    let setup = ExactJointHyperSetup::new(
2681        best.fit.lambdas.mapv(f64::ln),
2682        Array1::<f64>::from_elem(rho_dim, -JOINT_RHO_BOUND),
2683        Array1::<f64>::from_elem(rho_dim, rho_upper_bound),
2684        log_kappa0,
2685        log_kappa_lower,
2686        log_kappa_upper,
2687    );
2688
2689    let theta0 = setup.theta0();
2690    let lower = setup.lower();
2691    let upper = setup.upper();
2692
2693    // ───────────────────────────────────────────────────────────────────────
2694    //  Both coordinate kinds drive the SAME exact joint optimizer
2695    //  (`run_exact_joint_spatial_optimization`): the unified REML evaluator with
2696    //  ext_coords for joint [ρ, ψ] optimization, with analytic gradient +
2697    //  Hessian flowing through the
2698    //  AnisoBasisPsiDerivatives / SpatialPsiDerivative → DirectionalHyperParam →
2699    //  HyperCoord pipeline for Newton/BFGS quadratic convergence. The only
2700    //  difference is the coordinate kind: anisotropic carries one ψ per axis per
2701    //  term, isotropic one log-κ per term. `outer_strategy` handles the
2702    //  centralized degradation path when the analytic Hessian is unavailable.
2703    // ───────────────────────────────────────────────────────────────────────
2704    let kind = if use_aniso {
2705        SpatialHyperKind::Anisotropic
2706    } else {
2707        SpatialHyperKind::Isotropic
2708    };
2709    let (outcome, kappa_timing) = run_exact_joint_spatial_optimization(
2710        kind,
2711        data,
2712        y,
2713        weights,
2714        offset,
2715        resolvedspec,
2716        &best.design,
2717        family.clone(),
2718        options,
2719        spatial_terms,
2720        &dims_per_term,
2721        &theta0,
2722        &lower,
2723        &upper,
2724        rho_dim,
2725        kappa_options,
2726    )?;
2727
2728    let baseline_score = fit_score(&best.fit);
2729
2730    // The joint κ optimizer is a refinement on top of the frozen baseline
2731    // geometry, never a precondition for a fit. There are two ways its candidate
2732    // is not adopted, and both keep the baseline rather than aborting:
2733    //   1. it ran to a finite cost but did not certify a stationary point
2734    //      (`NonConverged`) — the formula/FFI path's tight outer tolerance can
2735    //      leave the optimizer mid-descent at the iteration cap where the CLI's
2736    //      looser tolerance converges (#1126); and
2737    //   2. it converged to a candidate whose certified cost worsens the profiled
2738    //      score (the gate below).
2739    let (theta_star, joint_final_value) = match outcome {
2740        SpatialJointOutcome::Optimized {
2741            theta_star,
2742            final_value,
2743        } => (theta_star, final_value),
2744        SpatialJointOutcome::NonConverged {
2745            iterations,
2746            final_value,
2747            final_grad_norm,
2748        } => {
2749            if has_constant_curvature_term {
2750                log::info!(
2751                    "[#1464-trace] joint solve NONCONVERGED (iters={iterations}, \
2752                     final_value={final_value}); returning FROZEN BASELINE geometry \
2753                     (κ̂ = spec default, NOT the joint candidate)"
2754                );
2755            }
2756            log::info!(
2757                "[spatial-kappa] joint spatial optimization did not converge \
2758                 (iterations={}, final_objective={:.6e}, final_grad_norm={}); \
2759                 keeping the frozen baseline geometry",
2760                iterations,
2761                final_value,
2762                final_grad_norm.map_or_else(|| "n/a".to_string(), |g| format!("{g:.3e}")),
2763            );
2764            return Ok(Some(fit_frozen_baseline_geometry(
2765                data,
2766                y,
2767                weights,
2768                offset,
2769                resolvedspec,
2770                best,
2771                family,
2772                options,
2773                baseline_score,
2774                Some(kappa_timing),
2775            )?));
2776        }
2777    };
2778
2779    // Compare the joint optimizer's certified cost (final_value at theta*)
2780    // against the baseline. Tolerance ≥ options.tol because both endpoints
2781    // are outer-BFGS approximations accurate to options.tol; a tighter
2782    // gate would reject true improvements due to floating-point noise.
2783    let accept_tol = options.tol.max(1e-8 * baseline_score.abs()).max(1e-12);
2784    if joint_final_value > baseline_score + accept_tol {
2785        if has_constant_curvature_term {
2786            log::info!(
2787                "[#1464-trace] joint candidate WORSENED score (joint={joint_final_value}, \
2788                 baseline={baseline_score}); returning FROZEN BASELINE geometry \
2789                 (κ̂ = spec default, NOT the joint candidate)"
2790            );
2791        }
2792        log::info!(
2793            "[spatial-kappa] exact joint spatial candidate worsened the profiled score (joint={:.6e}, baseline={:.6e}, tol={:.2e}); keeping the frozen baseline geometry",
2794            joint_final_value,
2795            baseline_score,
2796            accept_tol,
2797        );
2798        return Ok(Some(fit_frozen_baseline_geometry(
2799            data,
2800            y,
2801            weights,
2802            offset,
2803            resolvedspec,
2804            best,
2805            family,
2806            options,
2807            baseline_score,
2808            Some(kappa_timing),
2809        )?));
2810    }
2811
2812    let rho_star = theta_star.slice(s![..rho_dim]).mapv(f64::exp);
2813    let log_kappa_star =
2814        SpatialLogKappaCoords::from_theta_tail_with_dims(&theta_star, rho_dim, dims_per_term);
2815    // #1464 diagnostic (ban-clean): the joint solver's CONVERGED ψ-tail κ for each
2816    // CC term — the value BEFORE any spec write-back / freeze / readback. If this
2817    // is negative for the hyperbolic dataset but `get_constant_curvature_kappa`
2818    // later returns +1.08, the railing is a POST-SOLVE clamp/readback, not the
2819    // optimiser. If this is itself +1.08, the joint solver railed past the pin.
2820    if has_constant_curvature_term {
2821        let star = log_kappa_star.as_array();
2822        let dims = log_kappa_star.dims_per_term();
2823        for (slot, &term_idx) in spatial_terms.iter().enumerate() {
2824            if constant_curvature_term_spec(resolvedspec, term_idx).is_some() {
2825                let off: usize = dims[..slot].iter().sum();
2826                log::info!(
2827                    "[#1464-trace] term {term_idx}: joint solver CONVERGED ψ-tail κ = {} \
2828                     (this is the optimised candidate; joint_final_value={joint_final_value})",
2829                    star[off]
2830                );
2831            }
2832        }
2833    }
2834    // Keep a handle on the baseline geometry spec before shadowing `resolvedspec`
2835    // with the κ-optimized spec, so the #1357 degenerate-corner guard below can
2836    // fall back to the frozen baseline.
2837    let baseline_spec = resolvedspec;
2838    let optimized_spec = log_kappa_star.apply_tospec(resolvedspec, spatial_terms)?;
2839    let optimized = fit_term_collection_forspecwith_heuristic_lambdas(
2840        data,
2841        y,
2842        weights,
2843        offset,
2844        &optimized_spec,
2845        rho_star.as_slice(),
2846        family.clone(),
2847        options,
2848    )?;
2849
2850    // #1357 degenerate-corner guard. In the flat (ρ, κ) valley the joint
2851    // optimizer can certify a κ at which the kernel block goes nearly flat and
2852    // REML then shrinks the whole smooth onto its intercept (EDF → the null
2853    // floor, prediction returns a constant surface). Such a corner can carry a
2854    // *better* profiled REML cost than the informative baseline — the
2855    // smoothing-correction trace flips between the near-boundary cubature and
2856    // first-order branches across draws, so the `joint_final_value` ≤
2857    // `baseline_score` gate above does not catch it. The frozen baseline
2858    // geometry (the data-derived default length scale with its own REML-seeded
2859    // λ) keeps the kernel informative, so when the joint optimum has collapsed
2860    // to the null while the baseline has materially more effective DOF, reject
2861    // the optimum and keep the baseline. This never blocks a genuine refinement:
2862    // the baseline is only preferred when the joint candidate is degenerate.
2863    let optimized_edf = optimized.fit.inference.as_ref().map(|inf| inf.edf_total);
2864    if let Some(opt_edf) = optimized_edf
2865        && opt_edf < SPATIAL_COLLAPSE_EDF_FLOOR
2866    {
2867        let baseline = fit_frozen_baseline_geometry(
2868            data,
2869            y,
2870            weights,
2871            offset,
2872            baseline_spec,
2873            best,
2874            family.clone(),
2875            options,
2876            baseline_score,
2877            Some(kappa_timing),
2878        )?;
2879        let baseline_edf = baseline.fit.inference.as_ref().map(|inf| inf.edf_total);
2880        if let Some(base_edf) = baseline_edf
2881            && base_edf >= opt_edf + SPATIAL_COLLAPSE_EDF_MARGIN
2882        {
2883            log::info!(
2884                "[spatial-kappa] joint candidate collapsed to the null (edf={opt_edf:.3}); \
2885                 baseline geometry retains edf={base_edf:.3} — keeping the frozen baseline",
2886            );
2887            return Ok(Some(baseline));
2888        }
2889        // Baseline is no better (both genuinely near-null, or baseline lacks
2890        // inference): keep the optimized candidate via the normal path below.
2891    }
2892
2893    // Stamp reml_score with joint_final_value so downstream consumers see a
2894    // score consistent with the gate decision; the refit serves as a
2895    // β/inference harvester at the certified (ρ*, ψ*).
2896    let mut fit = optimized.fit;
2897    fit.reml_score = joint_final_value;
2898    let optimized_result = FittedTermCollectionWithSpec {
2899        fit,
2900        design: optimized.design,
2901        resolvedspec: optimized_spec,
2902        adaptive_diagnostics: optimized.adaptive_diagnostics,
2903        kappa_timing: Some(kappa_timing),
2904    };
2905
2906    Ok(Some(optimized_result))
2907}
2908
/// EDF threshold for the #1357 collapse guards: a fit whose total effective
/// degrees of freedom falls below this is treated as an intercept-only / null
/// collapse of the spatial smooth — the model has shed essentially all
/// effective degrees of freedom beyond a handful of unpenalized coordinates.
///
/// The comparison is strict (`edf_total < SPATIAL_COLLAPSE_EDF_FLOOR`). It is
/// applied to the refit joint candidate by the joint optimizer's
/// degenerate-corner guard, and to the warm-started harvest by the collapse
/// retry in `fit_frozen_baseline_geometry`.
const SPATIAL_COLLAPSE_EDF_FLOOR: f64 = 2.5;
2913
/// A non-degenerate baseline must carry at least this much more effective DOF
/// than the collapsed fit before the baseline is preferred (#1357), so
/// genuinely-near-null surfaces (where both fits agree there is no signal) are
/// left untouched.
///
/// Used as an additive margin (`healthy_edf >= collapsed_edf + MARGIN`) in two
/// places: the joint optimizer's degenerate-corner guard (baseline vs. joint
/// candidate) and the warm-start collapse retry in
/// `fit_frozen_baseline_geometry` (`best` vs. the warm-started harvest).
const SPATIAL_COLLAPSE_EDF_MARGIN: f64 = 1.0;
2919
2920/// Re-fit at the frozen baseline geometry — the REML-seeded length scales and
2921/// heuristic λ already certified in `best` — and stamp the certified baseline
2922/// REML score onto the result.
2923///
2924/// This is the graceful-degradation target for the joint spatial-κ optimizer. It
2925/// is reached whenever the joint refinement is not adopted: when the optimizer
2926/// converges to a candidate that worsens the profiled score, *and* when it fails
2927/// to converge at all (#1126). The geometry is the same baseline the parent fit
2928/// started from, so it is always valid — the joint step can only ever improve on
2929/// it, never block it.
2930///
2931/// The refit is a β/inference harvester at the frozen baseline `resolvedspec`;
2932/// the score that geometry was certified at is
2933/// `baseline_score = fit_score(&best.fit)`. We stamp that certified value rather
2934/// than the harvest's own re-derived `reml_score`, which drifts because the
2935/// harvest runs the full-inference option set (and re-runs the adaptive spatial
2936/// overlay) instead of the superseded baseline path that produced `best`. The
2937/// spatial-κ result gate (`require_successful_spatial_optimization_result`)
2938/// compares the returned fit's `fit_score` against `fit_score(&best.fit)`;
2939/// without this stamp a downward drift of a few REML units on the *same*
2940/// geometry spuriously reads as "the optimizer made the score worse" and aborts
2941/// an otherwise-valid fit. Stamping keeps the returned score consistent with the
2942/// gate decision that selected this geometry, identical to the optimized branch.
2943///
2944/// #1357: the harvest warm-starts REML from `best.fit.lambdas` (reproducing the
2945/// certified baseline cheaply), but on the flat (ρ, κ) Matérn valley that warm
2946/// start can slide the ρ search into a degenerate basin that collapses the smooth
2947/// onto its intercept (EDF → 1) even though `best` at the same geometry is
2948/// healthy — the double-penalty nullspace-shrinkage block of `best`'s λ sits near
2949/// the shrink-out corner, and the relaxed log-λ cap then lets it run away. When
2950/// the warm-started harvest collapses far below `best`'s certified EDF, refit the
2951/// same geometry from scratch (no λ seed, exactly how `best` was produced); the
2952/// scratch fit recovers the healthy baseline. This retry only fires on the
2953/// collapse pathology, so warm-starting's speed/uniformity is preserved for every
2954/// non-degenerate fallback.
2955fn fit_frozen_baseline_geometry(
2956    data: ArrayView2<'_, f64>,
2957    y: ArrayView1<'_, f64>,
2958    weights: ArrayView1<'_, f64>,
2959    offset: ArrayView1<'_, f64>,
2960    resolvedspec: &TermCollectionSpec,
2961    best: &FittedTermCollection,
2962    family: LikelihoodSpec,
2963    options: &FitOptions,
2964    baseline_score: f64,
2965    kappa_timing: Option<SpatialLengthScaleOptimizationTiming>,
2966) -> Result<FittedTermCollectionWithSpec, EstimationError> {
2967    let baseline = fit_term_collection_forspecwith_heuristic_lambdas(
2968        data,
2969        y,
2970        weights,
2971        offset,
2972        resolvedspec,
2973        best.fit.lambdas.as_slice(),
2974        family.clone(),
2975        options,
2976    )?;
2977    // #1357 collapse retry: if the warm-started harvest shed essentially all of
2978    // `best`'s certified effective DOF (a flat-valley collapse onto the
2979    // intercept), re-derive λ from scratch — `best` itself was fit from scratch
2980    // and is healthy, so the scratch harvest reproduces it.
2981    let best_edf = best.fit.inference.as_ref().map(|inf| inf.edf_total);
2982    let baseline_edf = baseline.fit.inference.as_ref().map(|inf| inf.edf_total);
2983    let baseline = match (best_edf, baseline_edf) {
2984        (Some(best_edf), Some(base_edf))
2985            if base_edf < SPATIAL_COLLAPSE_EDF_FLOOR
2986                && best_edf >= base_edf + SPATIAL_COLLAPSE_EDF_MARGIN =>
2987        {
2988            log::info!(
2989                "[spatial-kappa] warm-started frozen baseline collapsed (edf={base_edf:.3}) \
2990                 below the certified baseline (edf={best_edf:.3}); refitting from scratch",
2991            );
2992            fit_term_collection_forspec(data, y, weights, offset, resolvedspec, family, options)?
2993        }
2994        _ => baseline,
2995    };
2996    let mut fit = baseline.fit;
2997    fit.reml_score = baseline_score;
2998    Ok(FittedTermCollectionWithSpec {
2999        fit,
3000        design: baseline.design,
3001        resolvedspec: resolvedspec.clone(),
3002        adaptive_diagnostics: baseline.adaptive_diagnostics,
3003        kappa_timing,
3004    })
3005}
3006
/// Which flavour of ψ-coordinate the exact joint spatial optimizer is driving.
///
/// Anisotropic and isotropic spatial terms run through the *same* joint
/// `[ρ, ψ]` optimizer: identical outer-Hessian policy, identical
/// `ExternalJointHyperEvaluator` wiring, identical multistart problem,
/// identical convergence processing, and an identical
/// `eval_full / eval_efs / eval_cost` inner loop that routes ψ through
/// `try_build_spatial_log_kappa_hyper_dirs`. Only the coordinate layout
/// differs, so the variant selects diagnostic strings and nothing else — the
/// numerics are shared verbatim.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum SpatialHyperKind {
    /// One log-scale coordinate per axis per term (ψ_a).
    Anisotropic,
    /// One log-κ coordinate per term.
    Isotropic,
}

impl SpatialHyperKind {
    /// Stable prefix for every `log::*` line; also the root of the
    /// `ExternalJointHyperEvaluator` / cost-only labels.
    fn label(self) -> &'static str {
        match self {
            Self::Isotropic => "spatial-iso-joint",
            Self::Anisotropic => "spatial-aniso-joint",
        }
    }

    /// Adjective spliced into error strings ("anisotropic" / "isotropic").
    fn adjective(self) -> &'static str {
        match self {
            Self::Isotropic => "isotropic",
            Self::Anisotropic => "anisotropic",
        }
    }

    /// Name of the directional coordinate under optimization ("psi" /
    /// "kappa"); appears only in hyper-direction construction error messages.
    fn coord_name(self) -> &'static str {
        match self {
            Self::Isotropic => "kappa",
            Self::Anisotropic => "psi",
        }
    }
}
3051
/// Owned observation-side inputs for the frozen-weight GLM ψ-tensor lane: the
/// response `y`, the `weights` and `offset` vectors, and the likelihood
/// `family`.
///
/// Held in `SpatialJointContext::frozen_glm_inputs`. When that field is `None`,
/// `frozen_glm_working_state` returns `Ok(None)` and no frozen-W statistics are
/// produced. Otherwise `frozen_glm_working_state` adds `offset` to `Xβ` and
/// evaluates `family` at the resulting η together with `y` and `weights`.
///
/// NOTE(review): the doc comment previously attached to this struct described
/// the shared joint-optimizer context (realized-design cache, joint REML
/// evaluator, coordinate `kind`, and the `eval_full / eval_efs / eval_cost`
/// methods) — that text describes `SpatialJointContext`, not this struct.
struct SpatialFrozenGlmInputs {
    /// Response vector passed to the family evaluation.
    y: Array1<f64>,
    /// Weight vector passed to the family evaluation (presumably the prior /
    /// observation weights — confirm against the constructor).
    weights: Array1<f64>,
    /// Offset added to `Xβ`; its length is checked against the number of rows
    /// of η before use.
    offset: Array1<f64>,
    /// Likelihood specification, cloned into each family evaluation.
    family: LikelihoodSpec,
}
3063
3064/// True when the frozen-weight GLM ψ-tensor (#1111 / #1033 mechanism (c)) is a
3065/// faithful first-Fisher-step provider for this family.
3066///
3067/// The mechanism freezes the working weight `w = w(η_warm)` and working response
3068/// `z = z(η_warm)` once per outer ψ-sweep, so it is exact for ANY family whose
3069/// per-iteration PIRLS reduces to a Gaussian working model with a SINGLE
3070/// canonical Fisher weight at a FIXED dispersion — i.e. the one-parameter
3071/// exponential families Binomial, Poisson, Gamma, and Negative-Binomial (the
3072/// θ-fixed running-seed weight `W = μθ/(θ+μ)` is a clean per-row Fisher weight).
3073/// These are precisely the "Poisson/Binomial/etc" families the issue names.
3074///
3075/// Tweedie and Beta jointly estimate an extra dispersion parameter that moves
3076/// the working weight outside the frozen snapshot, so the frozen-W stand-in is
3077/// not faithful for them and they keep the exact per-trial PIRLS rebuild.
3078/// Gaussian-identity is served by the (exact, converged) `PsiGramTensor` lane,
3079/// and Royston-Parmar is the survival path, neither of which routes here.
3080fn frozen_glm_tensor_eligible_family(family: &LikelihoodSpec) -> bool {
3081    !family.is_gaussian_identity()
3082        && matches!(
3083            &family.response,
3084            ResponseFamily::Binomial
3085                | ResponseFamily::Poisson
3086                | ResponseFamily::Gamma
3087                | ResponseFamily::NegativeBinomial { .. }
3088        )
3089}
3090
/// Shared context for the exact joint spatial optimizer's closures.
///
/// Holds the realized-design cache and the joint REML evaluator, plus the
/// coordinate `kind`, which only affects diagnostic labels / error-message
/// wording. It also carries the optional frozen-weight GLM ψ-tensor state
/// (inputs, ψ window, built tensor, one-shot latch, and the β-keyed weight
/// memo). The evaluation methods on this type serve both anisotropic and
/// isotropic spatial terms.
struct SpatialJointContext<'d> {
    /// Data view handed to the cache when it builds row-wise hyper-directions
    /// for the currently realized design.
    data: ArrayView2<'d, f64>,
    /// Number of leading ρ entries in θ. When `theta.len() == rho_dim + 1`,
    /// `theta[rho_dim]` is the single ψ coordinate.
    rho_dim: usize,
    /// Coordinate kind; used for log labels and error wording.
    kind: SpatialHyperKind,
    /// Realized-design cache: re-realizes the design for a given θ
    /// (`ensure_theta`), exposes it (`design()`), and memoizes evaluations.
    cache: SingleBlockExactJointDesignCache<'d>,
    /// Joint REML evaluator; also the host for the ψ-Gram tensor coverage
    /// queries and for the staged frozen-W GLM statistics.
    evaluator: gam_solve::estimate::ExternalJointHyperEvaluator<'d>,
    /// Observation-side inputs for the frozen-W GLM lane; `None` disables
    /// `frozen_glm_working_state` (it returns `Ok(None)`).
    frozen_glm_inputs: Option<SpatialFrozenGlmInputs>,
    /// `(psi_lo, psi_hi)` window passed to the frozen-W tensor build; when
    /// `None`, `ensure_frozen_glm_tensor` is a no-op.
    frozen_glm_psi_bounds: Option<(f64, f64)>,
    /// The frozen-W GLM ψ tensor, set once a build succeeds.
    frozen_glm_tensor: Option<gam_solve::glm_sufficient_lane::FrozenWeightGramTensor>,
    /// One-shot latch: once set, `ensure_frozen_glm_tensor` never retries the
    /// build, whether or not it produced a tensor.
    frozen_glm_tensor_attempted: bool,
    /// #1033: memo of the frozen-W trial Fisher weights keyed on the warm β that
    /// produced them. `stage_frozen_glm_trial_statistics` runs on EVERY κ trial
    /// (every cost / gradient probe), and the only β-dependent quantity it needs
    /// is the current Fisher weight vector `W(η)` (η = Xβ + offset) for the
    /// drift check and the n-free gradient soundness gate. Computing `W` is an
    /// O(n·p) GEMV + O(n) family evaluation; β only changes when the inner solve
    /// re-converges (after an accepted outer step), so recomputing it on every
    /// same-β probe was a redundant per-trial n-touch. Cache `(β, W)` and reuse
    /// `W` whenever β is unchanged — the GEMV runs once per distinct β, i.e.
    /// O(outer steps), not O(trials). `None` until the first compute / when no
    /// frozen-W inputs are installed.
    frozen_glm_weight_memo: Option<(Array1<f64>, Array1<f64>)>,
}
3114
/// Clause-by-clause snapshot of the n-free design-realization skip gate, as
/// filled in by `SpatialJointContext::nfree_skip_gate_status`.
#[derive(Clone, Copy, Debug, Default)]
struct NfreeSkipGateStatus {
    /// θ has exactly one coordinate after the ρ block.
    shape: bool,
    /// The ψ-Gram tensor covers ψ for both the value lane and the skip lane.
    value: bool,
    /// Gradient coverage holds at ψ, or no gradient was required.
    gradient: bool,
    /// The evaluator can re-key the penalty on the n-free fast path.
    penalty: bool,
    /// The evaluator has an n-free fast-path revision pinned.
    revision: bool,
    /// The evaluation is allowed to be second order (this *blocks* the skip).
    second_order: bool,
}

impl NfreeSkipGateStatus {
    /// Whether the recorded clauses jointly permit skipping design realization.
    ///
    /// Every positive clause must hold and the evaluation must not be second
    /// order. The gradient clause is only consulted when `require_gradient` is
    /// set.
    fn would_skip(self, require_gradient: bool) -> bool {
        let Self {
            shape,
            value,
            gradient,
            penalty,
            revision,
            second_order,
        } = self;
        let gradient_ok = gradient || !require_gradient;
        !second_order && shape && value && gradient_ok && penalty && revision
    }
}
3135
3136impl<'d> SpatialJointContext<'d> {
3137    fn nfree_skip_gate_status(
3138        &self,
3139        theta: &Array1<f64>,
3140        allow_second_order: bool,
3141        require_gradient: bool,
3142    ) -> NfreeSkipGateStatus {
3143        let shape = theta.len() == self.rho_dim + 1;
3144        let (value, gradient) = if shape {
3145            let psi = theta[self.rho_dim];
3146            (
3147                self.evaluator.psi_gram_tensor_covers(psi)
3148                    && self.evaluator.psi_gram_tensor_covers_skip(psi),
3149                !require_gradient || self.evaluator.psi_gram_tensor_covers_gradient(psi),
3150            )
3151        } else {
3152            (false, false)
3153        };
3154        NfreeSkipGateStatus {
3155            shape,
3156            value,
3157            gradient,
3158            penalty: self.evaluator.supports_nfree_penalty_rekey(),
3159            revision: self.evaluator.nfree_fast_path_revision().is_some(),
3160            second_order: allow_second_order,
3161        }
3162    }
3163
3164    fn frozen_glm_working_state(
3165        &self,
3166        beta: &Array1<f64>,
3167    ) -> Result<Option<(Array1<f64>, Array1<f64>)>, EstimationError> {
3168        let Some(inputs) = self.frozen_glm_inputs.as_ref() else {
3169            return Ok(None);
3170        };
3171        if beta.len() != self.cache.design().design.ncols() {
3172            return Ok(None);
3173        }
3174        let mut eta = self.cache.design().design.matrixvectormultiply(beta);
3175        if eta.len() != inputs.offset.len() {
3176            crate::bail_invalid_estim!(
3177                "frozen GLM tensor warm-state row mismatch: eta={}, offset={}",
3178                eta.len(),
3179                inputs.offset.len()
3180            );
3181        }
3182        eta += &inputs.offset;
3183        let obs = evaluate_standard_familyobservations(
3184            inputs.family.clone(),
3185            None,
3186            None,
3187            None,
3188            &inputs.y,
3189            &inputs.weights,
3190            &eta,
3191        )?;
3192        let mut working_response = obs.eta.clone();
3193        for i in 0..working_response.len() {
3194            let wi = obs.fisherweight[i].max(1e-12);
3195            working_response[i] += obs.score[i] / wi;
3196        }
3197        Ok(Some((obs.fisherweight, working_response)))
3198    }
3199
3200    /// #1033: the trial Fisher weight vector `W(η)` for `beta`, memoized on
3201    /// `beta`. `stage_frozen_glm_trial_statistics` consults `W` on EVERY κ trial
3202    /// (drift check + n-free gradient soundness gate) but `W` is a deterministic
3203    /// function of β (η = Xβ + offset), and β only changes when the inner solve
3204    /// re-converges — many cost / gradient probes share one β. Recompute the
3205    /// O(n·p) working state only when β differs from the memoized key; otherwise
3206    /// return the cached weights. Returns `None` exactly when
3207    /// `frozen_glm_working_state` does (no frozen-W inputs / β shape mismatch).
3208    fn frozen_glm_trial_weights(
3209        &mut self,
3210        beta: &Array1<f64>,
3211    ) -> Result<Option<Array1<f64>>, EstimationError> {
3212        if let Some((memo_beta, memo_w)) = self.frozen_glm_weight_memo.as_ref()
3213            && memo_beta.len() == beta.len()
3214            && memo_beta
3215                .iter()
3216                .zip(beta.iter())
3217                .all(|(a, b)| a.to_bits() == b.to_bits())
3218        {
3219            return Ok(Some(memo_w.clone()));
3220        }
3221        match self.frozen_glm_working_state(beta)? {
3222            Some((current_w, _)) => {
3223                self.frozen_glm_weight_memo = Some((beta.clone(), current_w.clone()));
3224                Ok(Some(current_w))
3225            }
3226            None => Ok(None),
3227        }
3228    }
3229
    /// Build the frozen-W GLM ψ tensor at most once, from the working state at
    /// `warm_beta`, and install it in `self.frozen_glm_tensor` if the build
    /// returns one.
    ///
    /// The call is a no-op (returns `Ok(())` without building) when:
    /// - a tensor is already installed or a build was already attempted;
    /// - no ψ bounds are configured (`frozen_glm_psi_bounds` is `None`);
    /// - `theta` is not in the single-ψ layout (`rho_dim + 1` entries) — this
    ///   also sets the attempted latch;
    /// - `warm_beta` is `None` — the latch is NOT set here, so a later call
    ///   that does have a β can still build;
    /// - `frozen_glm_working_state` yields `None` — this sets the latch.
    ///
    /// The build realizes the design at the ψ values requested by the builder
    /// (moving the cache away from `theta`), so the cache is re-realized at
    /// `theta` afterwards, before the latch is set and the outcome is logged.
    ///
    /// # Errors
    ///
    /// Propagates errors from `frozen_glm_working_state` and from the final
    /// `ensure_theta(theta)` (mapped to `EstimationError::InvalidInput`).
    fn ensure_frozen_glm_tensor(
        &mut self,
        theta: &Array1<f64>,
        warm_beta: Option<&Array1<f64>>,
    ) -> Result<(), EstimationError> {
        // One-shot: never rebuild, and never retry after a recorded attempt.
        if self.frozen_glm_tensor.is_some() || self.frozen_glm_tensor_attempted {
            return Ok(());
        }
        let Some((psi_lo, psi_hi)) = self.frozen_glm_psi_bounds else {
            return Ok(());
        };
        // Only the single-ψ layout is supported; latch so we do not re-check.
        if theta.len() != self.rho_dim + 1 {
            self.frozen_glm_tensor_attempted = true;
            return Ok(());
        }
        // No warm β yet: leave the latch clear so a later call may try again.
        let Some(beta) = warm_beta else {
            return Ok(());
        };
        let Some((frozen_w, working_z)) = self.frozen_glm_working_state(beta)? else {
            self.frozen_glm_tensor_attempted = true;
            return Ok(());
        };
        let theta_probe_base = theta.clone();
        let rho_dim = self.rho_dim;
        // Build through the evaluator so the frozen-W Gram is assembled in the
        // SAME conditioned `x_fit` column frame the inner PIRLS solve uses
        // (the evaluator owns the ψ-invariant parametric conditioning). Disjoint
        // mutable borrows of `cache` (in the realizer) and `evaluator` (the
        // build host) — both fields of `self` — exactly as the Gaussian
        // `build_and_set_psi_gram_tensor` site does.
        let Self {
            cache, evaluator, ..
        } = self;
        let tensor = evaluator.build_frozen_glm_gram_tensor(
            // Realizer: design at (ρ from `theta`, the requested ψ).
            |psi| {
                let mut theta_probe = theta_probe_base.clone();
                theta_probe[rho_dim] = psi;
                cache.ensure_theta(&theta_probe)?;
                Ok(cache.design().design.clone())
            },
            frozen_w.view(),
            working_z.view(),
            psi_lo,
            psi_hi,
        );
        // The realizer left the cache at a probe ψ; restore it to `theta`.
        self.cache
            .ensure_theta(theta)
            .map_err(EstimationError::InvalidInput)?;
        // Latch regardless of the build outcome.
        self.frozen_glm_tensor_attempted = true;
        if let Some(tensor) = tensor {
            self.frozen_glm_tensor = Some(tensor);
            log::info!(
                "[STAGE] {} certified frozen-W GLM ψ tensor over [{psi_lo:.3}, {psi_hi:.3}]",
                self.kind.label(),
            );
        } else {
            log::info!(
                "[STAGE] {} frozen-W GLM ψ tensor did not certify over [{psi_lo:.3}, {psi_hi:.3}]",
                self.kind.label(),
            );
        }
        Ok(())
    }
3293
    /// Decide, for this trial θ, which frozen-W GLM statistics to stage on the
    /// evaluator, and stage them.
    ///
    /// Both evaluator slots (first-step Gram and ψ-gradient pair) are written
    /// on every successful call — with `Some(..)` when the tensor serves the
    /// trial and `None` otherwise — so a miss clears whatever was staged
    /// before.
    ///
    /// Statistics are only considered when `theta` is in the single-ψ layout
    /// (`rho_dim + 1` entries), a tensor is installed and contains ψ, and
    /// `warm_beta` is available. Then:
    /// - the Gram at ψ is staged if the current Fisher weights are within
    ///   `FROZEN_GLM_WEIGHT_DRIFT_RTOL` of the tensor's frozen weights;
    /// - the `(∂G/∂ψ, ∂b/∂ψ)` pair is staged if `allow_gradient` is set, ψ is
    ///   inside the tensor's gradient window, and the tensor deems the pair
    ///   sound for the current weights. This check is independent of the Gram
    ///   drift check above.
    ///
    /// # Errors
    ///
    /// Propagates errors from `frozen_glm_trial_weights`; in that case the
    /// function returns before touching the evaluator's staged slots.
    fn stage_frozen_glm_trial_statistics(
        &mut self,
        theta: &Array1<f64>,
        warm_beta: Option<&Array1<f64>>,
        allow_gradient: bool,
    ) -> Result<(), EstimationError> {
        let kind = self.kind;
        // Default to "nothing staged"; filled in below only on a hit.
        let mut staged_gram: Option<Array2<f64>> = None;
        let mut staged_deriv: Option<(Array2<f64>, Array1<f64>)> = None;
        if theta.len() == self.rho_dim + 1 {
            let psi = theta[self.rho_dim];
            // Compute the β-memoized trial Fisher weights up front (mutable
            // self borrow) so the immutable `self.frozen_glm_tensor` borrow
            // below does not alias it. `frozen_glm_trial_weights` recomputes the
            // O(n·p) working state only on a β change, so a same-β probe pays
            // nothing here (#1033). Only proceed when a tensor is installed and
            // covers this ψ — otherwise skip the weight compute entirely.
            let tensor_covers = self
                .frozen_glm_tensor
                .as_ref()
                .is_some_and(|t| t.contains(psi));
            let current_w = if tensor_covers {
                match warm_beta {
                    Some(beta) => self.frozen_glm_trial_weights(beta)?,
                    None => None,
                }
            } else {
                None
            };
            if let (Some(tensor), Some(current_w)) =
                (self.frozen_glm_tensor.as_ref(), current_w.as_ref())
            {
                // Relative tolerance for serving the first-step Gram.
                const FROZEN_GLM_WEIGHT_DRIFT_RTOL: f64 = 1e-3;
                if tensor.weight_drift_within(current_w.view(), FROZEN_GLM_WEIGHT_DRIFT_RTOL) {
                    staged_gram = Some(tensor.gram_at(psi));
                    log::debug!(
                        "[STAGE] {} trial at psi={psi:.6}: serving frozen-W GLM \
                         first-Fisher-step XᵀWX n-free (weight drift within tol)",
                        kind.label(),
                    );
                }
                // The gradient pair has its own window and soundness check,
                // evaluated whether or not the Gram above was staged.
                if allow_gradient
                    && tensor.contains_for_gradient(psi)
                    && let Some((dgram_dpsi, drhs_dpsi)) =
                        tensor.gradient_pair_if_sound(psi, current_w.view())
                {
                    staged_deriv = Some((dgram_dpsi, drhs_dpsi));
                    log::debug!(
                        "[STAGE] {} trial at psi={psi:.6}: serving frozen-W GLM \
                         ψ-gradient (∂G/∂ψ, ∂b/∂ψ) n-free (gradient weight drift within \
                         tight tol); B_j stays exact",
                        kind.label(),
                    );
                }
            }
        }
        // Always write both slots so a miss clears any previously staged value.
        self.evaluator.stage_glm_first_step_gram(staged_gram);
        self.evaluator.stage_glm_psi_gram_deriv(staged_deriv);
        Ok(())
    }
3354
3355    /// Full evaluation on the current realized design + hyper_dirs.
3356    fn eval_full(
3357        &mut self,
3358        theta: &Array1<f64>,
3359        order: gam_solve::rho_optimizer::OuterEvalOrder,
3360        analytic_outer_hessian_available: bool,
3361    ) -> Result<
3362        (
3363            f64,
3364            Array1<f64>,
3365            gam_problem::HessianResult,
3366        ),
3367        EstimationError,
3368    > {
3369        use gam_solve::rho_optimizer::OuterEvalOrder;
3370        let allow_second_order = matches!(order, OuterEvalOrder::ValueGradientHessian)
3371            && analytic_outer_hessian_available;
3372        if let Some(eval) = self.cache.memoized_eval(theta) {
3373            let cached_satisfies_order = !allow_second_order || eval.2.is_analytic();
3374            if cached_satisfies_order {
3375                return Ok(eval);
3376            }
3377        }
3378        let kind = self.kind;
3379        // #1033: the per-trial n×k design re-realization (`ensure_theta` →
3380        // `apply_log_kappa`) plus the downstream n-row reconditioning
3381        // (`reset_surface`) are the LAST n-passes in the certified κ loop. They
3382        // are redundant on the Gaussian-identity certified path: the inner
3383        // Gaussian PLS reads its `XᵀWX(ψ)/XᵀW(y−offset)(ψ)` entirely from the
3384        // ψ-keyed `GaussianFixedCache` the certified tensor installs (zero row
3385        // access), and the ψ-gradient HyperCoord is served from the k-space
3386        // `(∂G/∂ψ, ∂b/∂ψ)` tensor derivatives — never the n×k ∂X/∂ψ slab. So when
3387        //   (a) this is the single design-moving ψ coordinate (`rho_dim + 1`),
3388        //   (b) the certified ψ-Gram tensor covers ψ for BOTH the value lane
3389        //       (`psi_gram_tensor_covers`) AND the gradient window
3390        //       (`psi_gram_tensor_covers_gradient`) — so neither channel reads
3391        //       the realized rows,
3392        //   (c) this eval is gradient-only (`!allow_second_order`) — the exact
3393        //       outer-Hessian `B_j` path DOES read the slab, so a Hessian trial
3394        //       must keep a faithful (freshly realized) design, and
3395        //   (d) the evaluator has a pinned canonical slow-path revision — i.e.
3396        //       a prior slow-path eval already built a faithful reference surface,
3397        //       which `prepare_eval_state` will reuse while re-installing the
3398        //       ψ-keyed cache,
3399        // we SKIP `ensure_theta`. The realizer revision then does not advance, so
3400        // `prepare_eval_state` takes its design-revision fast path by receiving
3401        // that pinned revision back: it skips `reset_surface` + the n×k
3402        // `apply_to_design`, keeps the reference surface, and re-keys the
3403        // `GaussianFixedCache` to this ψ. The hyper_dirs built below are a pure
3404        // function of (data, frozen spec, column layout) — ψ-invariant — so they
3405        // are bit-identical whether or not the design was re-realized, and the
3406        // tensor branch never reads their n×k slab anyway. Net: criterion +
3407        // gradient + inner solve come from k-space statistics only, with no
3408        // per-trial O(n·k) pass.
3409        //
3410        // When ANY gate clause fails (non-Gaussian, off-window, off the gradient
3411        // sub-window, a Hessian eval, or no pinned canonical surface yet) we
3412        // realize the design as before so the slow path rebuilds a faithful
3413        // surface — the existing exact lane runs unchanged.
3414        let nfree_fast_path_revision = self.evaluator.nfree_fast_path_revision();
3415        let skip_design_realization = !allow_second_order && theta.len() == self.rho_dim + 1 && {
3416            let psi = theta[self.rho_dim];
3417            self.evaluator.psi_gram_tensor_covers(psi)
3418                    // #1033 gradient coverage: the skip serves the ψ-gradient n-free
3419                    // only where the analytic Chebyshev derivative is CERTIFIED.
3420                    // The kappa sufficient-statistic outer loop is routed here only
3421                    // when the certified gradient window spans the entire optimizer
3422                    // bounds, so a measured trial cannot pay an edge streamed
3423                    // ∂X/∂ψ pass after the initial priming eval.
3424                    && self.evaluator.psi_gram_tensor_covers_gradient(psi)
3425                    // #1264 (RESTORED) reduced-basis-rotation soundness precondition.
3426                    // The Gaussian inner penalized solve `(QsᵀGQs+S)β=b` runs in the
3427                    // CONDITIONED reduced basis. On the near-singular production
3428                    // Duchon Gram (κ(G)≈9.5e14) that basis ROTATES with ψ, and the
3429                    // skip installs the Chebyshev-interpolated `gram_at(ψ)` (≤1e-10
3430                    // vs streamed exact). When the trial-ψ basis differs from the
3431                    // reference surface's, the κ-amplified round-off moves β̂ by
3432                    // ~1.7e-5 — 17× the issue's 1e-6 bar — EVEN at a ψ the n-free
3433                    // VALUE window admits (cluster: β̂rel=1.749e-5 at ψ=2.803). The
3434                    // "stale-penalty-not-stale-basis" theory that dropped this gate
3435                    // was empirically refuted. So the skip is β̂-sound ONLY where the
3436                    // gauge-invariant range projector is unchanged vs the pinning ψ:
3437                    // `reduced_basis_equal(psi_ref, psi)`. Value coverage is NOT
3438                    // sufficient. This forces the exact O(n) `reset_surface` fallback
3439                    // across a basis rotation — correctness over n-independence
3440                    // (#1033 is frontier-blocked on rotating Duchon geometry).
3441                    && self.evaluator.psi_gram_tensor_covers_skip(psi)
3442                    // #1033 penalty lane: ψ moves S(ψ) too, and the skip leaves
3443                    // `reset_surface` un-run; only skip when the penalty can be
3444                    // rebuilt EXACTLY and n-free on the fast path, else the inner
3445                    // solve would pair XᵀWX(ψ_new) with the stale S(ψ_old).
3446                    && self.evaluator.supports_nfree_penalty_rekey()
3447                    && nfree_fast_path_revision.is_some()
3448        };
3449        if skip_design_realization {
3450            log::debug!(
3451                "[STAGE] {} eval_full at psi={:.6}: skipping n×k design re-realization \
3452                 + reconditioning — criterion/gradient/inner-solve served n-free from \
3453                 the certified ψ-gram tensor (GaussianFixedCache + k-space ψ-derivatives)",
3454                kind.label(),
3455                theta[self.rho_dim],
3456            );
3457        } else {
3458            self.cache
3459                .ensure_theta(theta)
3460                .map_err(EstimationError::InvalidInput)?;
3461        }
3462        let warm_beta = self.evaluator.current_beta();
3463        self.ensure_frozen_glm_tensor(theta, warm_beta.as_ref())?;
3464        // #1033 / #1111: stage the GLM frozen-W first-step Gram and conditioned
3465        // ψ-gradient whenever the certified frozen-weight tensor covers this
3466        // trial's ψ. The provider applies its drift guards, so misses clear the
3467        // staged slots and the exact streamed path runs.
3468        //
3469        // Stage through a shared helper because cost-only line-search probes use
3470        // the same first-Fisher-step Gram; they simply pass `allow_gradient=false`.
3471        self.stage_frozen_glm_trial_statistics(theta, warm_beta.as_ref(), !allow_second_order)?;
3472        // #1033: on the certified Gaussian skip path the value and ψ-gradient
3473        // are both served by k-space tensor statistics, so the row-wise X_ψ slab
3474        // is dead. Build only the exact n-free S_ψ components from frozen
3475        // geometry and attach a zero-storage design derivative placeholder.
3476        // Edge-gradient/Hessian/non-certified trials keep the exact row-wise
3477        // builder, because those lanes genuinely consume X_ψ.
3478        let hyper_dirs = if skip_design_realization {
3479            self.cache.nfree_tensor_gradient_hyper_dirs(theta)?
3480        } else {
3481            self.cache.hyper_dirs_for_current_design(self.data, kind)?
3482        };
3483
3484        let design_revision = if skip_design_realization {
3485            nfree_fast_path_revision
3486        } else {
3487            Some(self.cache.design_revision())
3488        };
3489        // #1033 penalty lane: stage the EXACT n-free `S(ψ)` for this trial so the
3490        // evaluator's design-revision fast path can re-key the kept reference
3491        // surface without `reset_surface`. Built from the FROZEN basis geometry
3492        // (centers + identifiability transform + operator collocation points) at
3493        // the trial length-scale — no data rows — so it is valid even on the
3494        // design-realization skip path (where the design was not re-realized). The
3495        // caller (holding `cache`) computes it and hands the owned result to the
3496        // evaluator, sidestepping a `&mut cache` borrow alias. On the slow path
3497        // the evaluator ignores + clears the staged value (it rebuilds S from the
3498        // realized design). A build error here clears the stage; if the skip
3499        // already fired (fast path), the evaluator then hard-errors rather than
3500        // pairing a stale S — the safe outcome, since a rebuild from frozen
3501        // geometry should never fail in practice.
3502        if self.evaluator.supports_nfree_penalty_rekey() {
3503            match self.cache.canonical_penalties_at(theta) {
3504                Ok(penalty) => self.evaluator.stage_fast_path_penalty(Some(penalty)),
3505                Err(e) => {
3506                    log::warn!(
3507                        "[STAGE] {} eval_full at psi={:.6}: exact n-free S(ψ) rebuild failed \
3508                         ({e}); clearing stage (eval falls to slow path)",
3509                        kind.label(),
3510                        theta[self.rho_dim],
3511                    );
3512                    self.evaluator.stage_fast_path_penalty(None);
3513                }
3514            }
3515        }
3516        // Warm-start PIRLS from the previous outer step's converged β. This is
3517        // especially impactful for GLM families (Poisson, NB, Binomial) that
3518        // cannot use the Gaussian Gram tensor n-free shortcut: without the warm
3519        // β every outer step cold-solves a full PIRLS from β=0, paying the full
3520        // O(n·p²) cost × PIRLS-iters × outer-iters budget. With the warm β the
3521        // inner solve typically converges in 1-2 Newton steps instead of 4-8.
3522        let eval = evaluate_joint_reml_outer_eval_at_theta(
3523            &mut self.evaluator,
3524            self.cache.design(),
3525            theta,
3526            self.rho_dim,
3527            hyper_dirs,
3528            warm_beta.as_ref().map(|b: &Array1<f64>| b.view()),
3529            if allow_second_order {
3530                order
3531            } else {
3532                OuterEvalOrder::ValueAndGradient
3533            },
3534            design_revision,
3535        );
3536        if let Ok(ref value) = eval {
3537            self.cache.store_eval_at(theta, value.clone());
3538        }
3539        eval
3540    }
3541
3542    fn eval_efs(
3543        &mut self,
3544        theta: &Array1<f64>,
3545    ) -> Result<gam_problem::EfsEval, EstimationError> {
3546        self.cache
3547            .ensure_theta(theta)
3548            .map_err(EstimationError::InvalidInput)?;
3549        let kind = self.kind;
3550        let hyper_dirs = try_build_spatial_log_kappa_hyper_dirs(
3551            self.data,
3552            self.cache.spec(),
3553            self.cache.design(),
3554            &self.cache.spatial_terms,
3555        )?
3556        .ok_or_else(|| {
3557            EstimationError::InvalidInput(format!(
3558                "failed to build {} hyper_dirs for exact-joint EFS",
3559                kind.adjective(),
3560            ))
3561        })?;
3562        let design_revision = Some(self.cache.design_revision());
3563        let warm_beta = self.evaluator.current_beta();
3564        evaluate_joint_reml_efs_at_theta(
3565            &mut self.evaluator,
3566            self.cache.design(),
3567            theta,
3568            self.rho_dim,
3569            hyper_dirs,
3570            warm_beta.as_ref().map(|b: &Array1<f64>| b.view()),
3571            design_revision,
3572        )
3573    }
3574
3575    /// Cost-only evaluation. BFGS line-search probes route through the
3576    /// evaluator's true value-only path so they neither construct
3577    /// `try_build_spatial_log_kappa_hyper_dirs` nor assemble a gradient that
3578    /// the line search will discard. Split-borrow on `self.cache` +
3579    /// `self.evaluator` matches the pattern already used by `eval_full`.
3580    fn eval_cost(&mut self, theta: &Array1<f64>) -> f64 {
3581        if let Some(cost) = self.cache.memoized_cost(theta) {
3582            return cost;
3583        }
3584        // #1029: a BFGS line-search VALUE probe. It converges the inner PIRLS to
3585        // the SAME tolerance the accepted-point full eval uses (NOT a capped
3586        // surrogate — a cap returns ∞ for a feasible point and re-imports the
3587        // #787/#808 outer stall), so probe and incumbent values live in ONE
3588        // refinement regime (measure-consistent Armijo). It is cheaper only
3589        // because it skips the gradient / hyper-dir assembly. Time the inner
3590        // cost-only solve and report it alongside the trial-θ distance from the
3591        // last evaluated point so this convergence-critical regression class is
3592        // visible in the STAGE trace (the spatial REML lane has no PROGRESS-
3593        // EXTENDED refine multiplier — that knob is SAE-only — so there is no
3594        // extended polish to strip from a probe here).
3595        //
3596        // Capture the previous evaluated θ BEFORE `ensure_theta` overwrites it,
3597        // so the logged distance reflects the backtracking step rather than 0.
3598        let probe_start = std::time::Instant::now();
3599        let psi_distance = self
3600            .cache
3601            .current_theta
3602            .as_ref()
3603            .filter(|reference| reference.len() == theta.len())
3604            .map(|reference| {
3605                reference
3606                    .iter()
3607                    .zip(theta.iter())
3608                    .map(|(a, b)| (a - b) * (a - b))
3609                    .sum::<f64>()
3610                    .sqrt()
3611            })
3612            .unwrap_or(f64::NAN);
3613        // #1033: a VALUE-only line-search probe needs only the certified ψ-Gram
3614        // tensor's value lane (`XᵀWX(ψ)/XᵀW(y−offset)(ψ)`), which the inner
3615        // Gaussian PLS reads n-free from the ψ-keyed `GaussianFixedCache`. So when
3616        // the single design-moving ψ is covered for the VALUE lane and the
3617        // evaluator has a pinned canonical slow-path revision, skip the n×k
3618        // design re-realization: `evaluate_cost_only` receives that pinned
3619        // revision, takes its `prepare_eval_state_cost_only` fast path (which
3620        // skips `reset_surface` + the n×k `apply_to_design` and re-keys the cache
3621        // to this probe's ψ), and the probe cost comes from k-space statistics
3622        // only. Line-search probes are the bulk of the κ-loop per-trial work, so
3623        // this is the dominant n-flat lever. Any miss (non-Gaussian, off-window,
3624        // missing penalty re-key support, or no pinned surface yet) realizes the
3625        // design and runs the exact streamed probe unchanged.
3626        let nfree_fast_path_revision = self.evaluator.nfree_fast_path_revision();
3627        let skip_value_realization = theta.len() == self.rho_dim + 1 && {
3628            let psi = theta[self.rho_dim];
3629            self.evaluator.psi_gram_tensor_covers(psi)
3630                    // #1264 (RESTORED): the value-only line-search probe runs the
3631                    // SAME conditioned-basis Gaussian solve, so it ships the same
3632                    // κ-amplified interpolated-Gram β̂ error across a basis rotation
3633                    // (cluster: β̂rel≈1.7e-5 ≫ 1e-6). The probe is β̂-sound only where the
3634                    // reduced basis is provably unchanged vs the pinning ψ, exactly
3635                    // as the eval_full gate. See the eval_full gate for the full
3636                    // justification; the dropped-precondition "stale-penalty" theory
3637                    // was empirically refuted.
3638                    && self.evaluator.psi_gram_tensor_covers_skip(psi)
3639                    // #1033 penalty lane: the value-probe fast path also skips
3640                    // `reset_surface`, so the probe must be able to re-key S(ψ)
3641                    // EXACTLY and n-free; otherwise its cost would use the stale
3642                    // S(ψ_old) and mis-rank the line search.
3643                    && self.evaluator.supports_nfree_penalty_rekey()
3644                    && nfree_fast_path_revision.is_some()
3645        };
3646        if theta.len() == self.rho_dim + 1
3647            && self.evaluator.has_psi_gram_tensor()
3648            && !self.evaluator.psi_gram_tensor_covers(theta[self.rho_dim])
3649        {
3650            self.cache.store_cost_at(theta, f64::INFINITY);
3651            return f64::INFINITY;
3652        }
3653        if !skip_value_realization && self.cache.ensure_theta(theta).is_err() {
3654            return f64::INFINITY;
3655        }
3656        // #1033 penalty lane: stage the EXACT n-free `S(ψ)` for this probe's ψ so
3657        // the cost-only fast path re-keys the kept surface without `reset_surface`
3658        // (built from frozen geometry — valid even when the design was not
3659        // re-realized). The slow path clears it. A rebuild failure clears the
3660        // stage; the evaluator then takes the slow path or hard-errors (safe).
3661        if self.evaluator.supports_nfree_penalty_rekey() {
3662            match self.cache.canonical_penalties_at(theta) {
3663                Ok(penalty) => self.evaluator.stage_fast_path_penalty(Some(penalty)),
3664                Err(_) => self.evaluator.stage_fast_path_penalty(None),
3665            }
3666        }
3667        let warm_beta = self.evaluator.current_beta();
3668        if let Err(err) = self.ensure_frozen_glm_tensor(theta, warm_beta.as_ref()) {
3669            log::warn!(
3670                "[STAGE] {} value-probe at psi={:.6}: frozen-W GLM tensor setup failed ({err}); \
3671                 falling back to exact streamed Gram",
3672                self.kind.label(),
3673                if theta.len() > self.rho_dim {
3674                    theta[self.rho_dim]
3675                } else {
3676                    f64::NAN
3677                },
3678            );
3679            self.evaluator.stage_glm_first_step_gram(None);
3680            self.evaluator.stage_glm_psi_gram_deriv(None);
3681        } else if let Err(err) =
3682            self.stage_frozen_glm_trial_statistics(theta, warm_beta.as_ref(), false)
3683        {
3684            log::warn!(
3685                "[STAGE] {} value-probe at psi={:.6}: frozen-W GLM staging failed ({err}); \
3686                 falling back to exact streamed Gram",
3687                self.kind.label(),
3688                if theta.len() > self.rho_dim {
3689                    theta[self.rho_dim]
3690                } else {
3691                    f64::NAN
3692                },
3693            );
3694            self.evaluator.stage_glm_first_step_gram(None);
3695            self.evaluator.stage_glm_psi_gram_deriv(None);
3696        }
3697        let design_revision = if skip_value_realization {
3698            nfree_fast_path_revision
3699        } else {
3700            Some(self.cache.design_revision())
3701        };
3702        let cost_label = self.kind.label();
3703        let result = {
3704            let design = self.cache.design();
3705            self.evaluator.evaluate_cost_only(
3706                &design.design,
3707                &design.penalties,
3708                &design.nullspace_dims,
3709                design.linear_constraints.clone(),
3710                theta,
3711                self.rho_dim,
3712                warm_beta.as_ref().map(|b: &Array1<f64>| b.view()),
3713                cost_label,
3714                design_revision,
3715            )
3716        };
3717        match result {
3718            Ok(cost) => {
3719                log::debug!(
3720                    "[STAGE] {cost_label} value-probe (order=Value): elapsed={:.3}s \
3721                     cost={cost:.6e} trial_theta_distance={psi_distance:.3e}",
3722                    probe_start.elapsed().as_secs_f64(),
3723                );
3724                self.cache.store_cost_at(theta, cost);
3725                cost
3726            }
3727            Err(_) => f64::INFINITY,
3728        }
3729    }
3730
3731    fn reset(&mut self) {
3732        self.cache.current_theta = None;
3733        self.cache.last_eval_theta = None;
3734        self.cache.last_cost = None;
3735        self.cache.last_eval = None;
3736    }
3737}
3738
/// Outcome of the joint spatial hyperparameter `(ρ, ψ/κ)` optimization.
///
/// The joint κ optimizer refines an *already-valid* frozen baseline geometry
/// (the REML-seeded length scales in `best`); it is therefore best-effort. A run
/// that does not certify a stationary point must degrade to the baseline rather
/// than abort the parent fit (#1126), so this enum lets the caller distinguish a
/// usable iterate from a non-convergence that should fall back to the baseline.
/// Genuine numerical blowups (a non-finite terminal cost) still surface as
/// `Err` from [`run_exact_joint_spatial_optimization`] and never reach here.
///
/// # Background: the exact joint `[ρ, ψ]` engine
///
/// NOTE(review): the paragraphs below describe the optimization engine that
/// produces this outcome, not the enum itself. They previously opened this doc
/// block, so rustdoc used them as the enum's summary; they probably belong on
/// [`run_exact_joint_spatial_optimization`] — confirm and move.
///
/// Exact joint `[ρ, ψ]` optimization for spatial terms using analytic
/// derivatives through the unified REML evaluator. This is the single shared
/// engine for both the anisotropic and isotropic coordinate kinds (selected by
/// `kind`).
///
/// At each outer iteration, the frozen term topology is reused and only the
/// spatial realized blocks affected by the current ψ are refreshed before the
/// unified evaluator returns cost + gradient + Hessian for the full
/// θ = [ρ, ψ] vector. The ψ derivatives flow through:
///
///   `AnisoBasisPsiDerivatives` / `SpatialPsiDerivative` → `DirectionalHyperParam`
///     → `build_tau_unified_objects` → `HyperCoord` ext_coords → unified evaluator
///
/// This gives Newton/BFGS quadratic convergence on the length-scale /
/// anisotropy parameters while jointly optimizing the smoothing parameters.
///
/// The ψ coordinates are parameterized as unconstrained log-scales. For the
/// anisotropic kind the decomposition into isotropic scale (ψ̄ = mean(ψ_a)) and
/// anisotropy (η_a = ψ_a − ψ̄, with Ση_a = 0) happens only on writeback via
/// `SpatialLogKappaCoords::apply_tospec`; the all-ones direction in ψ-space is
/// NOT a gauge direction — it controls the identifiable isotropic scale
/// κ = exp(ψ̄). The isotropic kind carries one log-κ coordinate per term. In
/// neither case is a sum-to-zero constraint enforced during optimization.
enum SpatialJointOutcome {
    /// The optimizer produced a usable iterate: it either converged to a
    /// stationary point or its terminal iterate cleared the mgcv-style
    /// relative-to-cost REML acceptance gate. Carries `(θ*, final_value)`.
    Optimized {
        theta_star: Array1<f64>,
        final_value: f64,
    },
    /// The optimizer ran to a finite terminal cost but neither converged nor
    /// cleared the relative-to-cost gate. The caller keeps the frozen baseline
    /// geometry; the fields are diagnostics only.
    NonConverged {
        iterations: usize,
        final_value: f64,
        final_grad_norm: Option<f64>,
    },
}
3788
3789fn kphase_log_norms(theta: &Array1<f64>, rho_dim: usize) -> (f64, f64) {
3790    let theta_norm = theta.iter().map(|v| v * v).sum::<f64>().sqrt();
3791    let log_kappa_norm = theta
3792        .iter()
3793        .skip(rho_dim)
3794        .map(|v| v * v)
3795        .sum::<f64>()
3796        .sqrt();
3797    (theta_norm, log_kappa_norm)
3798}
3799
3800fn run_exact_joint_spatial_optimization(
3801    kind: SpatialHyperKind,
3802    data: ArrayView2<'_, f64>,
3803    y: ArrayView1<'_, f64>,
3804    weights: ArrayView1<'_, f64>,
3805    offset: ArrayView1<'_, f64>,
3806    resolvedspec: &TermCollectionSpec,
3807    baseline_design: &TermCollectionDesign,
3808    family: LikelihoodSpec,
3809    options: &FitOptions,
3810    spatial_terms: &[usize],
3811    dims_per_term: &[usize],
3812    theta0: &Array1<f64>,
3813    lower: &Array1<f64>,
3814    upper: &Array1<f64>,
3815    rho_dim: usize,
3816    kappa_options: &SpatialLengthScaleOptimizationOptions,
3817) -> Result<(SpatialJointOutcome, SpatialLengthScaleOptimizationTiming), EstimationError> {
3818    let label = kind.label();
3819    // Use bounds and design metadata for validation.
3820    assert!(
3821        lower.len() == theta0.len() && upper.len() == theta0.len(),
3822        "spatial hyperparameter bounds must match theta length: lower_len={}, upper_len={}, theta_len={}",
3823        lower.len(),
3824        upper.len(),
3825        theta0.len()
3826    );
3827    assert!(
3828        baseline_design.smooth.terms.len() >= spatial_terms.len(),
3829        "baseline design must have at least one smooth term per spatial term: baseline_terms={}, spatial_terms={}",
3830        baseline_design.smooth.terms.len(),
3831        spatial_terms.len()
3832    );
3833    use gam_solve::rho_optimizer::OuterEvalOrder;
3834    use gam_problem::{DeclaredHessianForm, Derivative, OuterEval};
3835
3836    let theta_dim = theta0.len();
3837    // Directional-coordinate dimension: psi-per-axis (anisotropic) or
3838    // kappa-per-term (isotropic). The numerics below are identical either way.
3839    let coord_dim = theta_dim - rho_dim;
3840    // Capability is declared solely from derivative coverage, not from
3841    // problem size. The unified REML evaluator now exposes exact matrix-free
3842    // outer Hessian operators for the costly third/fourth-derivative
3843    // contractions used by spatial ψ coordinates; its internal
    // `(n, p, K)` work model chooses `HessianResult::Operator` at large
3845    // scale and the dense analytic matrix only below that crossover. Keeping
3846    // `Derivative::Analytic` here preserves ARC / trust-region-CG second-order
3847    // optimization for `n > 50_000` and `coord_dim > 30` instead of forcing the
3848    // obsolete HybridEFS compatibility path.
3849    let analytic_outer_hessian_available =
3850        exact_joint_spatial_outer_hessian_available(&family, baseline_design);
3851    if !analytic_outer_hessian_available {
3852        log::info!(
3853            "[{label}] analytic outer Hessian unavailable for family/design; routing without second-order geometry (coord_dim={coord_dim})"
3854        );
3855    }
3856    // Cost-aware second-order routing, mirroring the n-block path's
3857    // work-budget policy: past the pair budget gradient-only quasi-Newton
3858    // converges to the same optimum strictly cheaper per eval; below it,
3859    // exact second-order keeps the ARC/TR-CG geometry. The budget's
3860    // derivation is owned by `EXACT_JOINT_SECOND_ORDER_THETA_CAP`.
3861    let mut prefer_gradient_only = theta_dim > EXACT_JOINT_SECOND_ORDER_THETA_CAP;
3862    if prefer_gradient_only {
3863        log::info!(
3864            "[{label}] joint θ-dim {theta_dim} exceeds the exact pair-Hessian budget \
3865             ({EXACT_JOINT_SECOND_ORDER_THETA_CAP}); routing gradient-only quasi-Newton"
3866        );
3867    }
3868    // #1033: set when the n-free Gaussian ψ-lane arms below. It must SUPPRESS the
3869    // declared analytic outer Hessian (force `Unavailable`), not merely prefer
3870    // gradient-only: the planner keeps the second-order ARC solver whenever an
3871    // analytic Hessian is declared `Either`, even under `prefer_gradient_only`
3872    // (see `plan_prefer_gradient_only_does_not_hide_analytic_hessian`). A
3873    // `ValueGradientHessian` eval forces the O(n) design re-realization because
3874    // the outer Hessian curvature slab `B_j` is irreducibly n-dependent, so only
3875    // routing to a gradient-only solver (BFGS) keeps every in-window κ-trial on
3876    // the n-free `ValueAndGradient` skip.
3877    let mut suppress_outer_hessian_for_nfree = false;
3878
3879    log::trace!(
3880        "[{}] starting analytic optimization: rho_dim={}, coord_dim={}, dims_per_term={:?}",
3881        label,
3882        rho_dim,
3883        coord_dim,
3884        dims_per_term,
3885    );
3886
3887    let mut ctx = SpatialJointContext {
3888        data,
3889        rho_dim,
3890        kind,
3891        cache: SingleBlockExactJointDesignCache::new(
3892            data,
3893            resolvedspec.clone(),
3894            baseline_design.clone(),
3895            spatial_terms.to_vec(),
3896            rho_dim,
3897            dims_per_term.to_vec(),
3898        )
3899        .map_err(EstimationError::InvalidInput)?,
3900        evaluator: gam_solve::estimate::ExternalJointHyperEvaluator::new(
3901            y,
3902            weights,
3903            &baseline_design.design,
3904            offset,
3905            &baseline_design.penalties,
3906            &external_opts_for_design(&family, baseline_design, options),
3907            label,
3908        )?,
3909        frozen_glm_inputs: if coord_dim == 1 && frozen_glm_tensor_eligible_family(&family) {
3910            Some(SpatialFrozenGlmInputs {
3911                y: y.to_owned(),
3912                weights: weights.to_owned(),
3913                offset: offset.to_owned(),
3914                family: family.clone(),
3915            })
3916        } else {
3917            None
3918        },
3919        frozen_glm_psi_bounds: if coord_dim == 1 && frozen_glm_tensor_eligible_family(&family) {
3920            Some((lower[rho_dim], upper[rho_dim]))
3921        } else {
3922            None
3923        },
3924        frozen_glm_tensor: None,
3925        frozen_glm_tensor_attempted: false,
3926        frozen_glm_weight_memo: None,
3927    };
3928
3929    // #1033b: single isotropic design-moving coordinate on a Gaussian-identity
3930    // fit — build the certified Chebyshev-in-ψ Gram tensor ONCE over the
3931    // optimizer's ψ window and hand it to the evaluator. Every in-window trial
3932    // then receives its Gaussian sufficient statistics (XᵀWX(ψ), XᵀW(y−offset),
3933    // (y−offset)ᵀW(y−offset)) assembled n-free instead of paying the per-trial
3934    // O(n·p²) Gram re-stream after the design rebuild. The realizer closure
3935    // returns the RAW realized design; the evaluator threads it through its
3936    // own (fixed, ψ-invariant) parametric column conditioning so the tensor
3937    // lives in the same frame as the streamed Gram. Certification failure,
3938    // off-window trials, or any other ineligibility silently keep the exact
3939    // streamed path (same numbers, the tensor is certified to
3940    // PSI_GRAM_SPOT_RTOL against the exact rebuild).
3941    // #1033 (rank-stable κ-floor): set to the lowest ψ at which the certified
3942    // tensor's conditioned Gram holds maximal numerical rank. Below it the
3943    // reduced basis collapses/rotates and the design-realization skip is SOUNDLY
3944    // refused (→ O(n) reset_surface); the κ window floor `ln(2/r_max)` lands
3945    // inside that degenerate sliver and DRIFTS with n through the sample-std
3946    // standardization, so n=2000's line search re-enters the slow lane while
3947    // n=1000's does not. Lifting the optimizer's lower bound to this n-FREE
3948    // (k-space) floor keeps every in-window trial on the fast path for all n,
3949    // and only excludes over-smoothed length scales the `2/r_max` geometry floor
3950    // already meant to exclude (the κ-optimum lives well above it).
3951    let mut psi_rank_stable_floor: Option<f64> = None;
3952    // #1033 (rank-stable κ-ceiling): symmetric twin of the floor. The conditioned
3953    // Gram is rank-deficient at the HIGH window edge too (the longest-frequency
3954    // radial mode goes collinear), so a line-search overshoot above the maximal-
3955    // rank band soundly refuses the design-realization skip → O(n) reset_surface,
3956    // and the deficient pinning ψ it records makes the NEXT in-band trial reset a
3957    // second time. Clamping the optimizer's UPPER bound to this n-free k-space
3958    // ceiling keeps every trial inside the band. The κ-optimum lives well inside
3959    // it, so the clamp only excludes over-fit (too-short) length scales.
3960    let mut psi_rank_stable_ceiling: Option<f64> = None;
3961    let nfree_penalty_capable = coord_dim == 1
3962        && family.is_gaussian_identity()
3963        && ctx.cache.supports_nfree_penalty_rekey();
3964    if nfree_penalty_capable {
3965        let psi_lo = lower[rho_dim];
3966        let psi_hi = upper[rho_dim];
3967        let z = Array1::from_iter(y.iter().zip(offset.iter()).map(|(yi, oi)| yi - oi));
3968        let theta_probe_base = theta0.clone();
3969        // Disjoint mutable borrows of `cache` (in the realizer) and
3970        // `evaluator` (the build target) — both fields of `ctx`.
3971        let SpatialJointContext {
3972            cache, evaluator, ..
3973        } = &mut ctx;
3974        let attached = evaluator.build_and_set_psi_gram_tensor(
3975            |psi| {
3976                let mut theta_probe = theta_probe_base.clone();
3977                theta_probe[rho_dim] = psi;
3978                cache.ensure_theta(&theta_probe)?;
3979                Ok(cache.design().design.clone())
3980            },
3981            weights,
3982            z.view(),
3983            psi_lo,
3984            psi_hi,
3985        );
3986        if attached {
3987            log::info!(
3988                "[{label}] certified ψ-gram tensor over [{psi_lo:.3}, {psi_hi:.3}]: \
3989                 in-window trials assemble Gaussian sufficient statistics n-free"
3990            );
3991            // #1033: read the n-free rank-stable κ-floor off the k-space tensor.
3992            // Only lift INTO the window (never below psi_lo, never above the seed
3993            // ψ — the seed is the geometric-mean midpoint and is well clear of the
3994            // degenerate band), so the optimizer never starts outside its bounds.
3995            let psi_anchor = theta0[rho_dim];
3996            psi_rank_stable_floor = evaluator
3997                .psi_gram_rank_stable_floor(psi_anchor)
3998                .filter(|&f| f.is_finite() && f > psi_lo && f < psi_anchor);
3999            log::info!(
4000                "[KAPPA-PHASE-FLOOR] n_rows={} psi_lo={psi_lo:.6} psi_anchor={psi_anchor:.6} \
4001                 rank_stable_floor={:?} lifted={}",
4002                data.nrows(),
4003                evaluator.psi_gram_rank_stable_floor(psi_anchor),
4004                psi_rank_stable_floor.is_some(),
4005            );
4006            if let Some(floor) = psi_rank_stable_floor {
4007                log::info!(
4008                    "[{label}] rank-stable κ-floor ψ_floor={floor:.6} > window floor \
4009                     ψ_lo={psi_lo:.6}: lifting the optimizer lower bound to keep every \
4010                     in-window trial on the n-free design-realization skip (#1033). The \
4011                     conditioned Gram is rank-deficient below ψ_floor (longest-length-scale \
4012                     radial mode collapses into the nullspace), where the skip is soundly \
4013                     refused; that band drifts with n via the sample-std standardization, \
4014                     so this n-free k-space floor is the n-independent fix."
4015                );
4016            }
4017            // #1033: read the n-free rank-stable κ-CEILING (symmetric twin of the
4018            // floor). Only clamp INTO the window (strictly below psi_hi, strictly
4019            // above the seed ψ — the seed is the geometric-mean midpoint, well
4020            // inside the maximal-rank band), so the optimizer never starts outside
4021            // its bounds. This is the fix for the n=16000 fast-ladder resets: the
4022            // line search overshot to ψ≈1.0 (rank 11→10 at the high edge), tripping
4023            // two O(n) reset_surface calls; clamping the upper bound keeps the
4024            // search inside the band where the n-free skip stays sound.
4025            psi_rank_stable_ceiling = evaluator
4026                .psi_gram_rank_stable_ceiling(psi_anchor)
4027                .filter(|&c| c.is_finite() && c < psi_hi && c > psi_anchor);
4028            log::info!(
4029                "[KAPPA-PHASE-CEIL] n_rows={} psi_hi={psi_hi:.6} psi_anchor={psi_anchor:.6} \
4030                 rank_stable_ceiling={:?} clamped={}",
4031                data.nrows(),
4032                evaluator.psi_gram_rank_stable_ceiling(psi_anchor),
4033                psi_rank_stable_ceiling.is_some(),
4034            );
4035            if let Some(ceiling) = psi_rank_stable_ceiling {
4036                log::info!(
4037                    "[{label}] rank-stable κ-ceiling ψ_ceil={ceiling:.6} < window ceiling \
4038                     ψ_hi={psi_hi:.6}: clamping the optimizer upper bound to keep every \
4039                     in-window trial on the n-free design-realization skip (#1033). The \
4040                     conditioned Gram is rank-deficient above ψ_ceil (longest-frequency \
4041                     radial mode goes collinear), where the skip is soundly refused; a \
4042                     line-search overshoot there trips the O(n) reset_surface lane (and the \
4043                     deficient pinning ψ it records resets the next in-band trial too)."
4044                );
4045            }
4046            let gradient_covers_full_window = evaluator.psi_gram_tensor_covers_gradient(psi_lo)
4047                && evaluator.psi_gram_tensor_covers_gradient(psi_hi);
4048            if gradient_covers_full_window {
4049                log::info!(
4050                    "[{label}] certified ψ-gram tensor gradient lane covers the full \
4051                     optimizer window [{psi_lo:.3}, {psi_hi:.3}]"
4052                );
4053            } else {
4054                log::info!(
4055                    "[{label}] ψ-gram tensor value lane certified, but the gradient lane \
4056                     does not cover the full optimizer window [{psi_lo:.3}, {psi_hi:.3}]; \
4057                     keeping exact streamed kappa routing"
4058                );
4059            }
4060            // #1033 penalty lane: ψ also moves the penalty `S(ψ)` (the
4061            // Duchon/ThinPlate Hilbert scale is an analytic function of the
4062            // length-scale, built from the FROZEN basis CENTERS — not the data
4063            // rows). The design-revision fast path that the Gram tensor enables
4064            // SKIPS `reset_surface`, the only place the canonical penalty surface
4065            // is rebuilt; without re-keying, the inner solve would pair
4066            // `XᵀWX(ψ_new)` with the stale `S(ψ_old)` and converge to the wrong
4067            // β̂ / κ-optimum. Rather than interpolate `S(ψ)`, the fast path rebuilds
4068            // it EXACTLY and n-free per trial from the frozen geometry via
4069            // `cache.canonical_penalties_at(theta)` (the SAME
4070            // `canonicalize_penalty_specs` pipeline the slow `reset_surface` runs).
4071            // Here we only DECLARE the capability to the evaluator; the per-trial
4072            // staging happens in `eval_full` / `eval_cost`. The skip is enabled
4073            // exactly when the single spatial term's frozen metadata
4074            // (Duchon/ThinPlate) admits the exact rebuild. Matérn deliberately
4075            // does not enter this block: mixing tensor value probes with exact
4076            // streamed gradients/Hessians changed its selected κ enough to miss
4077            // the truth-recovery quality gate, so Matérn stays on one exact
4078            // streamed objective for value, gradient, and Hessian.
4079            evaluator.set_supports_nfree_penalty_rekey(true);
4080            log::info!(
4081                "[{label}] exact n-free ψ-penalty re-key enabled over [{psi_lo:.3}, \
4082                 {psi_hi:.3}]: in-window fast-path trials rebuild S(ψ) n-free from frozen \
4083                 geometry (no reset_surface)"
4084            );
4085        } else {
4086            log::info!(
4087                "[{label}] ψ-gram tensor did not certify over [{psi_lo:.3}, {psi_hi:.3}]; \
4088                 keeping the exact per-trial path"
4089            );
4090        }
4091        // #1033 (n-independent outer loop): with the n-free Gaussian lane fully
4092        // armed (Gram tensor attached + exact n-free penalty re-key), the design-
4093        // realization skip serves the criterion AND the ψ-gradient `(a_j, g_j)`
4094        // n-free for every in-window trial — but ONLY a `ValueAndGradient` eval
4095        // takes that skip. A `ValueGradientHessian` eval sets `allow_second_order`,
4096        // which forces `ensure_theta` → `reset_surface` (the O(n) design re-
4097        // realization) because the outer Hessian curvature `B_j` is the exact
4098        // n-dependent slab. So second-order outer steps are the LAST O(n) per-trial
4099        // cost in the κ search, and they make the outer loop scale with n. Route
4100        // gradient-only here: the spatial length-scale objective is smooth and the
4101        // budget policy already establishes that gradient-only quasi-Newton
4102        // converges to the same optimum strictly cheaper per eval past the pair-
4103        // Hessian budget — and with the tensor, the realized Hessian is the only
4104        // remaining expensive operation, so the same argument applies for ANY n
4105        // once the lane is armed. This keeps every in-window κ-trial on the n-free
4106        // `ValueAndGradient` skip, delivering the n-independent outer loop. The
4107        // exact second-order geometry is preserved whenever the lane is NOT armed
4108        // for gradient-only routing (non-Gaussian, multi-term, Matérn, or an
4109        // uncertified window), where it still pays O(n) per Hessian but keeps the
4110        // quality-sensitive exact second-order path.
4111        if attached
4112            && evaluator.psi_gram_tensor_covers_gradient(psi_lo)
4113            && evaluator.psi_gram_tensor_covers_gradient(psi_hi)
4114            && evaluator.supports_nfree_penalty_rekey()
4115            && cache.supports_nfree_gradient_only_routing()
4116        {
4117            suppress_outer_hessian_for_nfree = true;
4118            prefer_gradient_only = true;
4119            log::info!(
4120                "[{label}] n-free Gaussian ψ-lane armed; suppressing the analytic outer \
4121                 Hessian and routing gradient-only (BFGS) so the κ outer loop never realizes \
4122                the O(n) second-order slab — n-independent outer loop (#1033)"
4123            );
4124        }
4125    } else if coord_dim == 1 && family.is_gaussian_identity() {
4126        log::info!(
4127            "[{label}] exact n-free ψ-penalty re-key unavailable; skipping ψ-gram tensor \
4128             attachment so value, gradient, and Hessian remain on the same exact streamed \
4129             objective"
4130        );
4131    }
4132
4133    // ── Discriminating outer-gradient FD audit (issue #1040 / #944 merge gate) ──
4134    //
4135    // At θ₀, central-difference the outer criterion component-by-component and
4136    // compare it to the analytic outer gradient that drives this single-block
4137    // joint optimizer. This forks the two failure modes of a non-terminating
4138    // outer loop — an objective↔gradient DESYNC (analytic ≠ FD) vs weak
4139    // identifiability (analytic ≈ FD but a near-singular outer Hessian) — and is
4140    // the standing merge gate for any design-moving ψ-coordinate, including the
4141    // #944 raw-κ constant-curvature coordinate (labelled `psi_kappa[..]`).
4142    //
4143    // Gated strictly to diagnostic-sized problems (auto-derived from the
4144    // realized (n, θ_dim), no flag) so it never taxes a production fit. The
4145    // same gate the n-block driver uses.
4146    // FD-OK: FD-audit of the analytic outer gradient (small-problem gate, never feeds the optimizer)
4147    const OUTER_FD_AUDIT_MAX_N: usize = 4_000; // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4148    const OUTER_FD_AUDIT_MAX_THETA_DIM: usize = 32; // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4149    let n_total = data.nrows();
4150    let outer_fd_audit_eligible = analytic_outer_hessian_available // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4151        && n_total <= OUTER_FD_AUDIT_MAX_N // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4152        && theta_dim <= OUTER_FD_AUDIT_MAX_THETA_DIM; // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4153    log::warn!(
4154        "[OUTER-FD-AUDIT/spatial-exact-joint] gate eligible={outer_fd_audit_eligible} \
4155         analytic_grad={analytic_outer_hessian_available} n_total={n_total} \
4156         theta_dim={theta_dim} rho_dim={rho_dim} psi_dim={coord_dim}"
4157    );
4158    if outer_fd_audit_eligible {
4159        // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4160        let audit = (|| -> Result<gam_solve::rho_optimizer::OuterGradientFdAudit, String> {
4161            let mut eval_at = |theta: &Array1<f64>,
4162                               mode: gam_solve::estimate::reml::reml_outer_engine::EvalMode|
4163             -> Result<
4164                (
4165                    f64,
4166                    Array1<f64>,
4167                    gam_problem::HessianResult,
4168                ),
4169                String,
4170            > {
4171                use gam_solve::estimate::reml::reml_outer_engine::EvalMode;
4172                let order = if matches!(mode, EvalMode::ValueGradientHessian) {
4173                    OuterEvalOrder::ValueGradientHessian
4174                } else {
4175                    OuterEvalOrder::Value
4176                };
4177                ctx.eval_full(theta, order, analytic_outer_hessian_available)
4178                    .map_err(|e| format!("fd-audit eval_full: {e}"))
4179            };
4180            let rho_dim_audit = rho_dim;
4181            let label_fn = move |i: usize| -> String {
4182                if i < rho_dim_audit {
4183                    format!("rho[{i}]")
4184                } else {
4185                    format!("psi_kappa[{}]", i - rho_dim_audit)
4186                }
4187            };
4188            gam_solve::rho_optimizer::outer_gradient_fd_audit(
4189                // fd-ok: FD-audit gate, runs diagnostic oracle only, not in fit math
4190                theta0,
4191                1e-4,
4192                label_fn,
4193                &mut eval_at,
4194            )
4195        })();
4196        // END-FD-OK
4197        match audit {
4198            Ok(audit) => audit.log_verdict("spatial-exact-joint"),
4199            Err(e) => log::warn!("[OUTER-FD-AUDIT/spatial-exact-joint] skipped: {e}"),
4200        }
4201    }
4202
4203    let kphase_prime_order = if analytic_outer_hessian_available && !suppress_outer_hessian_for_nfree {
4204        OuterEvalOrder::ValueGradientHessian
4205    } else {
4206        OuterEvalOrder::ValueAndGradient
4207    };
4208    let kphase_prime_start = std::time::Instant::now();
4209    drop(ctx.eval_full(theta0, kphase_prime_order, analytic_outer_hessian_available)?);
4210    log::info!(
4211        "[KAPPA-PHASE-PRIME] n_rows={} order={:?} elapsed_s={:.4} slow_path_resets_total={} design_revision={}",
4212        data.nrows(),
4213        kphase_prime_order,
4214        kphase_prime_start.elapsed().as_secs_f64(),
4215        ctx.evaluator.slow_path_reset_count(),
4216        ctx.cache.design_revision(),
4217    );
4218
4219    let kphase_cost_calls = std::cell::Cell::new(0usize);
4220    let kphase_eval_calls = std::cell::Cell::new(0usize);
4221    let kphase_efs_calls = std::cell::Cell::new(0usize);
4222    let kphase_cost_total_s = std::cell::Cell::new(0.0);
4223    let kphase_eval_total_s = std::cell::Cell::new(0.0);
4224    let kphase_efs_total_s = std::cell::Cell::new(0.0);
4225    let kphase_nfree_miss_shape = std::cell::Cell::new(0u64);
4226    let kphase_nfree_miss_value = std::cell::Cell::new(0u64);
4227    let kphase_nfree_miss_gradient = std::cell::Cell::new(0u64);
4228    let kphase_nfree_miss_penalty = std::cell::Cell::new(0u64);
4229    let kphase_nfree_miss_revision = std::cell::Cell::new(0u64);
4230    let kphase_nfree_miss_second_order = std::cell::Cell::new(0u64);
4231    let kphase_nfree_miss_other = std::cell::Cell::new(0u64);
4232    let kphase_optim_start = std::time::Instant::now();
4233    let kphase_log_kappa_dim = coord_dim;
4234    let kphase_slow_resets_start = ctx.evaluator.slow_path_reset_count();
4235    let kphase_design_revision_start = ctx.cache.design_revision();
4236
4237    // #1033: lift the ψ (log-κ) lower bound to the n-free rank-stable floor so the
4238    // optimizer never line-searches into the rank-deficient sliver where the
4239    // design-realization skip is soundly refused (→ O(n) reset_surface). The lift
4240    // touches ONLY the single design-moving ψ coordinate at `rho_dim`; all ρ
4241    // bounds are untouched. `psi_rank_stable_floor` is already constrained to lie
4242    // strictly inside `(psi_lo, theta0[rho_dim])`, so theta0 stays feasible.
4243    let lower_effective: std::borrow::Cow<'_, Array1<f64>> = match psi_rank_stable_floor {
4244        Some(floor) if coord_dim == 1 && floor > lower[rho_dim] => {
4245            let mut lifted = lower.clone();
4246            lifted[rho_dim] = floor;
4247            std::borrow::Cow::Owned(lifted)
4248        }
4249        _ => std::borrow::Cow::Borrowed(lower),
4250    };
4251    let lower = lower_effective.as_ref();
4252
4253    // #1033: clamp the ψ (log-κ) upper bound DOWN to the n-free rank-stable ceiling
4254    // so the optimizer never line-searches into the high-edge rank-deficient sliver
4255    // where the design-realization skip is soundly refused (→ O(n) reset_surface,
4256    // plus a second reset from the deficient pinning ψ). Touches ONLY the single
4257    // design-moving ψ coordinate at `rho_dim`; all ρ bounds are untouched.
4258    // `psi_rank_stable_ceiling` is already constrained to lie strictly inside
4259    // `(theta0[rho_dim], psi_hi)`, so theta0 stays feasible.
4260    let upper_effective: std::borrow::Cow<'_, Array1<f64>> = match psi_rank_stable_ceiling {
4261        Some(ceiling) if coord_dim == 1 && ceiling < upper[rho_dim] => {
4262            let mut clamped = upper.clone();
4263            clamped[rho_dim] = ceiling;
4264            std::borrow::Cow::Owned(clamped)
4265        }
4266        _ => std::borrow::Cow::Borrowed(upper),
4267    };
4268    let upper = upper_effective.as_ref();
4269
4270    let problem = exact_joint_multistart_outer_problem(
4271        theta0,
4272        lower,
4273        upper,
4274        rho_dim,
4275        coord_dim,
4276        theta_dim,
4277        Derivative::Analytic,
4278        if analytic_outer_hessian_available && !suppress_outer_hessian_for_nfree {
4279            DeclaredHessianForm::Either
4280        } else {
4281            // `Unavailable` when the n-free Gaussian ψ-lane is armed (#1033): the
4282            // planner then selects BFGS instead of ARC, so the κ loop issues only
4283            // `ValueAndGradient` evals and every in-window trial takes the n-free
4284            // design-realization skip.
4285            DeclaredHessianForm::Unavailable
4286        },
4287        prefer_gradient_only,
4288        // Single-block spatial path: penalty-like rho + spatial psi.
4289        // EFS/HybridEFS remain eligible (the Wood-Fasiolo PSD structure holds
4290        // for single-block families with β-independent joint H_L) UNLESS the
4291        // n-free Gaussian ψ-lane is armed (#1033): HybridEFS forms the trace Gram
4292        // `tr(H⁻¹ B_d H⁻¹ B_e)` from the n-dependent curvature slab `B_d`, so it
4293        // realizes O(n) per step exactly like a Hessian eval. Disabling the
4294        // fixed-point lane there forces the planner to BFGS (`(Analytic,
4295        // Unavailable)` → `S::Bfgs`), keeping every in-window κ-trial on the
4296        // n-free `ValueAndGradient` skip even when `n_params` exceeds the small-
4297        // BFGS threshold (aniso / multi-ψ).
4298        suppress_outer_hessian_for_nfree,
4299        seed_risk_profile_for_likelihood_family(&family),
4300        kappa_options.rel_tol.max(1e-6),
4301        kappa_options.max_outer_iter.max(1),
4302        // Rho-axis BFGS cap: log-λ's natural step is ≈ 5 per
4303        // `first_order_bfgs_loglambda_step_cap`. Anything tighter throttles
4304        // BFGS on flat REML valleys.
4305        Some(5.0),
4306        // Psi-axis BFGS cap: kappa / aniso-log-scale needs ~ln 2 per iter.
4307        Some(kappa_options.log_step.clamp(0.25, 1.0)),
4308        None,
4309        // Calibrate the outer to the n-scaled profiled REML/LAML objective for
4310        // every family — the iso-κ non-convergence cure (#1053 1-D Matérn,
4311        // #1066 2-D binomial geo, #1069 GP/kriging). p = baseline design column
4312        // count.
4313        Some((data.nrows(), baseline_design.design.ncols())),
4314        // #1464: widen the over-smoothing ρ ceiling + seed a high-λ probe when a
4315        // constant-curvature term is present (collapsing +κ kernel needs a large
4316        // smoothing λ beyond the historical ±12 box).
4317        !constant_curvature_term_indices(resolvedspec).is_empty(),
4318    );
4319
4320    let eval_outer = |ctx: &mut &mut SpatialJointContext<'_>,
4321                      theta: &Array1<f64>,
4322                      order: OuterEvalOrder|
4323     -> Result<OuterEval, EstimationError> {
4324        let t0 = std::time::Instant::now();
4325        let allow_second_order_for_call = matches!(order, OuterEvalOrder::ValueGradientHessian)
4326            && analytic_outer_hessian_available;
4327        let gate = ctx.nfree_skip_gate_status(theta, allow_second_order_for_call, true);
4328        let resets_before = ctx.evaluator.slow_path_reset_count();
4329        let raw = ctx.eval_full(theta, order, analytic_outer_hessian_available);
4330        let reset_delta = ctx
4331            .evaluator
4332            .slow_path_reset_count()
4333            .saturating_sub(resets_before);
4334        if reset_delta > 0 {
4335            if !gate.shape {
4336                kphase_nfree_miss_shape.set(kphase_nfree_miss_shape.get() + reset_delta);
4337            }
4338            if gate.shape && !gate.value {
4339                kphase_nfree_miss_value.set(kphase_nfree_miss_value.get() + reset_delta);
4340            }
4341            if gate.shape && gate.value && !gate.gradient {
4342                kphase_nfree_miss_gradient.set(kphase_nfree_miss_gradient.get() + reset_delta);
4343            }
4344            if gate.shape && gate.value && gate.gradient && !gate.penalty {
4345                kphase_nfree_miss_penalty.set(kphase_nfree_miss_penalty.get() + reset_delta);
4346            }
4347            if gate.shape && gate.value && gate.gradient && gate.penalty && !gate.revision {
4348                kphase_nfree_miss_revision.set(kphase_nfree_miss_revision.get() + reset_delta);
4349            }
4350            if gate.shape
4351                && gate.value
4352                && gate.gradient
4353                && gate.penalty
4354                && gate.revision
4355                && gate.second_order
4356            {
4357                kphase_nfree_miss_second_order
4358                    .set(kphase_nfree_miss_second_order.get() + reset_delta);
4359            }
4360            if gate.would_skip(true) {
4361                kphase_nfree_miss_other.set(kphase_nfree_miss_other.get() + reset_delta);
4362            }
4363        }
4364        let elapsed_s = t0.elapsed().as_secs_f64();
4365        kphase_eval_calls.set(kphase_eval_calls.get() + 1);
4366        kphase_eval_total_s.set(kphase_eval_total_s.get() + elapsed_s);
4367        let (theta_norm, log_kappa_norm) = kphase_log_norms(theta, rho_dim);
4368        log::info!(
4369            "[KAPPA-PHASE] phase=eval_outer call={} order={:?} design_revision={:?} theta_norm={:.4e} log_kappa_norm={:.4e} elapsed_s={:.4}",
4370            kphase_eval_calls.get(),
4371            order,
4372            Some(ctx.cache.design_revision()),
4373            theta_norm,
4374            log_kappa_norm,
4375            elapsed_s,
4376        );
4377        match raw {
4378            Ok((cost, grad, hess)) => Ok(OuterEval {
4379                cost,
4380                gradient: grad,
4381                hessian: hess,
4382                inner_beta_hint: None,
4383            }),
4384            // A trial hyperparameter at which the spatial kernel design /
4385            // ψ-derivatives are non-constructible is an infeasible point, not
4386            // a fatal error: the gradient/Hessian path must retreat exactly as
4387            // the cost-only path (which already returns +∞) does. Returning
4388            // `OuterEval::infeasible` keeps the two paths symmetric so a single
4389            // bad probe — e.g. an anisotropy that overflows the Duchon radial
4390            // kernel — no longer aborts the whole REML optimization.
4391            Err(err) if is_recoverable_trial_point_error(&err) => {
4392                log::debug!(
4393                    "[{label}] trial point infeasible (kernel design \
4394                     not constructible at theta={theta:?}): {err}; retreating",
4395                );
4396                Ok(OuterEval::infeasible(theta_dim))
4397            }
4398            Err(err) => Err(err),
4399        }
4400    };
4401
4402    let mut obj = problem.build_objective_with_eval_order(
4403        &mut ctx,
4404        |ctx: &mut &mut SpatialJointContext<'_>, theta: &Array1<f64>| {
4405            let t0 = std::time::Instant::now();
4406            let gate = ctx.nfree_skip_gate_status(theta, false, false);
4407            let resets_before = ctx.evaluator.slow_path_reset_count();
4408            let cost = ctx.eval_cost(theta);
4409            let reset_delta = ctx
4410                .evaluator
4411                .slow_path_reset_count()
4412                .saturating_sub(resets_before);
4413            if reset_delta > 0 {
4414                if !gate.shape {
4415                    kphase_nfree_miss_shape.set(kphase_nfree_miss_shape.get() + reset_delta);
4416                }
4417                if gate.shape && !gate.value {
4418                    kphase_nfree_miss_value.set(kphase_nfree_miss_value.get() + reset_delta);
4419                }
4420                if gate.shape && gate.value && !gate.penalty {
4421                    kphase_nfree_miss_penalty.set(kphase_nfree_miss_penalty.get() + reset_delta);
4422                }
4423                if gate.shape && gate.value && gate.penalty && !gate.revision {
4424                    kphase_nfree_miss_revision.set(kphase_nfree_miss_revision.get() + reset_delta);
4425                }
4426                if gate.would_skip(false) {
4427                    kphase_nfree_miss_other.set(kphase_nfree_miss_other.get() + reset_delta);
4428                }
4429            }
4430            let elapsed_s = t0.elapsed().as_secs_f64();
4431            kphase_cost_calls.set(kphase_cost_calls.get() + 1);
4432            kphase_cost_total_s.set(kphase_cost_total_s.get() + elapsed_s);
4433            let (theta_norm, log_kappa_norm) = kphase_log_norms(theta, rho_dim);
4434            log::info!(
4435                "[KAPPA-PHASE] phase=cost call={} design_revision={:?} theta_norm={:.4e} log_kappa_norm={:.4e} elapsed_s={:.4}",
4436                kphase_cost_calls.get(),
4437                Some(ctx.cache.design_revision()),
4438                theta_norm,
4439                log_kappa_norm,
4440                elapsed_s,
4441            );
4442            Ok(cost)
4443        },
4444        |ctx: &mut &mut SpatialJointContext<'_>, theta: &Array1<f64>| {
4445            eval_outer(
4446                ctx,
4447                theta,
4448                // #1033: when the n-free Gaussian ψ-lane is armed we suppress the
4449                // outer Hessian and route BFGS — so this default gradient eval MUST
4450                // request `ValueAndGradient`, not `ValueGradientHessian`. A
4451                // second-order order sets `allow_second_order`, which forces
4452                // `ensure_theta` → the O(n) design re-realization (the Hessian slab
4453                // is irreducibly n-dependent), DISARMING the design-revision fast
4454                // path for every trial — exactly the O(n) κ-loop this lane exists to
4455                // remove. Gating only the planner's solver (Unavailable→BFGS)
4456                // without gating this eval-order left every trial second-order.
4457                if analytic_outer_hessian_available && !suppress_outer_hessian_for_nfree {
4458                    OuterEvalOrder::ValueGradientHessian
4459                } else {
4460                    OuterEvalOrder::ValueAndGradient
4461                },
4462            )
4463        },
4464        |ctx: &mut &mut SpatialJointContext<'_>, theta: &Array1<f64>, order: OuterEvalOrder| {
4465            eval_outer(ctx, theta, order)
4466        },
4467        Some(|ctx: &mut &mut SpatialJointContext<'_>| {
4468            ctx.reset();
4469        }),
4470        Some(|ctx: &mut &mut SpatialJointContext<'_>, theta: &Array1<f64>| {
4471            let t0 = std::time::Instant::now();
4472            let eval = ctx.eval_efs(theta);
4473            let elapsed_s = t0.elapsed().as_secs_f64();
4474            kphase_efs_calls.set(kphase_efs_calls.get() + 1);
4475            kphase_efs_total_s.set(kphase_efs_total_s.get() + elapsed_s);
4476            let (theta_norm, log_kappa_norm) = kphase_log_norms(theta, rho_dim);
4477            log::info!(
4478                "[KAPPA-PHASE] phase=efs call={} design_revision={:?} theta_norm={:.4e} log_kappa_norm={:.4e} elapsed_s={:.4}",
4479                kphase_efs_calls.get(),
4480                Some(ctx.cache.design_revision()),
4481                theta_norm,
4482                log_kappa_norm,
4483                elapsed_s,
4484            );
4485            eval
4486        }),
4487    );
4488
4489    let run_label = match kind {
4490        SpatialHyperKind::Anisotropic => "aniso-psi joint REML",
4491        SpatialHyperKind::Isotropic => "iso-kappa joint REML",
4492    };
4493    let result = problem.run(&mut obj, run_label).map_err(|e| {
4494        EstimationError::InvalidInput(format!(
4495            "{} analytic optimization failed after exhausting strategy fallbacks: {e}",
4496            kind.adjective(),
4497        ))
4498    })?;
4499    drop(obj);
4500    let kphase_total_s = kphase_optim_start.elapsed().as_secs_f64();
4501    let kphase_slow_resets = ctx
4502        .evaluator
4503        .slow_path_reset_count()
4504        .saturating_sub(kphase_slow_resets_start);
4505    let kphase_design_revision_delta = ctx
4506        .cache
4507        .design_revision()
4508        .saturating_sub(kphase_design_revision_start);
4509    log::info!(
4510        "[KAPPA-PHASE-SUMMARY] n_rows={} log_kappa_dim={} n_cost={} cost_total_s={:.4} n_eval={} eval_total_s={:.4} n_efs={} efs_total_s={:.4} slow_path_resets={} design_revision_delta={} nfree_miss_shape={} nfree_miss_value={} nfree_miss_gradient={} nfree_miss_penalty={} nfree_miss_revision={} nfree_miss_second_order={} nfree_miss_other={} optim_total_s={:.4}",
4511        data.nrows(),
4512        kphase_log_kappa_dim,
4513        kphase_cost_calls.get(),
4514        kphase_cost_total_s.get(),
4515        kphase_eval_calls.get(),
4516        kphase_eval_total_s.get(),
4517        kphase_efs_calls.get(),
4518        kphase_efs_total_s.get(),
4519        kphase_slow_resets,
4520        kphase_design_revision_delta,
4521        kphase_nfree_miss_shape.get(),
4522        kphase_nfree_miss_value.get(),
4523        kphase_nfree_miss_gradient.get(),
4524        kphase_nfree_miss_penalty.get(),
4525        kphase_nfree_miss_revision.get(),
4526        kphase_nfree_miss_second_order.get(),
4527        kphase_nfree_miss_other.get(),
4528        kphase_total_s,
4529    );
4530    let timing = SpatialLengthScaleOptimizationTiming {
4531        log_kappa_dim: kphase_log_kappa_dim,
4532        cost_calls: kphase_cost_calls.get(),
4533        cost_total_s: kphase_cost_total_s.get(),
4534        eval_calls: kphase_eval_calls.get(),
4535        eval_total_s: kphase_eval_total_s.get(),
4536        efs_calls: kphase_efs_calls.get(),
4537        efs_total_s: kphase_efs_total_s.get(),
4538        slow_path_resets: kphase_slow_resets,
4539        design_revision_delta: kphase_design_revision_delta,
4540        nfree_miss_shape: kphase_nfree_miss_shape.get(),
4541        nfree_miss_value: kphase_nfree_miss_value.get(),
4542        nfree_miss_gradient: kphase_nfree_miss_gradient.get(),
4543        nfree_miss_penalty: kphase_nfree_miss_penalty.get(),
4544        nfree_miss_revision: kphase_nfree_miss_revision.get(),
4545        nfree_miss_second_order: kphase_nfree_miss_second_order.get(),
4546        nfree_miss_other: kphase_nfree_miss_other.get(),
4547        optim_total_s: kphase_total_s,
4548    };
4549    if !result.converged {
4550        // Mirror `fit_term_collectionwith_exact_spatial_adaptive_regularization`
4551        // (commit 0267d082): the strict absolute-floor gradient criterion is too
4552        // tight when the outer Hessian carries a near-null direction (η-anchor
4553        // drift, ill-conditioned operator-collocation Gram, etc.) — the iterate
4554        // settles into a flat valley with ‖g‖_proj at numerical-noise scale
4555        // (~1e-5 for cost ~1e1 in double precision) which is above the 1e-6
4556        // absolute floor but well below the textbook mgcv `magic` REML rule
4557        // ‖g‖_proj ≤ τ·(1 + |f|). Accept the iterate under the rel-to-cost
4558        // form when the absolute form has timed out; divergent runs (‖g‖
4559        // large relative to |f|) still surface as errors.
4560        let rel_to_cost_threshold = options.tol * (1.0_f64 + result.final_value.abs());
4561        if let Some(final_grad) = result
4562            .final_grad_norm
4563            .filter(|v| v.is_finite() && *v <= rel_to_cost_threshold)
4564        {
4565            log::info!(
4566                "[{}] outer optimization hit max_iter={} but \
4567                 projected gradient norm {:.3e} ≤ τ·(1+|f|) = {:.3e} \
4568                 (τ={:.3e}, |f|={:.3e}); accepting iterate under the mgcv-style \
4569                 relative-to-cost REML convergence criterion.",
4570                label,
4571                result.iterations,
4572                final_grad,
4573                rel_to_cost_threshold,
4574                options.tol,
4575                result.final_value.abs(),
4576            );
4577        } else if result.final_value.is_finite() {
4578            // The joint κ optimizer is a *refinement* layered on top of an
4579            // always-valid frozen baseline geometry (the REML-seeded length
4580            // scales in `best`); a run that hits the iteration cap without
4581            // certifying a stationary point — and without clearing the
4582            // relative-to-cost gate above — must degrade to that baseline, not
4583            // abort the parent fit. The `gam` CLI fits this exact data (#1126):
4584            // its looser outer tolerance (`tol=1e-6`) lets this same optimizer
4585            // converge in ≤80 iters, whereas the formula/FFI path's tightened
4586            // `tol=1e-10` (the #893 replication-invariance tolerance) leaves it
4587            // mid-descent at the cap. Loosening the tolerance would weaken that
4588            // invariant for every fit; instead we report the non-convergence and
4589            // let the caller keep the baseline. The terminal cost is finite, so
4590            // the iterate is well-defined — this is ordinary slow convergence,
4591            // not a numerical blowup.
4592            log::warn!(
4593                "[{}] {} did not converge after {} iterations \
4594                 (final_objective={:.6e}, final_grad_norm={}); keeping the \
4595                 frozen baseline geometry instead of aborting the fit.",
4596                label,
4597                kind.adjective(),
4598                result.iterations,
4599                result.final_value,
4600                result.final_grad_norm_report(),
4601            );
4602            return Ok((
4603                SpatialJointOutcome::NonConverged {
4604                    iterations: result.iterations,
4605                    final_value: result.final_value,
4606                    final_grad_norm: result.final_grad_norm,
4607                },
4608                timing,
4609            ));
4610        } else {
4611            // A non-finite terminal cost is a genuine numerical blowup (NaN/inf
4612            // propagating through the gradient/Hessian wiring), not the ordinary
4613            // slow convergence handled above — surface it rather than masking a
4614            // real defect behind the baseline fallback.
4615            crate::bail_invalid_estim!(
4616                "{} analytic optimization diverged after {} iterations (final_objective={:.6e}, final_grad_norm={})",
4617                kind.adjective(),
4618                result.iterations,
4619                result.final_value,
4620                result.final_grad_norm_report(),
4621            );
4622        }
4623    }
4624    log::trace!(
4625        "[{}] converged in {} iterations, final_value={:.6e}, grad_norm={}",
4626        label,
4627        result.iterations,
4628        result.final_value,
4629        result.final_grad_norm_report(),
4630    );
4631    // No sum-to-zero enforcement needed: ψ coordinates are unconstrained during
4632    // optimization. For the anisotropic kind the decomposition into (ψ̄, η)
4633    // happens later in apply_tospec.
4634    let theta_star = result.rho;
4635    Ok((
4636        SpatialJointOutcome::Optimized {
4637            theta_star,
4638            final_value: result.final_value,
4639        },
4640        timing,
4641    ))
4642}
4643
4644/// Apply a length scale to a single `SmoothTermSpec` (independent of any
4645/// outer `TermCollectionSpec`). Mirrors `set_spatial_length_scale` but on a
4646/// term in isolation; used by the incremental realizer's cached planned spec.
4647fn set_single_term_spatial_length_scale(
4648    term: &mut SmoothTermSpec,
4649    length_scale: f64,
4650) -> Result<(), EstimationError> {
4651    match &mut term.basis {
4652        SmoothBasisSpec::ThinPlate { spec, .. } => {
4653            spec.length_scale = length_scale;
4654            Ok(())
4655        }
4656        SmoothBasisSpec::Matern { spec, .. } => {
4657            spec.length_scale = length_scale;
4658            Ok(())
4659        }
4660        SmoothBasisSpec::Duchon { spec, .. } => {
4661            spec.length_scale = Some(length_scale);
4662            Ok(())
4663        }
4664        _ => Err(EstimationError::InvalidInput(format!(
4665            "term '{}' does not expose a spatial length scale",
4666            term.name
4667        ))),
4668    }
4669}
4670
4671/// Apply anisotropy contrasts to a single `SmoothTermSpec`. Mirrors
4672/// `set_spatial_aniso_log_scales` but on a term in isolation; used by the
4673/// incremental realizer's cached planned spec.
4674fn set_single_term_spatial_aniso_log_scales(
4675    term: &mut SmoothTermSpec,
4676    eta: Vec<f64>,
4677) -> Result<(), EstimationError> {
4678    let eta = center_aniso_log_scales(&eta);
4679    match &mut term.basis {
4680        SmoothBasisSpec::Matern { spec, .. } => {
4681            spec.aniso_log_scales = Some(eta);
4682            Ok(())
4683        }
4684        SmoothBasisSpec::Duchon { spec, .. } => {
4685            spec.aniso_log_scales = Some(eta);
4686            Ok(())
4687        }
4688        _ => Err(EstimationError::InvalidInput(format!(
4689            "term '{}' does not support aniso_log_scales",
4690            term.name
4691        ))),
4692    }
4693}
4694
4695/// Freeze the design-moving representer length-scale dial on every measure-jet
4696/// term in `spec` (sets `learn_length_scale = false`), so ℓ stays at its
4697/// realized auto value with no outer REML enrollment.
4698///
4699/// Used by COUPLED-block families (bernoulli marginal-slope: a shared mjs
4700/// surface feeds both the marginal mean and the log-slope). In that coupling a
4701/// design-moving kernel-scale dial on the shared covariates is an
4702/// identifiability hazard: the outer search can reach a sharp ℓ at which a
4703/// marginal smooth direction trades off against the log-slope into a
4704/// separation-scale runaway (#1116). A single Gaussian surface has no such
4705/// coupling and keeps ℓ learnable. Returns the number of terms frozen.
4706/// The signed sectional curvature κ of a constant-curvature smooth at
4707/// `term_idx`, or `None` if that term is not a `curv(...)` smooth. After a fit
4708/// with κ-optimization enabled this reads the **fitted κ̂** out of the resolved
4709/// spec (`freeze_term_collection_from_design` writes the optimized κ back into
4710/// the spec, and `BasisMetadata::ConstantCurvature.kappa` carries the same
4711/// value). This is the headline #944 estimand accessor — the κ̂ in
4712/// "κ̂ = −1.8 (95% CI …)". Mirrors [`get_spatial_length_scale`].
4713pub fn get_constant_curvature_kappa(spec: &TermCollectionSpec, term_idx: usize) -> Option<f64> {
4714    constant_curvature_term_spec(spec, term_idx).map(|cc| cc.kappa)
4715}
4716
4717/// Indices of every constant-curvature (`curv(...)`) smooth term in `spec`.
4718pub fn constant_curvature_term_indices(spec: &TermCollectionSpec) -> Vec<usize> {
4719    (0..spec.smooth_terms.len())
4720        .filter(|&idx| constant_curvature_term_spec(spec, idx).is_some())
4721        .collect()
4722}
4723
4724
/// One smooth term realized in isolation: its own design matrix, the realized
/// term description, and the penalty blocks reported as dropped for it.
#[derive(Debug, Clone)]
struct SingleSmoothTermRealization {
    /// Design matrix of this term alone.
    design_local: DesignMatrix,
    /// The realized smooth term.
    term: SmoothTerm,
    /// Penalty blocks reported as dropped while realizing this term.
    dropped_penaltyinfo: Vec<DroppedPenaltyBlockInfo>,
}
4731
4732impl SingleSmoothTermRealization {
4733    fn active_penaltyinfo(&self) -> Vec<PenaltyInfo> {
4734        self.term
4735            .penaltyinfo_local
4736            .iter()
4737            .filter(|info| info.active)
4738            .cloned()
4739            .collect()
4740    }
4741}
4742
4743fn build_single_smooth_term_realization(
4744    data: ArrayView2<'_, f64>,
4745    termspec: &SmoothTermSpec,
4746) -> Result<SingleSmoothTermRealization, BasisError> {
4747    let raw = build_smooth_design(data, std::slice::from_ref(termspec))?;
4748    finish_single_smooth_term_realization(raw)
4749}
4750
4751fn finish_single_smooth_term_realization(
4752    raw: RawSmoothDesign,
4753) -> Result<SingleSmoothTermRealization, BasisError> {
4754    let RawSmoothDesign {
4755        term_designs,
4756        dropped_penaltyinfo,
4757        terms,
4758        ..
4759    } = raw;
4760    let term = terms.into_iter().next().ok_or_else(|| {
4761        BasisError::InvalidInput("single-term smooth build returned no term".to_string())
4762    })?;
4763    let design = term_designs.into_iter().next().ok_or_else(|| {
4764        BasisError::InvalidInput("single-term smooth build returned no term design".to_string())
4765    })?;
4766
4767    Ok(SingleSmoothTermRealization {
4768        design_local: design,
4769        term,
4770        dropped_penaltyinfo,
4771    })
4772}
4773
4774/// Wrap a fresh `LocalSmoothTermBuild` (produced by `build_single_local_smooth_term`)
4775/// into a `SingleSmoothTermRealization`. Mirrors the single-term portion of
4776/// `build_smooth_design_withworkspace_unvalidated`, but skips the joint center
4777/// planner and per-term workspace fork — the realizer drives κ-only rebuilds
4778/// directly with its persistent workspace so basis caches survive across BFGS
4779/// κ proposals.
4780fn wrap_local_build_as_realization(
4781    mut local: LocalSmoothTermBuild,
4782    termspec: &SmoothTermSpec,
4783) -> Result<SingleSmoothTermRealization, String> {
4784    let p_local = local.dim;
4785    let lb_local = if local.box_reparam {
4786        shape_lower_bounds_local(termspec.shape, p_local)
4787    } else {
4788        None
4789    };
4790
4791    let active_count = local.penaltyinfo.iter().filter(|info| info.active).count();
4792    if active_count != local.penalties.len() {
4793        return Err(format!(
4794            "internal penalty info mismatch for term '{}': active_infos={}, penalties={}",
4795            termspec.name,
4796            active_count,
4797            local.penalties.len()
4798        ));
4799    }
4800
4801    let mut dropped_penaltyinfo = Vec::<DroppedPenaltyBlockInfo>::new();
4802    for info in local.penaltyinfo.iter().filter(|info| !info.active) {
4803        dropped_penaltyinfo.push(DroppedPenaltyBlockInfo {
4804            termname: Some(termspec.name.clone()),
4805            penalty: info.clone(),
4806        });
4807    }
4808    for info in &local.pre_dropped_penaltyinfo {
4809        dropped_penaltyinfo.push(DroppedPenaltyBlockInfo {
4810            termname: Some(termspec.name.clone()),
4811            penalty: info.clone(),
4812        });
4813    }
4814
4815    // Stage-2 joint-null absorption rotation, same logic as the main
4816    // aggregation loop in `build_smooth_design_withworkspace_unvalidated`:
4817    // apply Q when Some AND the smooth has no shape constraints.
4818    let applied_rotation: Option<gam_terms::basis::JointNullRotation> = match (
4819        local.joint_null_rotation.take(),
4820        lb_local.is_some(),
4821        local.linear_constraints.is_some(),
4822    ) {
4823        (Some(rot), false, false) => {
4824            let q = &rot.rotation;
4825            let dense = local
4826                .design
4827                .try_to_dense_by_chunks("joint-null absorption rotation (single realization)")
4828                .map_err(|e| {
4829                    format!(
4830                        "joint-null absorption rotation: dense conversion failed for term '{}': {}",
4831                        termspec.name, e
4832                    )
4833                })?;
4834            let rotated = gam_linalg::faer_ndarray::fast_ab(&dense, q);
4835            local.design = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(rotated));
4836            local.penalties = local
4837                .penalties
4838                .into_iter()
4839                .map(|s_local| {
4840                    let qt_s = gam_linalg::faer_ndarray::fast_atb(q, &s_local);
4841                    gam_linalg::faer_ndarray::fast_ab(&qt_s, q)
4842                })
4843                .collect();
4844            local.ops = vec![None; local.penalties.len()];
4845            local.kronecker_factored = None;
4846            Some(rot)
4847        }
4848        (Some(_), _, _) => None,
4849        (None, _, _) => None,
4850    };
4851
4852    let smooth_term = SmoothTerm {
4853        name: termspec.name.clone(),
4854        coeff_range: 0..p_local,
4855        shape: termspec.shape,
4856        penalties_local: local.penalties.clone(),
4857        nullspace_dims: local.nullspaces.clone(),
4858        penaltyinfo_local: local.penaltyinfo.clone(),
4859        metadata: local.metadata.clone(),
4860        lower_bounds_local: lb_local,
4861        linear_constraints_local: local.linear_constraints.clone(),
4862        kronecker_factored: local.kronecker_factored.take(),
4863        joint_null_rotation: applied_rotation,
4864        // Single-term realizations never run the global ownership pass, so
4865        // there is no overlap residualization to export here (#978).
4866        unabsorbed_global_orthogonality: None,
4867    };
4868
4869    Ok(SingleSmoothTermRealization {
4870        design_local: local.design,
4871        term: smooth_term,
4872        dropped_penaltyinfo,
4873    })
4874}
4875
4876/// Extract the κ-invariant pieces of a freshly-built spatial basis — center
4877/// cloud (in standardized coords) and `input_scales` — and bake them into a
4878/// `SmoothTermSpec` whose `center_strategy` becomes `UserProvided` and whose
4879/// `input_scales` is `Some`. Subsequent rebuilds driven from this cached spec
4880/// will short-circuit `select_centers_by_strategy` (KMeans / FarthestPoint /
4881/// EqualMass cluster searches over n×d data) and `compute_spatial_input_scales`
4882/// (per-axis variance over n rows), leaving only the κ-dependent kernel
4883/// values and basis assembly. Returns `None` for non-spatial families or when
4884/// the metadata does not yet expose the required pieces (for instance when a
4885/// ThinPlate request was auto-promoted to Duchon during the build).
4886fn freeze_geometry_from_metadata(
4887    termspec: &SmoothTermSpec,
4888    metadata: &BasisMetadata,
4889) -> Option<SmoothTermSpec> {
4890    let mut frozen = termspec.clone();
4891    match (&mut frozen.basis, metadata) {
4892        (
4893            SmoothBasisSpec::Matern {
4894                spec,
4895                input_scales: spec_scales,
4896                ..
4897            },
4898            BasisMetadata::Matern {
4899                centers,
4900                input_scales: meta_scales,
4901                identifiability_transform,
4902                nullspace_shrinkage_survived,
4903                ..
4904            },
4905        ) => {
4906            spec.center_strategy = CenterStrategy::UserProvided(centers.clone());
4907            if spec_scales.is_none()
4908                && let Some(s) = meta_scales.clone()
4909            {
4910                *spec_scales = Some(s);
4911            }
4912            // Pin BOTH the cold-build identifiability transform `Z` AND the
4913            // double-penalty nullspace-shrinkage decision into a
4914            // `FrozenTransform` (gam#787/#860, #1122). Without this, the
4915            // κ-optimizer's per-trial value rebuild re-runs the κ-DEPENDENT
4916            // spectral test (`build_nullspace_shrinkage_penalty`), whose
4917            // tolerance scales with `λ_max(A(κ))`: as κ moves, near-null
4918            // eigenvalues of the projected kernel Gram `A` cross the threshold,
4919            // so the `DoublePenaltyNullspace` block `P/√r` (and its null
4920            // dimension `r`) JUMP discontinuously between line-search trials.
4921            // The analytic ψ-gradient — assembled in a fixed frozen eigenbasis
4922            // — cannot follow those discrete jumps, so the joint REML objective
4923            // V(κ) is piecewise-discontinuous while the gradient is smooth: an
4924            // objective↔gradient desync that stalls the isotropic-κ optimizer
4925            // with a large residual gradient at the iteration cap. Freezing the
4926            // decision (and the transform that `A` is built from) makes the
4927            // per-trial value rebuild and the analytic gradient share one fixed
4928            // `Z` and one fixed `r`, restoring a smooth, differentiable V(κ).
4929            if let Some(transform) = identifiability_transform.clone() {
4930                spec.identifiability = MaternIdentifiability::FrozenTransform {
4931                    transform,
4932                    nullspace_shrinkage_survived: Some(*nullspace_shrinkage_survived),
4933                };
4934            }
4935            Some(frozen)
4936        }
4937        (
4938            SmoothBasisSpec::Duchon {
4939                spec,
4940                input_scales: spec_scales,
4941                ..
4942            },
4943            BasisMetadata::Duchon {
4944                centers,
4945                input_scales: meta_scales,
4946                ..
4947            },
4948        ) => {
4949            spec.center_strategy = CenterStrategy::UserProvided(centers.clone());
4950            if spec_scales.is_none()
4951                && let Some(s) = meta_scales.clone()
4952            {
4953                *spec_scales = Some(s);
4954            }
4955            Some(frozen)
4956        }
4957        (
4958            SmoothBasisSpec::ThinPlate {
4959                spec,
4960                input_scales: spec_scales,
4961                ..
4962            },
4963            BasisMetadata::ThinPlate {
4964                centers,
4965                input_scales: meta_scales,
4966                ..
4967            },
4968        ) => {
4969            spec.center_strategy = CenterStrategy::UserProvided(centers.clone());
4970            if spec_scales.is_none()
4971                && let Some(s) = meta_scales.clone()
4972            {
4973                *spec_scales = Some(s);
4974            }
4975            Some(frozen)
4976        }
4977        // Family mismatch (e.g. ThinPlate auto-promotion to Duchon) leaves the
4978        // cache empty; we'll retry materialization on the next κ apply.
4979        _ => None,
4980    }
4981}
4982
4983fn rebuild_smooth_auxiliary_state(
4984    smooth: &mut SmoothDesign,
4985    dropped_penaltyinfo_by_term: &[Vec<DroppedPenaltyBlockInfo>],
4986) -> Result<(), String> {
4987    if dropped_penaltyinfo_by_term.len() != smooth.terms.len() {
4988        return Err(SmoothError::dimension_mismatch(format!(
4989            "smooth dropped-penalty cache mismatch: terms={}, dropped_sets={}",
4990            smooth.terms.len(),
4991            dropped_penaltyinfo_by_term.len()
4992        ))
4993        .into());
4994    }
4995
4996    let total_p = smooth.total_smooth_cols();
4997    let mut coefficient_lower_bounds = Array1::<f64>::from_elem(total_p, f64::NEG_INFINITY);
4998    let mut any_bounds = false;
4999    let mut linear_constraintrows: Vec<Array1<f64>> = Vec::new();
5000    let mut linear_constraint_b: Vec<f64> = Vec::new();
5001
5002    for term in &smooth.terms {
5003        let range = term.coeff_range.clone();
5004        if let Some(lb_local) = term.lower_bounds_local.as_ref() {
5005            if lb_local.len() != range.len() {
5006                return Err(SmoothError::dimension_mismatch(format!(
5007                    "smooth lower-bound cache mismatch for term '{}': bounds={}, coeffs={}",
5008                    term.name,
5009                    lb_local.len(),
5010                    range.len()
5011                ))
5012                .into());
5013            }
5014            coefficient_lower_bounds
5015                .slice_mut(s![range.clone()])
5016                .assign(lb_local);
5017            any_bounds = true;
5018        }
5019        if let Some(lin_local) = term.linear_constraints_local.as_ref() {
5020            if lin_local.a.ncols() != range.len() {
5021                return Err(SmoothError::dimension_mismatch(format!(
5022                    "smooth linear-constraint cache mismatch for term '{}': cols={}, coeffs={}",
5023                    term.name,
5024                    lin_local.a.ncols(),
5025                    range.len()
5026                ))
5027                .into());
5028            }
5029            for r in 0..lin_local.a.nrows() {
5030                let mut row = Array1::<f64>::zeros(total_p);
5031                row.slice_mut(s![range.clone()]).assign(&lin_local.a.row(r));
5032                linear_constraintrows.push(row);
5033                linear_constraint_b.push(lin_local.b[r]);
5034            }
5035        }
5036    }
5037
5038    smooth.coefficient_lower_bounds = if any_bounds {
5039        Some(coefficient_lower_bounds)
5040    } else {
5041        None
5042    };
5043    smooth.linear_constraints = if linear_constraintrows.is_empty() {
5044        None
5045    } else {
5046        let mut a = Array2::<f64>::zeros((linear_constraintrows.len(), total_p));
5047        for (i, row) in linear_constraintrows.iter().enumerate() {
5048            a.row_mut(i).assign(row);
5049        }
5050        Some(LinearInequalityConstraints {
5051            a,
5052            b: Array1::from_vec(linear_constraint_b),
5053        })
5054    };
5055    smooth.dropped_penaltyinfo = dropped_penaltyinfo_by_term
5056        .iter()
5057        .flat_map(|infos| infos.iter().cloned())
5058        .collect();
5059    Ok(())
5060}
5061
5062fn rebuild_term_collection_auxiliary_state(
5063    spec: &TermCollectionSpec,
5064    design: &mut TermCollectionDesign,
5065) -> Result<(), String> {
5066    if spec.linear_terms.len() != design.linear_ranges.len() {
5067        return Err(SmoothError::dimension_mismatch(format!(
5068            "term-collection linear bookkeeping mismatch: spec_terms={}, design_ranges={}",
5069            spec.linear_terms.len(),
5070            design.linear_ranges.len()
5071        ))
5072        .into());
5073    }
5074
5075    let p_total = design.design.ncols();
5076    let smooth_start = p_total.saturating_sub(design.smooth.total_smooth_cols());
5077    let mut coefficient_lower_bounds = Array1::<f64>::from_elem(p_total, f64::NEG_INFINITY);
5078    let mut any_bounds = false;
5079    let mut linear_constraintrows: Vec<Array1<f64>> = Vec::new();
5080    let mut linear_constraint_b: Vec<f64> = Vec::new();
5081
5082    for (linear, (_, range)) in spec.linear_terms.iter().zip(design.linear_ranges.iter()) {
5083        if range.len() != 1 {
5084            return Err(SmoothError::dimension_mismatch(format!(
5085                "linear term '{}' expected one coefficient column, found {}",
5086                linear.name,
5087                range.len()
5088            ))
5089            .into());
5090        }
5091        let col = range.start;
5092        if let Some(lb) = linear.coefficient_min {
5093            let mut row = Array1::<f64>::zeros(p_total);
5094            row[col] = 1.0;
5095            linear_constraintrows.push(row);
5096            linear_constraint_b.push(lb);
5097        }
5098        if let Some(ub) = linear.coefficient_max {
5099            let mut row = Array1::<f64>::zeros(p_total);
5100            row[col] = -1.0;
5101            linear_constraintrows.push(row);
5102            linear_constraint_b.push(-ub);
5103        }
5104    }
5105
5106    if let Some(lb_smooth) = design.smooth.coefficient_lower_bounds.as_ref() {
5107        if lb_smooth.len() != design.smooth.total_smooth_cols() {
5108            return Err(SmoothError::dimension_mismatch(format!(
5109                "smooth lower-bound width mismatch: bounds={}, smooth_cols={}",
5110                lb_smooth.len(),
5111                design.smooth.total_smooth_cols()
5112            ))
5113            .into());
5114        }
5115        coefficient_lower_bounds
5116            .slice_mut(s![
5117                smooth_start..(smooth_start + design.smooth.total_smooth_cols())
5118            ])
5119            .assign(lb_smooth);
5120        any_bounds = true;
5121    }
5122    if let Some(lin_smooth) = design.smooth.linear_constraints.as_ref() {
5123        if lin_smooth.a.ncols() != design.smooth.total_smooth_cols() {
5124            return Err(SmoothError::dimension_mismatch(format!(
5125                "smooth linear-constraint width mismatch: cols={}, smooth_cols={}",
5126                lin_smooth.a.ncols(),
5127                design.smooth.total_smooth_cols()
5128            ))
5129            .into());
5130        }
5131        let mut a_global = Array2::<f64>::zeros((lin_smooth.a.nrows(), p_total));
5132        a_global
5133            .slice_mut(s![
5134                ..,
5135                smooth_start..(smooth_start + design.smooth.total_smooth_cols())
5136            ])
5137            .assign(&lin_smooth.a);
5138        for r in 0..a_global.nrows() {
5139            linear_constraintrows.push(a_global.row(r).to_owned());
5140            linear_constraint_b.push(lin_smooth.b[r]);
5141        }
5142    }
5143
5144    let lower_bound_constraints = if any_bounds {
5145        linear_constraints_from_lower_bounds_global(&coefficient_lower_bounds)
5146    } else {
5147        None
5148    };
5149    let explicit_linear_constraints = if linear_constraintrows.is_empty() {
5150        None
5151    } else {
5152        let mut a = Array2::<f64>::zeros((linear_constraintrows.len(), p_total));
5153        for (i, row) in linear_constraintrows.iter().enumerate() {
5154            a.row_mut(i).assign(row);
5155        }
5156        Some(LinearInequalityConstraints {
5157            a,
5158            b: Array1::from_vec(linear_constraint_b),
5159        })
5160    };
5161
5162    design.coefficient_lower_bounds = if any_bounds {
5163        Some(coefficient_lower_bounds)
5164    } else {
5165        None
5166    };
5167    design.linear_constraints =
5168        merge_linear_constraints_global(explicit_linear_constraints, lower_bound_constraints);
5169    design.dropped_penaltyinfo = design.smooth.dropped_penaltyinfo.clone();
5170    Ok(())
5171}
5172
5173fn theta_values_match(left: &Array1<f64>, right: &Array1<f64>) -> bool {
5174    left.len() == right.len()
5175        && left
5176            .iter()
5177            .zip(right.iter())
5178            .all(|(&l, &r)| l.to_bits() == r.to_bits())
5179}
5180
/// Compare two latent-value vectors; delegates to `theta_values_match`, so it
/// applies exactly the same comparison.
fn latent_values_match(left: &Array1<f64>, right: &Array1<f64>) -> bool {
    theta_values_match(left, right)
}
5184
/// Bitwise comparison of two optional anisotropy vectors.
///
/// Two `None`s match; a `None` never matches a `Some`; two `Some`s match when
/// they have the same length and every pair of elements has identical `f64`
/// bit patterns.
fn spatial_aniso_matches(left: Option<&[f64]>, right: Option<&[f64]>) -> bool {
    let (a, b) = match (left, right) {
        (None, None) => return true,
        (Some(a), Some(b)) => (a, b),
        _ => return false,
    };
    if a.len() != b.len() {
        return false;
    }
    a.iter()
        .map(|x| x.to_bits())
        .eq(b.iter().map(|y| y.to_bits()))
}
5197
/// Bitwise comparison of two optional length scales: both absent, or both
/// present with identical `f64` bit patterns.
fn spatial_length_scale_matches(left: Option<f64>, right: Option<f64>) -> bool {
    // Comparing the `Option<u64>` bit images handles all four
    // presence combinations at once.
    left.map(f64::to_bits) == right.map(f64::to_bits)
}
5205
/// State carried across incremental rebuilds of a term collection: the
/// borrowed data, the owned spec and realized design, plus caches computed
/// once at construction.
struct FrozenTermCollectionIncrementalRealizer<'d> {
    /// Borrowed view of the data matrix handed to `new`.
    data: ArrayView2<'d, f64>,
    /// Term-collection spec this realizer was constructed with.
    spec: TermCollectionSpec,
    /// Realized design this realizer was constructed with.
    design: TermCollectionDesign,
    /// Result of `build_term_collection_fixed_blocks(data, &spec)`, computed
    /// once at construction.
    fixed_blocks: Vec<DesignBlock>,
    /// One entry per smooth term: the dropped penalty blocks reported by that
    /// term's single-term realization at construction.
    dropped_penaltyinfo_by_term: Vec<Vec<DroppedPenaltyBlockInfo>>,
    /// One entry per smooth term: the contiguous range of that term's
    /// penalties within `design.smooth.penalties`.
    smooth_penalty_ranges: Vec<Range<usize>>,
    /// `smooth_penalty_ranges` shifted by
    /// `design.penalties.len() - design.smooth.penalties.len()`, i.e. indexed
    /// into `design.penalties`.
    full_penalty_ranges: Vec<Range<usize>>,
    /// Persistent workspace for basis cache reuse across κ proposals.
    /// Distance matrices are cached here so they're computed once and
    /// reused across repeated `apply_log_kappa_to_term` calls.
    basisworkspace: gam_terms::basis::BasisWorkspace,
    /// Per-term cached realization geometry for incremental κ updates.
    ///
    /// On the first κ-driven rebuild of term `i`, this slot is populated with a
    /// `SmoothTermSpec` whose κ-invariant geometry — center cloud (as
    /// `CenterStrategy::UserProvided`) and `input_scales` — has been frozen
    /// out of the realized basis metadata. Subsequent
    /// `apply_log_kappa_to_term` calls reuse this spec, mutating only the
    /// κ / aniso fields. This short-circuits `select_centers_by_strategy`
    /// (KMeans / FarthestPoint / EqualMass cluster searches over the n×d data
    /// matrix) and `compute_spatial_input_scales` (per-axis variance pass
    /// over n rows) on every BFGS κ-eval, leaving the kernel-value pass and
    /// basis assembly as the only work.
    spatial_realization_geometry: Vec<Option<SmoothTermSpec>>,
    /// Monotonic counter incremented every time `apply_log_kappa` actually
    /// rebuilds the realized design / smooth penalties. Read by the
    /// design-revision-counter fast path in `ExternalJointHyperEvaluator`
    /// to skip redundant canonical-penalty rebuilds and cache wipes when
    /// the outer BFGS loop probes the same ψ twice in a row.
    design_revision: u64,
}
5238
5239impl<'d> std::fmt::Debug for FrozenTermCollectionIncrementalRealizer<'d> {
5240    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
5241        f.debug_struct("FrozenTermCollectionIncrementalRealizer")
5242            .field("data_shape", &(self.data.nrows(), self.data.ncols()))
5243            .field("fixed_blocks", &self.fixed_blocks.len())
5244            .finish_non_exhaustive()
5245    }
5246}
5247
5248impl<'d> FrozenTermCollectionIncrementalRealizer<'d> {
5249    fn new(
5250        data: ArrayView2<'d, f64>,
5251        spec: TermCollectionSpec,
5252        design: TermCollectionDesign,
5253    ) -> Result<Self, String> {
5254        if spec.smooth_terms.len() != design.smooth.terms.len() {
5255            return Err(SmoothError::dimension_mismatch(format!(
5256                "incremental realizer smooth term mismatch: spec_terms={}, design_terms={}",
5257                spec.smooth_terms.len(),
5258                design.smooth.terms.len()
5259            ))
5260            .into());
5261        }
5262
5263        let mut smooth_cursor = 0usize;
5264        let mut smooth_penalty_ranges = Vec::with_capacity(design.smooth.terms.len());
5265        for term in &design.smooth.terms {
5266            let next = smooth_cursor + term.penalties_local.len();
5267            smooth_penalty_ranges.push(smooth_cursor..next);
5268            smooth_cursor = next;
5269        }
5270        if smooth_cursor != design.smooth.penalties.len() {
5271            return Err(SmoothError::dimension_mismatch(format!(
5272                "incremental realizer smooth penalty mismatch: ranged={}, actual={}",
5273                smooth_cursor,
5274                design.smooth.penalties.len()
5275            ))
5276            .into());
5277        }
5278
5279        let fixed_penalty_offset = design
5280            .penalties
5281            .len()
5282            .checked_sub(design.smooth.penalties.len())
5283            .ok_or_else(|| {
5284                "incremental realizer encountered invalid penalty bookkeeping".to_string()
5285            })?;
5286        let full_penalty_ranges = smooth_penalty_ranges
5287            .iter()
5288            .map(|range| (fixed_penalty_offset + range.start)..(fixed_penalty_offset + range.end))
5289            .collect::<Vec<_>>();
5290        let fixed_blocks = build_term_collection_fixed_blocks(data, &spec)
5291            .map_err(|e| format!("failed to cache fixed term-collection blocks: {e}"))?;
5292
5293        let mut dropped_penaltyinfo_by_term = Vec::with_capacity(spec.smooth_terms.len());
5294        for (term_idx, termspec) in spec.smooth_terms.iter().enumerate() {
5295            let realization =
5296                build_single_smooth_term_realization(data, termspec).map_err(|e| {
5297                    format!(
5298                        "failed to build cached realization for smooth term '{}' (index {}): {e}",
5299                        termspec.name, term_idx
5300                    )
5301                })?;
5302            let expected_cols = design.smooth.terms[term_idx].coeff_range.len();
5303            if realization.design_local.ncols() != expected_cols {
5304                return Err(SmoothError::dimension_mismatch(format!(
5305                    "cached realization width mismatch for term '{}': cached_cols={}, design_cols={}",
5306                    termspec.name,
5307                    realization.design_local.ncols(),
5308                    expected_cols
5309                ))
5310                .into());
5311            }
5312            if realization.active_penaltyinfo().len()
5313                != design.smooth.terms[term_idx].penalties_local.len()
5314            {
5315                return Err(SmoothError::dimension_mismatch(format!(
5316                    "cached realization penalty mismatch for term '{}': cached_penalties={}, design_penalties={}",
5317                    termspec.name,
5318                    realization.active_penaltyinfo().len(),
5319                    design.smooth.terms[term_idx].penalties_local.len()
5320                ))
5321                .into());
5322            }
5323            dropped_penaltyinfo_by_term.push(realization.dropped_penaltyinfo);
5324        }
5325
5326        let geometry_slots = spec.smooth_terms.len();
5327        Ok(Self {
5328            data,
5329            spec,
5330            design,
5331            fixed_blocks,
5332            dropped_penaltyinfo_by_term,
5333            smooth_penalty_ranges,
5334            full_penalty_ranges,
5335            basisworkspace: gam_terms::basis::BasisWorkspace::new(),
5336            spatial_realization_geometry: vec![None; geometry_slots],
5337            design_revision: 0,
5338        })
5339    }
5340
    /// Current value of the `design_revision` counter.
    fn design_revision(&self) -> u64 {
        self.design_revision
    }
5344
    /// Borrow the term-collection spec held by this realizer.
    fn spec(&self) -> &TermCollectionSpec {
        &self.spec
    }
5348
    /// Borrow the realized design held by this realizer.
    fn design(&self) -> &TermCollectionDesign {
        &self.design
    }
5352
5353    /// True when this realizer carries exactly ONE spatial smooth term whose
5354    /// frozen basis geometry (`BasisMetadata::Duchon`/`ThinPlate`)
5355    /// admits an EXACT, n-free penalty rebuild at a new length-scale (#1033).
5356    /// The κ-loop fast path gates its design-realization skip on this: the skip
5357    /// leaves `reset_surface` un-run, so it is only sound when `S(ψ_new)` can be
5358    /// re-keyed n-free from the frozen geometry (centers + identifiability
5359    /// transform + operator collocation points), never from the data rows, AND
5360    /// the re-keyed penalty's block topology is IDENTICAL to the one the frozen
5361    /// design carries.
5362    ///
5363    /// Matérn stays on the exact slow re-key path here. Its operator-triplet
5364    /// n-free rebuild exists, but the current quality gate shows that enabling
5365    /// the fast-path κ loop changes the selected fit enough to miss the mgcv
5366    /// truth-recovery bar. Duchon/ThinPlate are the #1033 acceptance lane.
5367    fn supports_nfree_penalty_rekey(&self, spatial_terms: &[usize]) -> bool {
5368        if spatial_terms.len() != 1 {
5369            return false;
5370        }
5371        let term_idx = spatial_terms[0];
5372        matches!(
5373            self.design.smooth.terms.get(term_idx).map(|t| &t.metadata),
5374            Some(BasisMetadata::Duchon { .. } | BasisMetadata::ThinPlate { .. })
5375        )
5376    }
5377
5378    /// True when the armed n-free Gaussian lane should suppress exact outer
5379    /// Hessians and route κ search through gradient-only BFGS.
5380    ///
5381    /// This is deliberately narrower than [`Self::supports_nfree_penalty_rekey`]:
5382    /// Matérn has an exact n-free operator-triplet `S(ψ)` re-key (#1274), but its
5383    /// quality gate still depends on the exact second-order outer route. Duchon
5384    /// and ThinPlate are the #1033 n-independent acceptance lane where the exact
5385    /// Hessian slab is the remaining O(n) per-trial cost.
5386    fn supports_nfree_gradient_only_routing(&self, spatial_terms: &[usize]) -> bool {
5387        if spatial_terms.len() != 1 {
5388            return false;
5389        }
5390        let term_idx = spatial_terms[0];
5391        matches!(
5392            self.design.smooth.terms.get(term_idx).map(|t| &t.metadata),
5393            Some(BasisMetadata::Duchon { .. } | BasisMetadata::ThinPlate { .. })
5394        )
5395    }
5396
5397    /// Rebuild the EXACT canonical penalty surface `S(ψ)` at the length-scale
5398    /// implied by `psi`, entirely n-free (#1033). Reuses the FROZEN basis
5399    /// geometry from the single spatial term's `BasisMetadata` (centers,
5400    /// identifiability transform, operator collocation points — all `k × d`, no
5401    /// data rows) and the spec's `(power, nullspace_order, operator_penalties,
5402    /// nu, …)`; only the length-scale moves. The reconstructed term-local
5403    /// penalty matrices replace the `local` of the FROZEN
5404    /// `design.penalties` templates (whose `col_range` / `prior_mean` /
5405    /// `structure_hint` / `op` are ψ-invariant), so the resulting
5406    /// `PenaltySpec`s are bit-identical in topology to the slow path's; running
5407    /// them through the SAME `canonicalize_penalty_specs` pipeline yields the
5408    /// canonical list the kept reference surface must be re-keyed with.
5409    fn canonical_penalties_at_psi(
5410        &mut self,
5411        spatial_terms: &[usize],
5412        psi: &[f64],
5413    ) -> Result<(Vec<gam_terms::construction::CanonicalPenalty>, Vec<usize>), String> {
5414        if spatial_terms.len() != 1 {
5415            return Err(format!(
5416                "n-free penalty re-key requires exactly one spatial term, found {}",
5417                spatial_terms.len()
5418            ));
5419        }
5420        let term_idx = spatial_terms[0];
5421        // Decode ψ with the same chart used by the slow rebuild path. For
5422        // Matérn, per-axis ψ entries are REML hyper-coordinates, so the n-free
5423        // penalty rebuild must consume the trial η contrasts as well as the
5424        // scalar length scale. Duchon keeps η as fixed geometry and continues
5425        // to use frozen metadata below.
5426        let (ls_opt, aniso_from_psi) = spatial_term_psi_to_length_scale_and_aniso(psi);
5427        // Pull the spec-level penalty configuration (which operator orders are
5428        // active / double_penalty) — ψ-invariant, frozen at construction.
5429        let termspec =
5430            self.spec.smooth_terms.get(term_idx).ok_or_else(|| {
5431                format!("spatial term {term_idx} out of range for n-free penalty")
5432            })?;
5433        let term = self
5434            .design
5435            .smooth
5436            .terms
5437            .get(term_idx)
5438            .ok_or_else(|| format!("realized smooth term {term_idx} out of range"))?;
5439        // The per-term penalties live contiguously in the collection penalty
5440        // list at the term's `coeff_range` (single-spatial-term collection).
5441        let p_total = self.design.design.ncols();
5442        let (locals, nullspace_dims): (Vec<Array2<f64>>, Vec<usize>) = match &term.metadata {
5443            BasisMetadata::Duchon {
5444                centers,
5445                identifiability_transform,
5446                operator_collocation_points,
5447                power,
5448                nullspace_order,
5449                aniso_log_scales,
5450                input_scales,
5451                radial_reparam,
5452                ..
5453            } => {
5454                let operator_penalties = match &termspec.basis {
5455                    SmoothBasisSpec::Duchon { spec, .. } => spec.operator_penalties.clone(),
5456                    _ => gam_terms::basis::DuchonOperatorPenaltySpec::default(),
5457                };
5458                // Slow-path Duchon realization stores centers/collocation points
5459                // in standardized coordinates and compensates the user-facing
5460                // length_scale by σ_geom before building penalties. The n-free
5461                // re-key must use the same effective length scale, or the fast
5462                // path pairs G(ψ_new) with an S(ψ_new) from a different
5463                // coordinate scale.
5464                let effective_ls = match input_scales.as_deref() {
5465                    Some(scales) => {
5466                        compensate_optional_length_scale_for_standardization(ls_opt, scales)
5467                    }
5468                    None => ls_opt,
5469                };
5470                gam_terms::basis::duchon_penalties_at_length_scale(
5471                    centers.view(),
5472                    identifiability_transform.as_ref(),
5473                    operator_collocation_points.as_ref().map(|p| p.view()),
5474                    &operator_penalties,
5475                    *power,
5476                    *nullspace_order,
5477                    aniso_log_scales.as_deref(),
5478                    radial_reparam.as_ref(),
5479                    effective_ls,
5480                    &mut self.basisworkspace,
5481                )
5482                .map_err(|e| e.to_string())?
5483            }
5484            BasisMetadata::Matern {
5485                centers,
5486                periodic,
5487                nu,
5488                include_intercept,
5489                identifiability_transform,
5490                aniso_log_scales,
5491                input_scales,
5492                ..
5493            } => {
5494                // `spatial_term_psi_to_length_scale_and_aniso` decodes ψ to a
5495                // length scale in ORIGINAL data coordinates — exactly what the
5496                // slow-path rebuild writes into `spec.length_scale` before
5497                // `matern_operator_penalty_triplet_from_metadata` compensates it
5498                // by σ_geom. Compensate identically here so the n-free re-key
5499                // reproduces the slow-path penalty surface byte-for-byte (#706).
5500                let ls = ls_opt.ok_or_else(|| {
5501                    "Matérn n-free penalty re-key requires a finite length-scale".to_string()
5502                })?;
5503                let effective_ls = match input_scales.as_deref() {
5504                    Some(scales) => compensate_length_scale_for_standardization(ls, scales),
5505                    None => ls,
5506                };
5507                let aniso_for_penalty = aniso_from_psi.as_deref().or(aniso_log_scales.as_deref());
5508                // Route through the SAME canonical operator-triplet builder the
5509                // realized design uses (`matern_operator_penalty_triplet_from_
5510                // metadata`). The Matérn design ALWAYS uses this {mass, tension,
5511                // stiffness} triplet (see the Matérn penalty selection in
5512                // term_specs.rs; #1074 confirmed by MSI measurement that the RKHS
5513                // kernel penalty does not improve recovery and regresses the
5514                // high-frequency guard), so re-keying via the kernel path would
5515                // produce a 1-block surface against a 3-block frozen design — the
5516                // topology desync #1270 hard-errored on. Sharing the builder
5517                // makes the block count ψ-stable by construction.
5518                let (penalties, nullspace_dims, _info) =
5519                    matern_operator_penalty_triplet_at_length_scale(
5520                        centers.view(),
5521                        periodic.as_deref(),
5522                        identifiability_transform.as_ref(),
5523                        *nu,
5524                        *include_intercept,
5525                        aniso_for_penalty,
5526                        effective_ls,
5527                    )
5528                    .map_err(|e| e.to_string())?;
5529                (penalties, nullspace_dims)
5530            }
5531            BasisMetadata::ThinPlate {
5532                centers,
5533                identifiability_transform,
5534                radial_reparam,
5535                ..
5536            } => {
5537                let ls = ls_opt.ok_or_else(|| {
5538                    "thin-plate n-free penalty re-key requires a finite length-scale".to_string()
5539                })?;
5540                let double_penalty = match &termspec.basis {
5541                    SmoothBasisSpec::ThinPlate { spec, .. } => spec.double_penalty,
5542                    _ => false,
5543                };
5544                gam_terms::basis::thin_plate_penalties_at_length_scale(
5545                    centers.view(),
5546                    identifiability_transform.as_ref(),
5547                    radial_reparam.as_ref(),
5548                    ls,
5549                    double_penalty,
5550                    &mut self.basisworkspace,
5551                )
5552                .map_err(|e| e.to_string())?
5553            }
5554            other => {
5555                return Err(format!(
5556                    "n-free penalty re-key unsupported for basis metadata {:?}",
5557                    std::mem::discriminant(other)
5558                ));
5559            }
5560        };
5561        // The frozen collection penalties for THIS term are the templates whose
5562        // ψ-invariant structure (col_range / prior_mean / structure_hint / op)
5563        // we keep, swapping only the numeric `local`. For a single-spatial-term
5564        // collection the term owns the whole penalty list.
5565        let templates = &self.design.penalties;
5566        if templates.len() != locals.len() {
5567            return Err(format!(
5568                "n-free penalty re-key produced {} blocks but the frozen design carries {} \
5569                 — penalty topology is not ψ-stable",
5570                locals.len(),
5571                templates.len()
5572            ));
5573        }
5574        let specs: Vec<gam_solve::estimate::PenaltySpec> = templates
5575            .iter()
5576            .zip(locals.into_iter())
5577            .map(|(tmpl, local)| gam_solve::estimate::PenaltySpec::Block {
5578                local,
5579                col_range: tmpl.col_range.clone(),
5580                prior_mean: tmpl.prior_mean.clone(),
5581                structure_hint: tmpl.structure_hint.clone(),
5582                op: tmpl.op.clone(),
5583            })
5584            .collect();
5585        gam_terms::construction::canonicalize_penalty_specs(
5586            &specs,
5587            &nullspace_dims,
5588            p_total,
5589            "nfree-psi-penalty",
5590        )
5591        .map_err(|e| e.to_string())
5592    }
5593
5594    fn canonical_penalty_derivatives_at_psi(
5595        &mut self,
5596        spatial_terms: &[usize],
5597        psi: &[f64],
5598    ) -> Result<(Range<usize>, usize, Vec<Array2<f64>>), String> {
5599        if spatial_terms.len() != 1 {
5600            return Err(format!(
5601                "n-free penalty derivative re-key requires exactly one spatial term, found {}",
5602                spatial_terms.len()
5603            ));
5604        }
5605        let term_idx = spatial_terms[0];
5606        let (ls_opt, aniso_from_psi) = spatial_term_psi_to_length_scale_and_aniso(psi);
5607        let termspec = self.spec.smooth_terms.get(term_idx).ok_or_else(|| {
5608            format!("spatial term {term_idx} out of range for n-free penalty derivative")
5609        })?;
5610        let term = self
5611            .design
5612            .smooth
5613            .terms
5614            .get(term_idx)
5615            .ok_or_else(|| format!("realized smooth term {term_idx} out of range"))?;
5616        let p_total = self.design.design.ncols();
5617        let smooth_start = p_total.saturating_sub(self.design.smooth.total_smooth_cols());
5618        let global_range =
5619            (smooth_start + term.coeff_range.start)..(smooth_start + term.coeff_range.end);
5620
5621        let locals = match &term.metadata {
5622            BasisMetadata::Duchon {
5623                centers,
5624                identifiability_transform,
5625                operator_collocation_points,
5626                power,
5627                nullspace_order,
5628                aniso_log_scales,
5629                input_scales,
5630                radial_reparam,
5631                ..
5632            } => {
5633                let mut spec = match &termspec.basis {
5634                    SmoothBasisSpec::Duchon { spec, .. } => spec.clone(),
5635                    _ => {
5636                        return Err(
5637                            "Duchon n-free penalty derivative requires a Duchon term spec"
5638                                .to_string(),
5639                        );
5640                    }
5641                };
5642                let effective_ls = match input_scales.as_deref() {
5643                    Some(scales) => {
5644                        compensate_optional_length_scale_for_standardization(ls_opt, scales)
5645                    }
5646                    None => ls_opt,
5647                };
5648                spec.length_scale = effective_ls;
5649                spec.power = *power;
5650                spec.nullspace_order = *nullspace_order;
5651                spec.aniso_log_scales = aniso_log_scales.clone();
5652                // #1355: replay the frozen data-metric reparam so the n-free
5653                // penalty ψ-derivative matches the rotated forward penalty.
5654                spec.radial_reparam = radial_reparam.clone();
5655                if spec.length_scale.is_none() {
5656                    return Err(
5657                        "Duchon n-free penalty derivative requires a hybrid length-scale"
5658                            .to_string(),
5659                    );
5660                }
5661                let collocation = operator_collocation_points
5662                    .as_ref()
5663                    .map(|points| points.view())
5664                    .unwrap_or_else(|| centers.view());
5665                let (_native_sources, mut first, _native_second) =
5666                    gam_terms::basis::build_duchon_native_penalty_psi_derivatives(
5667                        centers.view(),
5668                        &spec,
5669                        identifiability_transform.as_ref(),
5670                        &mut self.basisworkspace,
5671                    )
5672                    .map_err(|e| e.to_string())?;
5673                let (_operator_sources, operator_first, _operator_second) =
5674                    gam_terms::basis::build_duchon_operator_penalty_psi_derivatives(
5675                        collocation,
5676                        centers.view(),
5677                        &spec,
5678                        identifiability_transform.as_ref(),
5679                        &mut self.basisworkspace,
5680                    )
5681                    .map_err(|e| e.to_string())?;
5682                first.extend(operator_first);
5683                first
5684            }
5685            BasisMetadata::Matern {
5686                centers,
5687                periodic,
5688                nu,
5689                include_intercept,
5690                identifiability_transform,
5691                aniso_log_scales,
5692                input_scales,
5693                ..
5694            } => {
5695                let ls = ls_opt.ok_or_else(|| {
5696                    "Matérn n-free penalty derivative requires a finite length-scale".to_string()
5697                })?;
5698                let effective_ls = match input_scales.as_deref() {
5699                    Some(scales) => compensate_length_scale_for_standardization(ls, scales),
5700                    None => ls,
5701                };
5702                let penalty_centers =
5703                    gam_terms::basis::expand_periodic_centers(&centers.to_owned(), periodic.as_deref())
5704                        .map_err(|e| e.to_string())?;
5705                let aniso_for_penalty = aniso_from_psi.as_deref().or(aniso_log_scales.as_deref());
5706                let (first, _second) = gam_terms::basis::build_matern_operator_penalty_psi_derivatives(
5707                    penalty_centers.view(),
5708                    effective_ls,
5709                    *nu,
5710                    *include_intercept,
5711                    identifiability_transform.as_ref(),
5712                    aniso_for_penalty,
5713                )
5714                .map_err(|e| e.to_string())?;
5715                first
5716            }
5717            BasisMetadata::ThinPlate {
5718                centers,
5719                identifiability_transform,
5720                radial_reparam,
5721                ..
5722            } => {
5723                let ls = ls_opt.ok_or_else(|| {
5724                    "thin-plate n-free penalty derivative requires a finite length-scale"
5725                        .to_string()
5726                })?;
5727                let mut spec = match &termspec.basis {
5728                    SmoothBasisSpec::ThinPlate { spec, .. } => spec.clone(),
5729                    _ => {
5730                        return Err(
5731                            "thin-plate n-free penalty derivative requires a ThinPlate term spec"
5732                                .to_string(),
5733                        );
5734                    }
5735                };
5736                spec.length_scale = ls;
5737                if spec.radial_reparam.is_none() {
5738                    spec.radial_reparam = radial_reparam.clone();
5739                }
5740                let (primary, _primary_second) =
5741                    gam_terms::basis::build_thin_plate_penalty_psi_derivativeswithworkspace(
5742                        centers.view(),
5743                        &spec,
5744                        identifiability_transform.as_ref(),
5745                        &mut self.basisworkspace,
5746                    )
5747                    .map_err(|e| e.to_string())?;
5748                if self.design.penalties.len() > 1 {
5749                    vec![primary.clone(), Array2::<f64>::zeros(primary.raw_dim())]
5750                } else {
5751                    vec![primary]
5752                }
5753            }
5754            other => {
5755                return Err(format!(
5756                    "n-free penalty derivative re-key unsupported for basis metadata {:?}",
5757                    std::mem::discriminant(other)
5758                ));
5759            }
5760        };
5761        if locals.len() != self.design.penalties.len() {
5762            return Err(format!(
5763                "n-free penalty derivative re-key produced {} blocks but the frozen design carries {} \
5764                 — penalty topology is not ψ-stable",
5765                locals.len(),
5766                self.design.penalties.len()
5767            ));
5768        }
5769        Ok((global_range, p_total, locals))
5770    }
5771
5772    fn apply_log_kappa(
5773        &mut self,
5774        log_kappa: &SpatialLogKappaCoords,
5775        term_indices: &[usize],
5776    ) -> Result<(), String> {
5777        if term_indices.len() != log_kappa.dims_per_term().len() {
5778            return Err(SmoothError::dimension_mismatch(format!(
5779                "incremental realizer log-kappa term mismatch: term_indices={}, dims_per_term={}",
5780                term_indices.len(),
5781                log_kappa.dims_per_term().len()
5782            ))
5783            .into());
5784        }
5785
5786        let mut any_changed = false;
5787        for (slot, &term_idx) in term_indices.iter().enumerate() {
5788            any_changed |= self.apply_log_kappa_to_term(term_idx, log_kappa.term_slice(slot))?;
5789        }
5790
5791        if any_changed {
5792            self.refresh_full_design_operator()?;
5793            rebuild_smooth_auxiliary_state(
5794                &mut self.design.smooth,
5795                &self.dropped_penaltyinfo_by_term,
5796            )?;
5797            rebuild_term_collection_auxiliary_state(&self.spec, &mut self.design)?;
5798            self.design_revision = self.design_revision.wrapping_add(1);
5799        }
5800        Ok(())
5801    }
5802
5803    fn apply_log_kappa_to_term(&mut self, term_idx: usize, psi: &[f64]) -> Result<bool, String> {
5804        if !spatial_term_supports_hyper_optimization(&self.spec, term_idx) {
5805            return Err(SmoothError::invalid_config(format!(
5806                "incremental realizer term {term_idx} does not expose spatial hyperparameters"
5807            ))
5808            .into());
5809        }
5810        // Measure-jet ψ slots are dial coordinates, not log-κ (dial docs:
5811        // the MEASURE_JET_PSI_* bounds block); route through the dial setter
5812        // so the κ-translation below never misreads them as log-scales.
5813        let measure_jet_term = measure_jet_term_spec(&self.spec, term_idx).is_some();
5814        // Constant-curvature ψ is the raw signed curvature κ, NOT a log-scale;
5815        // route through the κ setter so `spatial_term_psi_to_length_scale_and_aniso`
5816        // never misreads it (and never hits the "no length scale" rejection).
5817        let constant_curvature_term = constant_curvature_term_spec(&self.spec, term_idx).is_some();
5818        let mut next_length_scale = None;
5819        let mut next_aniso: Option<Vec<f64>> = None;
5820        if measure_jet_term {
5821            if !set_measure_jet_psi_dials(&mut self.spec, term_idx, psi)
5822                .map_err(|e| e.to_string())?
5823            {
5824                return Ok(false);
5825            }
5826        } else if constant_curvature_term {
5827            if !set_constant_curvature_kappa(&mut self.spec, term_idx, psi)
5828                .map_err(|e| e.to_string())?
5829            {
5830                return Ok(false);
5831            }
5832        } else {
5833            let current_length_scale = get_spatial_length_scale(&self.spec, term_idx);
5834            let current_aniso = get_spatial_aniso_log_scales(&self.spec, term_idx);
5835            let (ls, eta) = spatial_term_psi_to_length_scale_and_aniso(psi);
5836            next_length_scale = ls;
5837            next_aniso = eta;
5838            let same_length = spatial_length_scale_matches(current_length_scale, next_length_scale);
5839            let same_aniso = spatial_aniso_matches(current_aniso.as_deref(), next_aniso.as_deref());
5840            if same_length && same_aniso {
5841                return Ok(false);
5842            }
5843            if let Some(length_scale) = next_length_scale {
5844                set_spatial_length_scale(&mut self.spec, term_idx, length_scale)
5845                    .map_err(|e| e.to_string())?;
5846            }
5847            if let Some(eta) = next_aniso.clone() {
5848                set_spatial_aniso_log_scales(&mut self.spec, term_idx, eta)
5849                    .map_err(|e| e.to_string())?;
5850            }
5851        }
5852
5853        // Pick the spec to drive the rebuild. If the per-term geometry cache
5854        // is populated, it carries already-resolved centers
5855        // (`CenterStrategy::UserProvided`) and frozen `input_scales`; reusing
5856        // it short-circuits `select_centers_by_strategy` (KMeans /
5857        // FarthestPoint / EqualMass cluster searches) and
5858        // `compute_spatial_input_scales` (per-axis variance over n rows) in
5859        // the family builders. Centers in the cached spec live in
5860        // standardized coordinates (matching the cached `input_scales`), so
5861        // the same standardization + kernel path runs without recomputation
5862        // of the geometry.
5863        let geometry_slot = self
5864            .spatial_realization_geometry
5865            .get(term_idx)
5866            .ok_or_else(|| format!("incremental realizer geometry slot {term_idx} out of range"))?;
5867        let mut build_spec = match geometry_slot {
5868            Some(cached) => cached.clone(),
5869            None => self
5870                .spec
5871                .smooth_terms
5872                .get(term_idx)
5873                .ok_or_else(|| format!("incremental realizer smooth term {term_idx} out of range"))?
5874                .clone(),
5875        };
5876        if measure_jet_term {
5877            // The cached build spec carries the frozen geometry (UserProvided
5878            // barycenter nodes, frozen quadrature + transform); only the
5879            // dials move per trial.
5880            set_single_term_measure_jet_psi_dials(&mut build_spec, psi)
5881                .map_err(|e| e.to_string())?;
5882        } else if constant_curvature_term {
5883            // The cached build spec carries the κ-fixed geometry (UserProvided
5884            // centers, frozen ℓ and constraint transform); only κ moves per
5885            // trial, written through the raw-κ setter to match the collection
5886            // write-back above.
5887            set_single_term_constant_curvature_kappa(&mut build_spec, psi)
5888                .map_err(|e| e.to_string())?;
5889        } else {
5890            if let Some(length_scale) = next_length_scale {
5891                set_single_term_spatial_length_scale(&mut build_spec, length_scale)
5892                    .map_err(|e| e.to_string())?;
5893            }
5894            if let Some(eta) = next_aniso {
5895                set_single_term_spatial_aniso_log_scales(&mut build_spec, eta)
5896                    .map_err(|e| e.to_string())?;
5897            }
5898        }
5899
5900        let termname = build_spec.name.clone();
5901        let local = build_single_local_smooth_term(
5902            self.data,
5903            &build_spec,
5904            &mut self.basisworkspace,
5905        )
5906        .map_err(|e| {
5907            format!(
5908                "failed to rebuild smooth term '{termname}' during incremental κ realization: {e}"
5909            )
5910        })?;
5911
5912        // Populate the geometry cache from the realized metadata on first use.
5913        // Family auto-promotion (ThinPlate -> Duchon) is detected as a basis /
5914        // metadata mismatch in `freeze_geometry_from_metadata` and leaves the
5915        // cache empty so the next call re-tries with the (now stable) family.
5916        if self.spatial_realization_geometry[term_idx].is_none()
5917            && let Some(frozen) = freeze_geometry_from_metadata(&build_spec, &local.metadata)
5918        {
5919            // Mirror the frozen identifiability (pinned `Z` + double-penalty
5920            // nullspace-shrinkage decision, #787/#860/#1122) back onto the
5921            // collection spec the analytic ψ-gradient reads
5922            // (`try_build_spatial_log_kappa_hyper_dirs(self.spec(), …)`). The
5923            // value rebuild consumes the cached `build_spec`, so without this
5924            // copy the gradient would keep re-running the κ-DEPENDENT spectral
5925            // test on the un-frozen collection spec while the value uses the
5926            // frozen decision — re-introducing the very objective↔gradient
5927            // desync the freeze removes. Pinning both to the same frozen
5928            // transform keeps the per-trial value and its analytic gradient on
5929            // one fixed `Z` and one fixed null dimension `r`.
5930            if let (
5931                SmoothBasisSpec::Matern {
5932                    spec: frozen_spec, ..
5933                },
5934                Some(SmoothBasisSpec::Matern {
5935                    spec: live_spec, ..
5936                }),
5937            ) = (
5938                &frozen.basis,
5939                self.spec
5940                    .smooth_terms
5941                    .get_mut(term_idx)
5942                    .map(|t| &mut t.basis),
5943            ) {
5944                live_spec.identifiability = frozen_spec.identifiability.clone();
5945                live_spec.center_strategy = frozen_spec.center_strategy.clone();
5946            }
5947            self.spatial_realization_geometry[term_idx] = Some(frozen);
5948        }
5949
5950        let realization = wrap_local_build_as_realization(local, &build_spec)?;
5951        self.replace_term_realization(term_idx, realization)?;
5952        Ok(true)
5953    }
5954
5955    fn replace_term_realization(
5956        &mut self,
5957        term_idx: usize,
5958        realization: SingleSmoothTermRealization,
5959    ) -> Result<(), String> {
5960        let t_replace = std::time::Instant::now();
5961        let SingleSmoothTermRealization {
5962            design_local,
5963            term,
5964            dropped_penaltyinfo,
5965        } = realization;
5966        let SmoothTerm {
5967            name,
5968            penalties_local,
5969            nullspace_dims,
5970            penaltyinfo_local,
5971            metadata,
5972            lower_bounds_local,
5973            linear_constraints_local,
5974            joint_null_rotation,
5975            ..
5976        } = term;
5977        let coeff_range = self
5978            .design
5979            .smooth
5980            .terms
5981            .get(term_idx)
5982            .ok_or_else(|| format!("incremental realizer smooth term {term_idx} out of range"))?
5983            .coeff_range
5984            .clone();
5985        if design_local.ncols() != coeff_range.len() {
5986            return Err(SmoothError::dimension_mismatch(format!(
5987                "incremental realizer width mismatch for term {}: rebuilt_cols={}, cached_cols={}",
5988                term_idx,
5989                design_local.ncols(),
5990                coeff_range.len()
5991            ))
5992            .into());
5993        }
5994        if design_local.nrows() != self.design.design.nrows() {
5995            return Err(SmoothError::dimension_mismatch(format!(
5996                "incremental realizer row mismatch for term {}: rebuilt_rows={}, design_rows={}",
5997                term_idx,
5998                design_local.nrows(),
5999                self.design.design.nrows()
6000            ))
6001            .into());
6002        }
6003
6004        let active_penaltyinfo = penaltyinfo_local
6005            .iter()
6006            .filter(|info| info.active)
6007            .cloned()
6008            .collect::<Vec<_>>();
6009        let smooth_penalty_range = self
6010            .smooth_penalty_ranges
6011            .get(term_idx)
6012            .ok_or_else(|| {
6013                format!("incremental realizer missing smooth penalty range for term {term_idx}")
6014            })?
6015            .clone();
6016        let full_penalty_range = self
6017            .full_penalty_ranges
6018            .get(term_idx)
6019            .ok_or_else(|| {
6020                format!("incremental realizer missing full penalty range for term {term_idx}")
6021            })?
6022            .clone();
6023        if active_penaltyinfo.len() != smooth_penalty_range.len()
6024            || penalties_local.len() != smooth_penalty_range.len()
6025            || nullspace_dims.len() != smooth_penalty_range.len()
6026        {
6027            return Err(SmoothError::dimension_mismatch(format!(
6028                "incremental realizer topology changed for term '{}': penalties={}, infos={}, nullspaces={}, cached_penalties={}",
6029                name,
6030                penalties_local.len(),
6031                active_penaltyinfo.len(),
6032                nullspace_dims.len(),
6033                smooth_penalty_range.len()
6034            ))
6035            .into());
6036        }
6037
6038        self.design.smooth.term_designs[term_idx] = design_local;
6039
6040        for (offset, penalty_local) in penalties_local.iter().enumerate() {
6041            let smooth_penalty_idx = smooth_penalty_range.start + offset;
6042            let full_penalty_idx = full_penalty_range.start + offset;
6043            let nullspace_dim = nullspace_dims[offset];
6044            let penalty_info = active_penaltyinfo[offset].clone();
6045
6046            if penalty_local.nrows() != coeff_range.len()
6047                || penalty_local.ncols() != coeff_range.len()
6048            {
6049                return Err(SmoothError::dimension_mismatch(format!(
6050                    "incremental realizer penalty shape mismatch for term '{}' penalty {}: \
6051                     penalty is {}x{} but coeff_range has {} columns",
6052                    name,
6053                    offset,
6054                    penalty_local.nrows(),
6055                    penalty_local.ncols(),
6056                    coeff_range.len()
6057                ))
6058                .into());
6059            }
6060
6061            let smooth_penalty = self
6062                .design
6063                .smooth
6064                .penalties
6065                .get_mut(smooth_penalty_idx)
6066                .ok_or_else(|| {
6067                    format!(
6068                        "incremental realizer smooth penalty {} out of range for term {}",
6069                        smooth_penalty_idx, term_idx
6070                    )
6071                })?;
6072            // With per-term block-local penalties, col_range already targets
6073            // this specific term, so .local is p_k × p_k.
6074            smooth_penalty.local.assign(penalty_local);
6075
6076            let full_bp = self
6077                .design
6078                .penalties
6079                .get_mut(full_penalty_idx)
6080                .ok_or_else(|| {
6081                    format!(
6082                        "incremental realizer full penalty {} out of range for term {}",
6083                        full_penalty_idx, term_idx
6084                    )
6085                })?;
6086            // With per-term block-local penalties, col_range already targets
6087            // this specific term, so .local is p_k × p_k.
6088            full_bp.local.assign(penalty_local);
6089
6090            self.design.smooth.nullspace_dims[smooth_penalty_idx] = nullspace_dim;
6091            self.design.nullspace_dims[full_penalty_idx] = nullspace_dim;
6092
6093            self.design.smooth.penaltyinfo[smooth_penalty_idx].global_index = smooth_penalty_idx;
6094            self.design.smooth.penaltyinfo[smooth_penalty_idx].termname = Some(name.clone());
6095            self.design.smooth.penaltyinfo[smooth_penalty_idx].penalty = penalty_info.clone();
6096
6097            self.design.penaltyinfo[full_penalty_idx].global_index = full_penalty_idx;
6098            self.design.penaltyinfo[full_penalty_idx].termname = Some(name.clone());
6099            self.design.penaltyinfo[full_penalty_idx].penalty = penalty_info;
6100        }
6101
6102        let target_term = self.design.smooth.terms.get_mut(term_idx).ok_or_else(|| {
6103            format!("incremental realizer smooth term {term_idx} disappeared during replacement")
6104        })?;
6105        target_term.penalties_local = penalties_local;
6106        target_term.nullspace_dims = nullspace_dims;
6107        target_term.penaltyinfo_local = penaltyinfo_local;
6108        target_term.metadata = metadata;
6109        target_term.lower_bounds_local = lower_bounds_local;
6110        target_term.linear_constraints_local = linear_constraints_local;
6111        target_term.joint_null_rotation = joint_null_rotation;
6112        self.dropped_penaltyinfo_by_term[term_idx] = dropped_penaltyinfo;
6113        log::info!(
6114            "[STAGE] smooth basis rebuild (term {}, '{}', cols={}): {:.3}s",
6115            term_idx,
6116            target_term.name,
6117            coeff_range.len(),
6118            t_replace.elapsed().as_secs_f64(),
6119        );
6120        Ok(())
6121    }
6122
6123    fn refresh_full_design_operator(&mut self) -> Result<(), String> {
6124        let mut blocks = Vec::<DesignBlock>::with_capacity(
6125            self.fixed_blocks.len() + self.design.smooth.term_designs.len(),
6126        );
6127        blocks.extend(self.fixed_blocks.iter().cloned());
6128        for term_design in &self.design.smooth.term_designs {
6129            blocks.push(DesignBlock::from(term_design));
6130        }
6131        self.design.design = assemble_term_collection_design_matrix(blocks)
6132            .map_err(|e| format!("failed to refresh term-collection design: {e}"))?;
6133        Ok(())
6134    }
6135}
6136
6137fn build_term_collection_fixed_blocks(
6138    data: ArrayView2<'_, f64>,
6139    spec: &TermCollectionSpec,
6140) -> Result<Vec<DesignBlock>, BasisError> {
6141    let mut blocks = Vec::<DesignBlock>::new();
6142    if !term_collection_has_one_sided_anchored_bspline(spec) {
6143        blocks.push(DesignBlock::Intercept(data.nrows()));
6144    }
6145
6146    if !spec.linear_terms.is_empty() {
6147        let mut linear_block = Array2::<f64>::zeros((data.nrows(), spec.linear_terms.len()));
6148        for (j, linear) in spec.linear_terms.iter().enumerate() {
6149            // Single shared realizer: numeric product gated by any
6150            // categorical-level indicators (factor-aware `:` interaction),
6151            // mirroring `build_term_collection_design_inner`.
6152            let column = linear
6153                .realized_design_column(data)
6154                .map_err(BasisError::InvalidInput)?;
6155            linear_block.column_mut(j).assign(&column);
6156        }
6157        blocks.push(DesignBlock::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
6158            linear_block,
6159        )));
6160    }
6161
6162    for term in &spec.random_effect_terms {
6163        let block = build_random_effect_block(data, term)?;
6164        let re_op = RandomEffectOperator::new(block.group_ids, block.num_groups);
6165        blocks.push(DesignBlock::RandomEffect(Arc::new(re_op)));
6166    }
6167
6168    Ok(blocks)
6169}
6170
6171// ---------------------------------------------------------------------------
6172// N-block spatial length-scale optimizer.
6173// ---------------------------------------------------------------------------
6174
/// Output bundle of the spatial length-scale optimizer, generic over the fit
/// payload type `FitOut`.
pub struct SpatialLengthScaleOptimizationResult<FitOut> {
    /// Term-collection specs after optimization (presumably one per block —
    /// confirm against the optimizer entry point).
    pub resolved_specs: Vec<TermCollectionSpec>,
    /// Term-collection designs (presumably parallel to `resolved_specs` —
    /// confirm against the optimizer entry point).
    pub designs: Vec<TermCollectionDesign>,
    /// Caller-defined fit payload.
    pub fit: FitOut,
    /// Optional timing record for the optimization; `None` when not collected.
    pub timing: Option<SpatialLengthScaleOptimizationTiming>,
}
6181
/// Exact-joint hyper-parameter setup for N-block spatial length-scale optimization.
///
/// Stores a seed plus lower and upper bounds for three coordinate groups. The
/// `theta0`, `lower` and `upper` accessors concatenate them in the order
/// `[rho | log_kappa | auxiliary]`.
#[derive(Debug, Clone)]
pub struct ExactJointHyperSetup {
    /// Seed for the rho segment; `new` clamps it into `[rho_lower, rho_upper]`
    /// and replaces non-finite entries with `0.0` clamped into the same box.
    rho0: Array1<f64>,
    /// Lower bounds for the rho segment.
    rho_lower: Array1<f64>,
    /// Upper bounds for the rho segment.
    rho_upper: Array1<f64>,
    /// Seed for the log-kappa segment.
    log_kappa0: SpatialLogKappaCoords,
    /// Lower bounds for the log-kappa segment.
    log_kappa_lower: SpatialLogKappaCoords,
    /// Upper bounds for the log-kappa segment.
    log_kappa_upper: SpatialLogKappaCoords,
    /// Seed for the trailing auxiliary segment; empty unless `with_auxiliary`
    /// was called, which sanitizes it the same way as `rho0`.
    auxiliary0: Array1<f64>,
    /// Lower bounds for the auxiliary segment.
    auxiliary_lower: Array1<f64>,
    /// Upper bounds for the auxiliary segment.
    auxiliary_upper: Array1<f64>,
}
6195
6196impl ExactJointHyperSetup {
6197    fn sanitize_rho_seed(
6198        rho0: Array1<f64>,
6199        rho_lower: &Array1<f64>,
6200        rho_upper: &Array1<f64>,
6201    ) -> Array1<f64> {
6202        Array1::from_iter(rho0.iter().enumerate().map(|(idx, &value)| {
6203            let lo = rho_lower[idx];
6204            let hi = rho_upper[idx];
6205            let fallback = 0.0_f64.clamp(lo, hi);
6206            if value.is_finite() {
6207                value.clamp(lo, hi)
6208            } else {
6209                fallback
6210            }
6211        }))
6212    }
6213
6214    pub(crate) fn new(
6215        rho0: Array1<f64>,
6216        rho_lower: Array1<f64>,
6217        rho_upper: Array1<f64>,
6218        log_kappa0: SpatialLogKappaCoords,
6219        log_kappa_lower: SpatialLogKappaCoords,
6220        log_kappa_upper: SpatialLogKappaCoords,
6221    ) -> Self {
6222        let rho0 = Self::sanitize_rho_seed(rho0, &rho_lower, &rho_upper);
6223        Self {
6224            rho0,
6225            rho_lower,
6226            rho_upper,
6227            log_kappa0,
6228            log_kappa_lower,
6229            log_kappa_upper,
6230            auxiliary0: Array1::zeros(0),
6231            auxiliary_lower: Array1::zeros(0),
6232            auxiliary_upper: Array1::zeros(0),
6233        }
6234    }
6235
6236    pub(crate) fn with_auxiliary(
6237        mut self,
6238        auxiliary0: Array1<f64>,
6239        auxiliary_lower: Array1<f64>,
6240        auxiliary_upper: Array1<f64>,
6241    ) -> Self {
6242        assert_eq!(
6243            auxiliary0.len(),
6244            auxiliary_lower.len(),
6245            "auxiliary lower bound length mismatch"
6246        );
6247        assert_eq!(
6248            auxiliary0.len(),
6249            auxiliary_upper.len(),
6250            "auxiliary upper bound length mismatch"
6251        );
6252        self.auxiliary0 = Self::sanitize_rho_seed(auxiliary0, &auxiliary_lower, &auxiliary_upper);
6253        self.auxiliary_lower = auxiliary_lower;
6254        self.auxiliary_upper = auxiliary_upper;
6255        self
6256    }
6257
6258    pub(crate) fn rho_dim(&self) -> usize {
6259        self.rho0.len()
6260    }
6261
6262    pub(crate) fn log_kappa_dim(&self) -> usize {
6263        self.log_kappa0.len()
6264    }
6265
6266    pub(crate) fn auxiliary_dim(&self) -> usize {
6267        self.auxiliary0.len()
6268    }
6269
6270    pub(crate) fn theta0(&self) -> Array1<f64> {
6271        let mut out =
6272            Array1::<f64>::zeros(self.rho_dim() + self.log_kappa_dim() + self.auxiliary_dim());
6273        out.slice_mut(s![..self.rho_dim()]).assign(&self.rho0);
6274        out.slice_mut(s![self.rho_dim()..self.rho_dim() + self.log_kappa_dim()])
6275            .assign(self.log_kappa0.as_array());
6276        out.slice_mut(s![self.rho_dim() + self.log_kappa_dim()..])
6277            .assign(&self.auxiliary0);
6278        out
6279    }
6280
6281    pub(crate) fn lower(&self) -> Array1<f64> {
6282        let mut out =
6283            Array1::<f64>::zeros(self.rho_dim() + self.log_kappa_dim() + self.auxiliary_dim());
6284        out.slice_mut(s![..self.rho_dim()]).assign(&self.rho_lower);
6285        out.slice_mut(s![self.rho_dim()..self.rho_dim() + self.log_kappa_dim()])
6286            .assign(self.log_kappa_lower.as_array());
6287        out.slice_mut(s![self.rho_dim() + self.log_kappa_dim()..])
6288            .assign(&self.auxiliary_lower);
6289        out
6290    }
6291
6292    pub(crate) fn upper(&self) -> Array1<f64> {
6293        let mut out =
6294            Array1::<f64>::zeros(self.rho_dim() + self.log_kappa_dim() + self.auxiliary_dim());
6295        out.slice_mut(s![..self.rho_dim()]).assign(&self.rho_upper);
6296        out.slice_mut(s![self.rho_dim()..self.rho_dim() + self.log_kappa_dim()])
6297            .assign(self.log_kappa_upper.as_array());
6298        out.slice_mut(s![self.rho_dim() + self.log_kappa_dim()..])
6299            .assign(&self.auxiliary_upper);
6300        out
6301    }
6302
6303    /// Per-term dimensionality layout for the psi block.
6304    pub(crate) fn log_kappa_dims_per_term(&self) -> Vec<usize> {
6305        self.log_kappa0.dims_per_term().to_vec()
6306    }
6307}
6308
/// N-block design cache for exact-joint spatial length-scale optimization.
///
/// Each block owns a `FrozenTermCollectionIncrementalRealizer` and a list of
/// spatial term indices within that block's spec. The cache splits the
/// combined psi vector into per-block slices using precomputed offsets.
struct ExactJointDesignCache<'d> {
    /// One incremental realizer per block.
    realizers: Vec<FrozenTermCollectionIncrementalRealizer<'d>>,
    /// Per block, the term indices handed to that block's `apply_log_kappa`.
    block_term_indices: Vec<Vec<usize>>,
    /// The theta the realizers were last brought up to date with, if any.
    current_theta: Option<Array1<f64>>,
    /// Cost memo; cleared whenever `ensure_theta` applies a new theta.
    last_cost: Option<f64>,
    /// Full-evaluation memo; cleared whenever `ensure_theta` applies a new
    /// theta. (Presumably cost, gradient and Hessian — confirm against
    /// `impl_exact_joint_theta_memo!`.)
    last_eval: Option<(
        f64,
        Array1<f64>,
        gam_problem::HessianResult,
    )>,
    /// Number of leading theta entries that precede the log-kappa segment.
    rho_dim: usize,
    /// Dimension layout passed to `SpatialLogKappaCoords::from_theta_tail_with_dims`.
    all_dims: Vec<usize>,
    /// Sum of `all_dims`: the length of the log-kappa segment of theta.
    log_kappa_dim: usize,
    /// Per block, the number of entries in `block_term_indices`; used as the
    /// split point when carving the combined coordinates into block slices.
    block_term_counts: Vec<usize>,
}
6329
6330impl<'d> ExactJointDesignCache<'d> {
6331    fn new(
6332        data: ArrayView2<'d, f64>,
6333        blocks: Vec<(TermCollectionSpec, TermCollectionDesign, Vec<usize>)>,
6334        rho_dim: usize,
6335        all_dims: Vec<usize>,
6336    ) -> Result<Self, String> {
6337        let n_blocks = blocks.len();
6338        let mut realizers = Vec::with_capacity(n_blocks);
6339        let mut block_term_indices = Vec::with_capacity(n_blocks);
6340        let mut block_term_counts = Vec::with_capacity(n_blocks);
6341
6342        for (spec, design, terms) in blocks {
6343            block_term_counts.push(terms.len());
6344            block_term_indices.push(terms);
6345            realizers.push(FrozenTermCollectionIncrementalRealizer::new(
6346                data, spec, design,
6347            )?);
6348        }
6349
6350        Ok(Self {
6351            realizers,
6352            block_term_indices,
6353            current_theta: None,
6354            last_cost: None,
6355            last_eval: None,
6356            rho_dim,
6357            log_kappa_dim: all_dims.iter().sum(),
6358            all_dims,
6359            block_term_counts,
6360        })
6361    }
6362
6363    fn ensure_theta(&mut self, theta: &Array1<f64>) -> Result<(), String> {
6364        if self
6365            .current_theta
6366            .as_ref()
6367            .is_some_and(|cached| theta_values_match(cached, theta))
6368        {
6369            return Ok(());
6370        }
6371
6372        let t_ensure = std::time::Instant::now();
6373        let kappa_theta_len = self.rho_dim + self.log_kappa_dim;
6374        if theta.len() < kappa_theta_len {
6375            return Err(SmoothError::dimension_mismatch(format!(
6376                "exact-joint theta length mismatch: got {}, expected at least {} (rho_dim={}, log_kappa_dim={})",
6377                theta.len(),
6378                kappa_theta_len,
6379                self.rho_dim,
6380                self.log_kappa_dim
6381            ))
6382            .into());
6383        }
6384        let theta_kappa = theta.slice(s![..kappa_theta_len]).to_owned();
6385        let full_log_kappa = SpatialLogKappaCoords::from_theta_tail_with_dims(
6386            &theta_kappa,
6387            self.rho_dim,
6388            self.all_dims.clone(),
6389        );
6390
6391        // Split the full log_kappa into per-block sub-coords using split_at.
6392        // We split from the front iteratively: after extracting block 0..N-2,
6393        // the remainder is the last block.
6394        let n = self.realizers.len();
6395        let mut remaining = full_log_kappa;
6396        for block_idx in 0..n {
6397            let count = self.block_term_counts[block_idx];
6398            if block_idx < n - 1 {
6399                let (block_lk, rest) = remaining.split_at(count);
6400                self.realizers[block_idx]
6401                    .apply_log_kappa(&block_lk, &self.block_term_indices[block_idx])?;
6402                remaining = rest;
6403            } else {
6404                // Last block gets the remainder.
6405                self.realizers[block_idx]
6406                    .apply_log_kappa(&remaining, &self.block_term_indices[block_idx])?;
6407            }
6408        }
6409
6410        log::info!(
6411            "[STAGE] ensure_theta (n-block, {} blocks, {} realizers): {:.3}s",
6412            n,
6413            self.realizers.len(),
6414            t_ensure.elapsed().as_secs_f64(),
6415        );
6416        self.current_theta = Some(theta.clone());
6417        self.last_cost = None;
6418        self.last_eval = None;
6419        Ok(())
6420    }
6421
6422    impl_exact_joint_theta_memo!();
6423
6424    /// Cache a cost-only result. Called after `ensure_theta(theta)` for
6425    /// line-search probes that pay only for the cost evaluation. We
6426    /// intentionally do not populate `last_eval` because no gradient was
6427    /// computed; the next outer evaluation at this θ will recompute
6428    /// (V, ∇V) via `evaluate_with_order` if the optimizer asks for it.
6429    fn store_cost_only(&mut self, theta: &Array1<f64>, cost: f64) {
6430        if self
6431            .current_theta
6432            .as_ref()
6433            .is_some_and(|cached| theta_values_match(cached, theta))
6434        {
6435            self.last_cost = Some(cost);
6436        }
6437    }
6438
6439    fn specs(&self) -> Vec<&TermCollectionSpec> {
6440        self.realizers.iter().map(|r| r.spec()).collect()
6441    }
6442
6443    fn designs(&self) -> Vec<&TermCollectionDesign> {
6444        self.realizers.iter().map(|r| r.design()).collect()
6445    }
6446
6447    /// Combined monotonic design revision across all per-block realizers.
6448    ///
6449    /// Mirrors `SingleBlockExactJointDesignCache::design_revision` for the
6450    /// n-block exact-joint path. Each realizer's `design_revision` counter
6451    /// advances iff `apply_log_kappa` actually rebuilt that block's realized
6452    /// design / smooth penalties; the wrapping sum therefore changes iff
6453    /// *any* block rebuilt. Equal values across two calls imply no realizer
6454    /// has been rebuilt in between, which is the invariant the
6455    /// `ExternalJointHyperEvaluator` canonical-penalty fast path needs.
6456    fn design_revision(&self) -> u64 {
6457        self.realizers
6458            .iter()
6459            .fold(0u64, |acc, r| acc.wrapping_add(r.design_revision()))
6460    }
6461}
6462
6463pub(crate) fn seed_risk_profile_for_likelihood_family(
6464    family: &LikelihoodSpec,
6465) -> gam_problem::SeedRiskProfile {
6466    match &family.response {
6467        ResponseFamily::Gaussian => gam_problem::SeedRiskProfile::Gaussian,
6468        ResponseFamily::RoystonParmar => gam_problem::SeedRiskProfile::Survival,
6469        ResponseFamily::Binomial
6470        | ResponseFamily::Poisson
6471        | ResponseFamily::Tweedie { .. }
6472        | ResponseFamily::NegativeBinomial { .. }
6473        | ResponseFamily::Beta { .. }
6474        | ResponseFamily::Gamma => gam_problem::SeedRiskProfile::GeneralizedLinear,
6475    }
6476}
6477
/// Joint-θ dimension above which the single-block exact-joint driver routes
/// gradient-only.
///
/// This doc owns the derivation; the routing site only compares against the
/// constant. The exact outer Hessian builds θ(θ+1)/2 pairwise hyper operators,
/// so per-eval cost grows quadratically in θ-dim — profiled:
/// `TauTauPairHyperOperator::mul_vec` dominates wall-clock at spectral-mode
/// measure-jet candidate counts (θ ≈ 9–11), while θ ≤ 8 (classic Matérn κ/η
/// fits) keeps cheap exact second-order geometry.
const EXACT_JOINT_SECOND_ORDER_THETA_CAP: usize = 8;
6486
6487fn exact_joint_seed_config(
6488    risk_profile: gam_problem::SeedRiskProfile,
6489    auxiliary_dim: usize,
6490) -> gam_problem::SeedConfig {
6491    let mut config = gam_problem::SeedConfig {
6492        risk_profile,
6493        num_auxiliary_trailing: auxiliary_dim,
6494        ..Default::default()
6495    };
6496    match risk_profile {
6497        gam_problem::SeedRiskProfile::Gaussian
6498        | gam_problem::SeedRiskProfile::GaussianLocationScale => {
6499            config.max_seeds = 4;
6500            config.seed_budget = 2;
6501        }
6502        gam_problem::SeedRiskProfile::GeneralizedLinear => {
6503            // Bernoulli marginal-slope Matérn fits use the exact-joint spatial
6504            // driver rather than the family-local BMS outer. Mirror BMS proper:
6505            // screen one principled heuristic seed deeply enough to reach the
6506            // KKT basin instead of spending minutes screening equivalent starts.
6507            config.max_seeds = 1;
6508            config.seed_budget = 1;
6509            config.screen_max_inner_iterations = 8;
6510        }
6511        gam_problem::SeedRiskProfile::Survival => {
6512            // Survival marginal-slope has an additional time/hazard block and
6513            // is the most sensitive Matérn startup regime. Keep more of the
6514            // coherent SPDE candidate manifold alive through truncation and
6515            // validate enough starts that one bad transient does not report
6516            // "no candidate seeds" before reaching a viable basin.
6517            config.max_seeds = 8;
6518            config.seed_budget = 4;
6519            config.screen_max_inner_iterations = 8;
6520        }
6521    }
6522    config
6523}
6524
#[cfg(test)]
mod exact_joint_seed_config_tests {
    use super::*;

    /// Marginal-slope style profiles (generalized-linear and survival) must get
    /// their dedicated seed budgets and the deeper screening iteration count.
    #[test]
    fn exact_joint_marginal_slope_profiles_get_deeper_startup_validation() {
        // (profile, auxiliary_dim, expected max_seeds, expected seed_budget)
        let expectations = [
            (gam_problem::SeedRiskProfile::GeneralizedLinear, 2, 1, 1),
            (gam_problem::SeedRiskProfile::Survival, 3, 8, 4),
        ];
        for &(profile, auxiliary_dim, max_seeds, seed_budget) in expectations.iter() {
            let config = exact_joint_seed_config(profile, auxiliary_dim);
            assert_eq!(config.max_seeds, max_seeds);
            assert_eq!(config.seed_budget, seed_budget);
            assert_eq!(config.screen_max_inner_iterations, 8);
            assert_eq!(config.num_auxiliary_trailing, auxiliary_dim);
        }
    }

    /// The Gaussian profile keeps the 4/2 multistart budget and leaves the
    /// screening iteration count at the `SeedConfig` default.
    #[test]
    fn exact_joint_gaussian_keeps_tight_historical_multistart_budget() {
        let default_screen_iterations =
            gam_problem::SeedConfig::default().screen_max_inner_iterations;
        let gaussian = exact_joint_seed_config(gam_problem::SeedRiskProfile::Gaussian, 1);
        assert_eq!(gaussian.max_seeds, 4);
        assert_eq!(gaussian.seed_budget, 2);
        assert_eq!(
            gaussian.screen_max_inner_iterations,
            default_screen_iterations
        );
        assert_eq!(gaussian.num_auxiliary_trailing, 1);
    }
}
6556
/// Builds the `OuterProblem` for the exact-joint spatial multistart outer solve.
///
/// The problem is configured from the arguments through the `OuterProblem`
/// builder: derivative declarations, box bounds, initial point, BFGS step
/// caps, tolerance / iteration limits, the automatic fallback policy, and a
/// seed configuration from `exact_joint_seed_config(risk_profile, auxiliary_dim)`.
/// The heuristic-lambda seed is `theta0` with its first `rho_dim` entries
/// exponentiated; the remaining entries are passed through unchanged.
///
/// Optional calibration:
/// - `profiled_objective_size = Some((n_obs, p_cols))` additionally sets the
///   objective scale, problem size, ARC initial regularization (0.25) and
///   operator initial trust radius (4.0).
/// - `screening_cap = Some(..)` installs the screening cap and enables
///   screening of the initial point.
/// - `has_constant_curvature` raises the ρ ceiling from 12.0 to `RHO_BOUND`
///   and lifts the seed grid's upper bound to match.
///
/// # Panics
/// Panics if `rho_dim` exceeds `theta0.len()` (slice of the seed vector).
pub(crate) fn exact_joint_multistart_outer_problem(
    theta0: &Array1<f64>,
    lower: &Array1<f64>,
    upper: &Array1<f64>,
    rho_dim: usize,
    auxiliary_dim: usize,
    n_params: usize,
    gradient: gam_problem::Derivative,
    hessian: gam_problem::DeclaredHessianForm,
    prefer_gradient_only: bool,
    disable_fixed_point: bool,
    risk_profile: gam_problem::SeedRiskProfile,
    tolerance: f64,
    max_iter: usize,
    // BFGS step caps split by parameter type. `bfgs_step_cap` (rho-axis cap)
    // bounds first-trial moves on log-λ; documented natural step is ≈ 5.
    // `bfgs_step_cap_psi` bounds moves on the trailing `auxiliary_dim`
    // psi-axes (kappa / aniso-log-scales), where ≈ ln 2 keeps the kernel
    // scale from oscillating across orders of magnitude per iter. Using a
    // single uniform cap (the old API) starved rho on the survival-marg-slope
    // joint solver because the psi-calibrated value (`ln 2 ≈ 0.69`) was
    // applied to log-λ, where |d|≈5 is the natural quasi-Newton magnitude.
    bfgs_step_cap: Option<f64>,
    bfgs_step_cap_psi: Option<f64>,
    screening_cap: Option<Arc<AtomicUsize>>,
    // `Some((n_obs, p_cols))` calibrates the outer solver to the n-scaled
    // profiled REML/LAML criterion exactly as the primary REML outer
    // (`solver/estimate.rs`) does. The profiled criterion is a sum over the n
    // observations, so its magnitude is O(n) (|f| ~ thousands at n ~ 10³) for
    // EVERY family — Gaussian, binomial, GP/kriging alike. A scale-blind outer
    // takes the bare `tolerance` (≈1e-6) as the *absolute* projected-gradient
    // floor, which is hopelessly tight against an n-scaled gradient: in-basin
    // iterates (e.g. ‖g‖≈7e-2 at |f|≈17, or single-digit ‖g‖ at |f|≈1.3e3)
    // never clear it and the fit bails at the iteration cap. Worse, ARC's
    // trust-region reduction ratios and default initial regularization are
    // referenced against the wrong curvature magnitude, so the first step can
    // overshoot and diverge (the ‖g‖≈½|f| blow-ups in #1053/#1066). Threading
    // the scale (→ absolute floor = max(tol, n·1e-9)) plus a warm ARC
    // regularization (σ₀ = 0.25) and operator trust radius (4.0) makes the
    // spatial exact-joint outer converge as robustly as the primary REML outer
    // across 1-D Matérn (#1053), 2-D binomial geo (#1066), and GP/kriging
    // (#1069). This is NOT a loosening of the `τ·(1+|f|)` REML acceptance gate
    // — that relative-to-cost criterion is unchanged; only the nonsensical
    // scale-free *absolute* floor and the solver's curvature reference are
    // corrected. `None` preserves the prior scale-free calibration.
    profiled_objective_size: Option<(usize, usize)>,
    // #1464: `true` when the fit carries a constant-curvature `curv()` term. Its
    // geodesic-exponential kernel collapses toward the constant function on the
    // +κ side, so the joint REML optimum there is a LARGE smoothing λ beyond the
    // historical ±12 ρ box. For that case the over-smoothing ρ ceiling is widened
    // to `RHO_BOUND` and the seed grid's upper bound is raised to the same
    // ceiling so the joint solve can reach that basin. No explicit high-ρ
    // over-smoothing probe is seeded (see the note at the seed-config site in
    // the body for why). `false` keeps the historical 12.0 ceiling and the
    // default seed-grid bounds for every other spatial/Matérn/Duchon/
    // sphere/survival joint fit.
    has_constant_curvature: bool,
) -> gam_solve::rho_optimizer::OuterProblem {
    // Heuristic lambdas live on the natural scale: exponentiate the leading
    // `rho_dim` (log-λ) entries and leave the rest of theta0 as given.
    let mut seed_heuristic = theta0.to_vec();
    for value in &mut seed_heuristic[..rho_dim] {
        *value = value.exp();
    }
    // Over-smoothing ρ ceiling: widened only for a constant-curvature fit (see
    // the `has_constant_curvature` param doc). Drives both the scalar saturation
    // reference and the seed-grid clamp; the actual box is the per-dim
    // `lower`/`upper` arrays passed in.
    let rho_ceiling = if has_constant_curvature {
        gam_solve::estimate::RHO_BOUND
    } else {
        12.0
    };
    let mut problem = gam_solve::rho_optimizer::OuterProblem::new(n_params)
        .with_gradient(gradient)
        .with_hessian(hessian)
        .with_prefer_gradient_only(prefer_gradient_only)
        .with_disable_fixed_point(disable_fixed_point)
        // Re-enable the automatic fallback ladder for exact joint spatial
        // problems. It was previously `Disabled` to suppress a geo-bench
        // fallback bug where HybridEFS ψ stagnation degraded silently to
        // BfgsApprox on a Charbonnier surface. With the ψ-stagnation guard
        // in OuterFixedPointBridge (`MAX_CONSECUTIVE_PSI_STAGNATION`) the
        // bridge now surfaces `EFS_FIRST_ORDER_FALLBACK_MARKER` when ψ
        // stationarity cannot be enforced, so the ladder routes correctly
        // to a joint gradient-based solver instead of grinding HybridEFS
        // for thousands of iterations.
        .with_fallback_policy(gam_solve::rho_optimizer::FallbackPolicy::Automatic)
        .with_psi_dim(auxiliary_dim)
        .with_tolerance(tolerance)
        .with_max_iter(max_iter)
        .with_bounds(lower.clone(), upper.clone())
        .with_initial_rho(theta0.clone())
        .with_bfgs_step_cap(bfgs_step_cap)
        .with_bfgs_step_cap_psi(bfgs_step_cap_psi)
        .with_seed_config({
            let mut sc = exact_joint_seed_config(risk_profile, auxiliary_dim);
            if has_constant_curvature {
                // Let the seed grid reach the widened over-smoothing ceiling so a
                // smooth whose true REML optimum genuinely lives at large λ can be
                // discovered (#1464).
                sc.bounds = (sc.bounds.0, rho_ceiling);
                // gam#1464: do NOT inject an explicit over-smoothing probe at
                // ρ ≈ +15 for constant-curvature terms. The probe seeds the joint
                // [ρ, ψ] solve at the collapsed-kernel corner where the geodesic
                // exponential exp(−d_κ/L) degenerates to a near-constant. There the
                // criterion is flat in κ (the kernel no longer resolves curvature)
                // and reduces to the monotone log-det Occam term, so keep-best
                // adopts the low-Occam collapsed null regardless of the true κ sign
                // — the bit-identical κ̂ → +chart-bound rail for both ±κ datasets
                // (the headline #1464 sign-blindness). The κ-sign basin is instead
                // seeded from the sign-correct fixed-κ profiled-REML scan
                // (`select_constant_curvature_kappa_sign_seed`, applied to
                // `log_kappa0` above), which routes through the same κ-opt-OFF
                // profiled fit the `curvature_inference_forspec` CI oracle trusts,
                // so the joint solve starts inside the correct sign basin with a
                // non-degenerate (κ-resolving) kernel rather than at the collapsed
                // corner. The widened ρ ceiling is retained: legitimate
                // over-smoothing is still reachable via the gradient solve and the
                // sweep grid, just not pre-pinned to the collapse point.
            }
            sc
        })
        .with_rho_bound(rho_ceiling)
        .with_heuristic_lambdas(seed_heuristic);
    if let Some((n_obs, p_cols)) = profiled_objective_size {
        // Calibrate to the n-scaled profiled criterion (see the param doc):
        // n-aware objective scale → sane absolute gradient floor + correct ARC
        // reduction-ratio reference, plus a warm ARC regularization / operator
        // trust radius that prevents the first-step overshoot. These are the
        // knobs the spatial exact-joint path was missing relative to the
        // primary REML outer; without them the iso-κ length-scale fit stalls or
        // diverges as |f| grows with n (#1053 / #1066 / #1069).
        problem = problem
            .with_objective_scale(Some(n_obs as f64))
            .with_problem_size(n_obs, p_cols)
            .with_arc_initial_regularization(Some(0.25))
            .with_operator_initial_trust_radius(Some(4.0));
    }
    if let Some(screening_cap) = screening_cap {
        // A screening cap also turns on screening of the initial point.
        problem = problem
            .with_screening_cap(screening_cap)
            .with_screen_initial_rho(true);
    }
    problem
}
6699
/// Classifies a κ-phase (`n-block exact-joint spatial`) optimizer error message
/// as either recoverable through the fixed-κ fallback (gam#787/#860) or as a
/// failure that has to propagate to the caller.
///
/// The decision is a plain substring test on `message`. It returns `true` when
/// the text contains any of:
///
/// * `"no candidate seeds passed outer startup validation"`
/// * `"joint hyper rho dimension mismatch"`
/// * `"objective returned a non-finite cost"`
///
/// Rationale: the structural identifiability audits have already passed
/// upstream by the time the κ optimizer runs, so these messages indicate a
/// numerical pathology of the length-scale search (a κ-driven design rebuild
/// flipping the penalty topology and starving startup validation) that fitting
/// at the bootstrap κ can recover from. Every other message — e.g. a genuine
/// solver contract violation — yields `false`.
fn kappa_phase_failure_is_fixed_kappa_recoverable(message: &str) -> bool {
    // Message fragments that mark a numerically recoverable κ-phase failure.
    const RECOVERABLE_MARKERS: [&str; 3] = [
        "no candidate seeds passed outer startup validation",
        "joint hyper rho dimension mismatch",
        "objective returned a non-finite cost",
    ];
    RECOVERABLE_MARKERS
        .iter()
        .any(|&marker| message.contains(marker))
}
6715
6716pub fn optimize_spatial_length_scale_exact_joint<FitOut, FitFn, ExactFn, ExactEfsFn, SeedFn>(
6717    data: ArrayView2<'_, f64>,
6718    block_specs: &[TermCollectionSpec],
6719    block_term_indices: &[Vec<usize>],
6720    kappa_options: &SpatialLengthScaleOptimizationOptions,
6721    joint_setup: &ExactJointHyperSetup,
6722    seed_risk_profile: gam_problem::SeedRiskProfile,
6723    analytic_joint_gradient_available: bool,
6724    analytic_joint_hessian_available: bool,
6725    disable_fixed_point: bool,
6726    screening_cap: Option<Arc<AtomicUsize>>,
6727    outer_derivative_policy: gam_model_api::families::custom_family::OuterDerivativePolicy,
6728    mut fit_fn: FitFn,
6729    mut exact_fn: ExactFn,
6730    mut exact_efs_fn: ExactEfsFn,
6731    mut seed_inner_beta_fn: SeedFn,
6732) -> Result<SpatialLengthScaleOptimizationResult<FitOut>, String>
6733where
6734    FitOut: Clone,
6735    FitFn: FnMut(
6736        &Array1<f64>,
6737        &[TermCollectionSpec],
6738        &[TermCollectionDesign],
6739    ) -> Result<FitOut, String>,
6740    ExactFn: FnMut(
6741        &Array1<f64>,
6742        &[TermCollectionSpec],
6743        &[TermCollectionDesign],
6744        gam_solve::estimate::reml::reml_outer_engine::EvalMode,
6745        &gam_problem::outer_subsample::RowSet,
6746    ) -> Result<
6747        (
6748            f64,
6749            Array1<f64>,
6750            gam_problem::HessianResult,
6751        ),
6752        String,
6753    >,
6754    ExactEfsFn: FnMut(
6755        &Array1<f64>,
6756        &[TermCollectionSpec],
6757        &[TermCollectionDesign],
6758    ) -> Result<gam_problem::EfsEval, String>,
6759    SeedFn:
6760        FnMut(&Array1<f64>) -> Result<gam_solve::rho_optimizer::SeedOutcome, EstimationError>,
6761{
6762    let n_blocks = block_specs.len();
6763    if block_term_indices.len() != n_blocks {
6764        return Err(SmoothError::dimension_mismatch(format!(
6765            "block_specs ({}) and block_term_indices ({}) length mismatch",
6766            n_blocks,
6767            block_term_indices.len()
6768        ))
6769        .into());
6770    }
6771
6772    let log_kappa_dim = joint_setup.log_kappa_dim();
6773
6774    log::warn!(
6775        "[OUTER-FD-AUDIT/spatial-exact-joint] driver entry: aux_dim={} log_kappa_dim={} kappa_enabled={} rho_dim={} theta0_len={}",
6776        joint_setup.auxiliary_dim(),
6777        log_kappa_dim,
6778        kappa_options.enabled,
6779        joint_setup.rho_dim(),
6780        joint_setup.theta0().len()
6781    );
6782
6783    // -----------------------------------------------------------------------
6784    // Fast path: kappa disabled or no spatial terms — build designs once.
6785    // -----------------------------------------------------------------------
6786    if joint_setup.auxiliary_dim() == 0 && (!kappa_options.enabled || log_kappa_dim == 0) {
6787        log::warn!(
6788            "[OUTER-FD-AUDIT/spatial-exact-joint] taking FAST path (no outer theta optimization in this driver)"
6789        );
6790        let (designs, resolved_specs) = build_term_collection_designs_and_freeze_joint(
6791            data, block_specs,
6792        )
6793        .map_err(|e| {
6794            format!("failed to build and freeze joint block designs during exact joint kappa optimization: {e}")
6795        })?;
6796        let theta0 = joint_setup.theta0();
6797
6798        // Build temporary owned slices for the closure call.
6799        let spec_refs: Vec<TermCollectionSpec> = resolved_specs.clone();
6800        let design_refs: Vec<TermCollectionDesign> = designs.clone();
6801        let fit = fit_fn(&theta0, &spec_refs, &design_refs)?;
6802        return Ok(SpatialLengthScaleOptimizationResult {
6803            resolved_specs,
6804            designs,
6805            fit,
6806            timing: None,
6807        });
6808    }
6809
6810    // -----------------------------------------------------------------------
6811    // Full optimization path.
6812    // -----------------------------------------------------------------------
6813    let theta0 = joint_setup.theta0();
6814    let lower = joint_setup.lower();
6815    let upper = joint_setup.upper();
6816    if theta0.len() < log_kappa_dim || lower.len() != theta0.len() || upper.len() != theta0.len() {
6817        return Err(SmoothError::dimension_mismatch(format!(
6818            "invalid exact joint theta setup: theta0={}, lower={}, upper={}, required_log_kappa_dim={}",
6819            theta0.len(),
6820            lower.len(),
6821            upper.len(),
6822            log_kappa_dim
6823        ))
6824        .into());
6825    }
6826    let rho_dim = joint_setup.rho_dim();
6827    let all_dims = joint_setup.log_kappa_dims_per_term();
6828
6829    // Build bootstrap designs and frozen specs for each block.
6830    let (boot_designs, best_specs) = build_term_collection_designs_and_freeze_joint(
6831        data,
6832        block_specs,
6833    )
6834    .map_err(|e| {
6835        format!(
6836            "failed to build and freeze joint block designs during exact joint kappa bootstrap: {e}"
6837        )
6838    })?;
6839    // Capability vs realized policy: the family may *advertise* an exact
6840    // analytic outer Hessian, but at this realized (n, psi_dim, rho_dim,
6841    // p_total) the predicted per-eval cost can still exceed the universal
6842    // outer-Hessian work budget. In that regime we route the outer optimizer
6843    // through gradient-only BFGS / L-BFGS, which is **convergent** to the
6844    // exact MLE — it just takes more line-search iterations. This is **not**
6845    // a feature drop: quasi-Newton picks up curvature from successive
6846    // analytic gradients, and the per-eval cost saving (`O(p)` instead of
6847    // `O(p²)`) more than pays for the iteration overhead at large scale.
6848    let policy_hessian_form = outer_derivative_policy.declared_hessian_form();
6849    let analytic_outer_hessian_available = analytic_joint_hessian_available
6850        && matches!(
6851            policy_hessian_form,
6852            gam_problem::DeclaredHessianForm::Either
6853                | gam_problem::DeclaredHessianForm::Dense
6854                | gam_problem::DeclaredHessianForm::Operator { .. }
6855        );
6856    let prefer_gradient_only = !analytic_outer_hessian_available;
6857
6858    let theta_dim = theta0.len();
6859    let psi_dim = theta_dim - rho_dim;
6860
6861    // Build the cache with one realizer per block.
6862    let cache_blocks: Vec<(TermCollectionSpec, TermCollectionDesign, Vec<usize>)> = best_specs
6863        .iter()
6864        .zip(boot_designs.iter())
6865        .zip(block_term_indices.iter())
6866        .map(|((spec, design), terms)| (spec.clone(), design.clone(), terms.clone()))
6867        .collect();
6868
6869    struct NBlockExactJointState<'d> {
6870        cache: ExactJointDesignCache<'d>,
6871    }
6872
6873    let mut state = NBlockExactJointState {
6874        cache: ExactJointDesignCache::new(data, cache_blocks, rho_dim, all_dims.clone())?,
6875    };
6876
6877    // ── P7: staged-κ schedule ────────────────────────────────────────────
6878    //
6879    // The κ MLE for a stationary spatial process is asymptotically
6880    // *invariant* in `n` once `n` is past the Monte-Carlo resolution of
6881    // the cell-moment kernel. At large scale (`n ≥ STAGED_KAPPA_*`) the
6882    // Monte-Carlo error of a `K = 5_000`-row pilot is ≪ the κ posterior
6883    // width, so estimating θ on a stratified `K`-row pilot returns
6884    // statistically the *same* estimate as the full-data fit at a
6885    // fraction of the wall-clock cost. We then do one Gauss-Newton-style
6886    // polish at `K_polish` to absorb residual Monte-Carlo error before
6887    // the final coefficient fit at the polished θ on the full data.
6888    //
6889    // This is **not a heuristic shortcut**. It is the textbook
6890    // pilot-then-refine schedule for stationary-process likelihoods,
6891    // chosen here because the per-eval cost of the κ gradient grows
6892    // linearly in `n` and the pilot subsample reduces that cost by a
6893    // factor of `n / K`. The final coefficient fit at θ̂_polished on the
6894    // full data preserves estimation accuracy for β.
6895    //
6896    // At `n < STAGED_KAPPA_TRIGGER_N` the schedule collapses to one
6897    // full-data stage — identical to the pre-P7 behaviour.
6898    // Note: the n≥30_000 pilot trigger lives in
6899    // `outer_derivative_policy.should_use_staged_kappa(n_total)`; this fn
6900    // only carries the constants it consumes directly.
6901    const KAPPA_PILOT_K: usize = 5_000;
6902    const KAPPA_POLISH_K: usize = 25_000;
6903    const KAPPA_POLISH_TRIGGER_N: usize = 100_000;
6904
6905    let n_total = data.nrows();
6906    let use_staged_kappa = outer_derivative_policy.should_use_staged_kappa(n_total);
6907    if use_staged_kappa {
6908        log::info!(
6909            "[KAPPA-STAGED] auto-engaging pilot+polish schedule: n={} pilot_k={} polish_k={}",
6910            n_total,
6911            KAPPA_PILOT_K,
6912            KAPPA_POLISH_K,
6913        );
6914    }
6915
6916    // Build the initial row mask for the κ optimization.
6917    //
6918    // * `use_staged_kappa = false`: full data (`RowSet::All`). The
6919    //   schedule collapses to the historical single-stage path.
6920    // * `use_staged_kappa = true`: deterministic uniform pilot of size
6921    //   `min(KAPPA_PILOT_K, n_total)`, wrapped as a `RowSet::Subsample`
6922    //   with per-row HT weight `n_total / k_pilot`. The uniform pick is
6923    //   a valid unbiased estimator on its own; the stratified
6924    //   per-decile picker
6925    //   (`marginal_slope_shared::auto_outer_score_subsample`) requires
6926    //   the response vector `z`, which only the family evaluator can
6927    //   produce. **Agent C replaces this with the stratified pick once
6928    //   `exact_fn` exposes the per-row score.**
6929    //
6930    // Sampling RNG is seeded from `n_total` so the pilot is
6931    // deterministic across reruns at fixed `n`.
6932    fn build_uniform_pilot_subsample(
6933        n_total: usize,
6934        k_target: usize,
6935        seed: u64,
6936    ) -> gam_problem::outer_subsample::OuterScoreSubsample {
6937        use gam_problem::outer_subsample::OuterScoreSubsample;
6938        let k = k_target.min(n_total);
6939        if k == 0 || n_total == 0 {
6940            return OuterScoreSubsample::from_uniform_inclusion_mask(Vec::new(), n_total, seed);
6941        }
6942        // Reservoir-free deterministic pick: linear congruential walk
6943        // over a shuffled index set; for the pilot, a fast Floyd-style
6944        // sample is sufficient.
6945        let mut mask: Vec<usize> = Vec::with_capacity(k);
6946        // Splitmix64-driven Floyd's sampler.
6947        let mut state = seed.wrapping_add(0x9E3779B97F4A7C15);
6948        let splitmix = |s: &mut u64| -> u64 { gam_linalg::utils::splitmix64(s) };
6949        let mut taken = std::collections::HashSet::with_capacity(k);
6950        for j in (n_total - k)..n_total {
6951            let r = (splitmix(&mut state) % (j as u64 + 1)) as usize;
6952            if !taken.insert(r) {
6953                taken.insert(j);
6954                mask.push(j);
6955            } else {
6956                mask.push(r);
6957            }
6958        }
6959        mask.sort_unstable();
6960        mask.dedup();
6961        OuterScoreSubsample::from_uniform_inclusion_mask(mask, n_total, seed)
6962    }
6963
6964    let current_row_set: std::cell::RefCell<gam_problem::outer_subsample::RowSet> = if use_staged_kappa {
6965        let pilot = build_uniform_pilot_subsample(n_total, KAPPA_PILOT_K, n_total as u64);
6966        std::cell::RefCell::new(gam_problem::outer_subsample::RowSet::Subsample {
6967            rows: std::sync::Arc::clone(&pilot.rows),
6968            n_full: n_total,
6969        })
6970    } else {
6971        std::cell::RefCell::new(gam_problem::outer_subsample::RowSet::All)
6972    };
6973
6974    let exact_fn_cell = std::cell::RefCell::new(&mut exact_fn);
6975    let exact_efs_fn_cell = std::cell::RefCell::new(&mut exact_efs_fn);
6976
6977    // ── κ-optimization scaling instrumentation ──
6978    //
6979    // Per-phase wall-clock counters for the three kinds of evaluator
6980    // invocation the κ outer drives: cost-only line-search probes,
6981    // value-and-gradient(/Hessian) evaluations at accepted iterates, and
6982    // EFS fixed-point evaluations. Each invocation emits one
6983    // `[KAPPA-PHASE]` log line with a per-call elapsed time, plus the
6984    // running call counter and a summary `theta_norm` /
6985    // `log_kappa_norm` so the bench runner can attribute cost to
6986    // particular trajectory regions. A single `[KAPPA-PHASE-SUMMARY]`
6987    // line is emitted on optimization exit. Grepping these is the
6988    // production-fit κ-scaling probe (task #32) — measurement happens
6989    // in real large-scale fits rather than a synthetic harness, so the
6990    // scaling law reflects the actual workload.
6991    use std::cell::Cell;
6992    let kphase_cost_calls: Cell<usize> = Cell::new(0);
6993    let kphase_cost_total_s: Cell<f64> = Cell::new(0.0);
6994    let kphase_eval_calls: Cell<usize> = Cell::new(0);
6995    let kphase_eval_total_s: Cell<f64> = Cell::new(0.0);
6996    let kphase_efs_calls: Cell<usize> = Cell::new(0);
6997    let kphase_efs_total_s: Cell<f64> = Cell::new(0.0);
6998    let kphase_optim_start = std::time::Instant::now();
6999    let kphase_log_kappa_dim = log_kappa_dim;
7000    let kphase_log_norms = |theta: &Array1<f64>| -> (f64, f64) {
7001        let theta_norm = theta.iter().map(|v| v * v).sum::<f64>().sqrt();
7002        let log_kappa_norm = if kphase_log_kappa_dim > 0 && theta.len() >= kphase_log_kappa_dim {
7003            let start = theta.len() - kphase_log_kappa_dim;
7004            theta.iter().skip(start).map(|v| v * v).sum::<f64>().sqrt()
7005        } else {
7006            0.0
7007        };
7008        (theta_norm, log_kappa_norm)
7009    };
7010
7011    use gam_solve::rho_optimizer::OuterEvalOrder;
7012    use gam_problem::{DeclaredHessianForm, Derivative, OuterEval};
7013
7014    // Joint design width across blocks → the `p` reported to the outer solver's
7015    // operator-vs-dense Hessian crossover. `n_total` is the load-bearing
7016    // profiled-objective scale (see `exact_joint_multistart_outer_problem`).
7017    let joint_p_cols: usize = boot_designs
7018        .iter()
7019        .map(|d| d.design.ncols())
7020        .sum::<usize>()
7021        .max(1);
7022
7023    let problem = exact_joint_multistart_outer_problem(
7024        &theta0,
7025        &lower,
7026        &upper,
7027        rho_dim,
7028        psi_dim,
7029        theta_dim,
7030        if analytic_joint_gradient_available {
7031            Derivative::Analytic
7032        } else {
7033            Derivative::Unavailable
7034        },
7035        if analytic_outer_hessian_available {
7036            DeclaredHessianForm::Either
7037        } else {
7038            DeclaredHessianForm::Unavailable
7039        },
7040        prefer_gradient_only,
7041        disable_fixed_point,
7042        seed_risk_profile,
7043        kappa_options.rel_tol.max(1e-6),
7044        kappa_options.max_outer_iter.max(1),
7045        // Rho-axis cap: log-λ natural step ≈ 5.
7046        Some(5.0),
7047        // Psi-axis cap: kappa scale needs ~ln 2 per iter.
7048        Some(kappa_options.log_step.clamp(0.25, 1.0)),
7049        screening_cap.clone(),
7050        // n-scaled profiled-criterion calibration for every family (#1053 /
7051        // #1066 / #1069 iso-κ non-convergence cure).
7052        Some((n_total, joint_p_cols)),
7053        // #1464: widen the over-smoothing ρ ceiling when any block carries a
7054        // constant-curvature term (no explicit high-λ probe is seeded).
7055        block_specs
7056            .iter()
7057            .any(|s| !constant_curvature_term_indices(s).is_empty()),
7058    );
7059
7060    // Helper: collect specs and designs from cache into owned Vecs for closure calls.
7061    fn collect_specs(cache: &ExactJointDesignCache<'_>) -> Vec<TermCollectionSpec> {
7062        cache.specs().into_iter().cloned().collect()
7063    }
7064    fn collect_designs(cache: &ExactJointDesignCache<'_>) -> Vec<TermCollectionDesign> {
7065        cache.designs().into_iter().cloned().collect()
7066    }
7067
7068    let result = {
7069        let eval_outer = |ctx: &mut &mut NBlockExactJointState<'_>,
7070                          theta: &Array1<f64>,
7071                          order: OuterEvalOrder|
7072         -> Result<OuterEval, EstimationError> {
7073            if let Some((cost, grad, hess)) = ctx.cache.memoized_eval(theta) {
7074                let cached_satisfies_order = match order {
7075                    OuterEvalOrder::Value => true,
7076                    OuterEvalOrder::ValueAndGradient => true,
7077                    OuterEvalOrder::ValueGradientHessian => hess.is_analytic(),
7078                };
7079                if cached_satisfies_order {
7080                    if !cost.is_finite() {
7081                        return Ok(OuterEval::infeasible(theta.len()));
7082                    }
7083                    // Symmetric with the non-finite-cost guard above: a non-finite
7084                    // gradient marks this θ as infeasible just as a non-finite cost
7085                    // does (e.g. degenerate tied / zero-gap survival times drive the
7086                    // analytic exact-joint gradient channel to NaN/Inf). Return the
7087                    // bounded infeasible sentinel so the outer optimizer rejects the
7088                    // step and shrinks its trust region — instead of hard-failing the
7089                    // entire REML fit and handing the driver an unbroken stream of
7090                    // objective failures whose recovery path deepens once per outer
7091                    // step until the worker stack overflows (the survival
7092                    // location-scale path is the one that routes through this analytic
7093                    // gradient, which is why it crashed where the cost-only paths only
7094                    // stall).
7095                    if grad.iter().any(|v| !v.is_finite()) {
7096                        return Ok(OuterEval::infeasible(theta.len()));
7097                    }
7098                    return Ok(OuterEval {
7099                        cost,
7100                        gradient: grad,
7101                        hessian: hess,
7102                        inner_beta_hint: None,
7103                    });
7104                }
7105            }
7106            // Wall-clock budget guard for the outer length-scale search. The
7107            // inner joint-Newton (its `cycle > 0` break) and the seed-screening
7108            // cascade already abandon work once the armed deadline passes, but
7109            // the κ optimizer that DRIVES those inner solves had no such guard:
7110            // every fresh trial θ still paid a full cycle-0 constrained-Newton
7111            // setup (which never certifies on the monotonicity-pinned baseline),
7112            // and the line search kept proposing new probes, so the total fit
7113            // wall-clock was (#outer evals × cycle-0 cost) — unbounded by the
7114            // budget even though both lower levels honored it. Once the deadline
7115            // is spent, refuse to launch any NEW inner solve: serve only the
7116            // already-cached evaluations (handled above, so the best accepted
7117            // iterate is still returned) and mark every uncached trial θ as the
7118            // bounded-infeasible sentinel the optimizer already knows how to
7119            // reject. The line search then backtracks to its accepted iterate in
7120            // O(1) per probe and the driver returns the best-so-far fit. The
7121            // guard is a no-op when no deadline is armed.
7122            if gam_solve::rho_optimizer::outer_wall_clock_deadline_exceeded() {
7123                return Ok(OuterEval::infeasible(theta.len()));
7124            }
7125            if let Err(err) = ctx.cache.ensure_theta(theta) {
7126                log::warn!(
7127                    "[OUTER] n-block exact-joint spatial: ensure_theta failed during gradient evaluation: {err}"
7128                );
7129                return Ok(OuterEval::infeasible(theta.len()));
7130            }
7131            let design_revision = Some(ctx.cache.design_revision());
7132            let specs = collect_specs(&ctx.cache);
7133            let designs = collect_designs(&ctx.cache);
7134            // Clamp the requested order against the realized outer
7135            // derivative policy. The capability-aware
7136            // `analytic_outer_hessian_available` already encodes the
7137            // policy gate; re-checking through `order_for_evaluation`
7138            // here keeps the per-eval branch in lockstep with the
7139            // top-of-function declaration so the optimizer and the
7140            // evaluator never disagree on what was requested.
7141            let clamped = outer_derivative_policy.order_for_evaluation(order);
7142            let need_hessian = matches!(clamped, OuterEvalOrder::ValueGradientHessian)
7143                && analytic_outer_hessian_available;
7144            let eval_mode = if need_hessian {
7145                gam_solve::estimate::reml::reml_outer_engine::EvalMode::ValueGradientHessian
7146            } else {
7147                gam_solve::estimate::reml::reml_outer_engine::EvalMode::ValueAndGradient
7148            };
7149            let t0 = std::time::Instant::now();
7150            let result = {
7151                let row_set_borrow = current_row_set.borrow();
7152                (*exact_fn_cell.borrow_mut())(theta, &specs, &designs, eval_mode, &row_set_borrow)
7153            };
7154            let elapsed_s = t0.elapsed().as_secs_f64();
7155            kphase_eval_calls.set(kphase_eval_calls.get() + 1);
7156            kphase_eval_total_s.set(kphase_eval_total_s.get() + elapsed_s);
7157            let (theta_norm, log_kappa_norm) = kphase_log_norms(theta);
7158            log::info!(
7159                "[KAPPA-PHASE] phase=eval_outer call={} order={:?} design_revision={:?} theta_norm={:.4e} log_kappa_norm={:.4e} elapsed_s={:.4}",
7160                kphase_eval_calls.get(),
7161                order,
7162                design_revision,
7163                theta_norm,
7164                log_kappa_norm,
7165                elapsed_s,
7166            );
7167            match result {
7168                Ok((cost, grad, hess)) => {
7169                    ctx.cache.store_eval((cost, grad.clone(), hess.clone()));
7170                    if !cost.is_finite() {
7171                        return Ok(OuterEval::infeasible(theta.len()));
7172                    }
7173                    // Symmetric with the non-finite-cost guard above: a non-finite
7174                    // gradient marks this θ as infeasible just as a non-finite cost
7175                    // does (e.g. degenerate tied / zero-gap survival times drive the
7176                    // analytic exact-joint gradient channel to NaN/Inf). Return the
7177                    // bounded infeasible sentinel so the outer optimizer rejects the
7178                    // step and shrinks its trust region — instead of hard-failing the
7179                    // entire REML fit and handing the driver an unbroken stream of
7180                    // objective failures whose recovery path deepens once per outer
7181                    // step until the worker stack overflows (the survival
7182                    // location-scale path is the one that routes through this analytic
7183                    // gradient, which is why it crashed where the cost-only paths only
7184                    // stall).
7185                    if grad.iter().any(|v| !v.is_finite()) {
7186                        return Ok(OuterEval::infeasible(theta.len()));
7187                    }
7188                    Ok(OuterEval {
7189                        cost,
7190                        gradient: grad,
7191                        hessian: hess,
7192                        inner_beta_hint: None,
7193                    })
7194                }
7195                Err(err) => {
7196                    log::warn!(
7197                        "[OUTER] n-block exact-joint spatial: exact evaluation failed: {err}"
7198                    );
7199                    Ok(OuterEval::infeasible(theta.len()))
7200                }
7201            }
7202        };
7203
7204        let obj = problem.build_objective_with_eval_order(
7205            &mut state,
7206            |ctx: &mut &mut NBlockExactJointState<'_>, theta: &Array1<f64>| {
7207                if let Some(cost) = ctx.cache.memoized_cost(theta) {
7208                    return Ok(cost);
7209                }
7210                // Wall-clock budget guard (cost-only line-search probe). See the
7211                // sibling guard in `eval_outer`: once the armed outer deadline is
7212                // spent, refuse to start a new inner solve for an uncached trial
7213                // θ and return the +∞ infeasible cost the line search already
7214                // treats as a rejected step, so the search collapses to its best
7215                // accepted iterate in bounded time instead of paying a full
7216                // cycle-0 inner setup per probe. No-op when no deadline is armed.
7217                if gam_solve::rho_optimizer::outer_wall_clock_deadline_exceeded() {
7218                    return Ok(f64::INFINITY);
7219                }
7220                if let Err(err) = ctx.cache.ensure_theta(theta) {
7221                    log::warn!(
7222                        "[OUTER] n-block exact-joint spatial: ensure_theta failed during cost evaluation: {err}"
7223                    );
7224                    return Ok(f64::INFINITY);
7225                }
7226                let design_revision = Some(ctx.cache.design_revision());
7227                let specs = collect_specs(&ctx.cache);
7228                let designs = collect_designs(&ctx.cache);
7229                // Cost-only line-search probe: pass `ValueOnly` so the closure
7230                // skips gradient and Hessian assembly. This is the principled
7231                // fix for the N-block joint optimization V+G-per-probe waste —
7232                // gradient construction (≈ 6.5·10⁹ FLOPs per CTN step at
7233                // n=320 000, n_grid=293, p_resp=32, p_cov=23) is now paid only
7234                // when the outer evaluator actually requests it.
7235                let t0 = std::time::Instant::now();
7236                let result = {
7237                    let row_set_borrow = current_row_set.borrow();
7238                    (*exact_fn_cell.borrow_mut())(
7239                        theta,
7240                        &specs,
7241                        &designs,
7242                        gam_solve::estimate::reml::reml_outer_engine::EvalMode::ValueOnly,
7243                        &row_set_borrow,
7244                    )
7245                };
7246                let elapsed_s = t0.elapsed().as_secs_f64();
7247                kphase_cost_calls.set(kphase_cost_calls.get() + 1);
7248                kphase_cost_total_s.set(kphase_cost_total_s.get() + elapsed_s);
7249                let (theta_norm, log_kappa_norm) = kphase_log_norms(theta);
7250                log::info!(
7251                    "[KAPPA-PHASE] phase=cost call={} design_revision={:?} theta_norm={:.4e} log_kappa_norm={:.4e} elapsed_s={:.4}",
7252                    kphase_cost_calls.get(),
7253                    design_revision,
7254                    theta_norm,
7255                    log_kappa_norm,
7256                    elapsed_s,
7257                );
7258                match result {
7259                    Ok((cost, _grad, _hess)) => {
7260                        // Don't `store_eval`: that path is only valid when the
7261                        // closure produced a real gradient. The next outer-eval
7262                        // call will recompute (V, ∇V) at this θ if needed; the
7263                        // memoized_cost path covers the common case where the
7264                        // line search returns to an accepted iterate.
7265                        ctx.cache.store_cost_only(theta, cost);
7266                        Ok(cost)
7267                    }
7268                    Err(err) => {
7269                        log::warn!(
7270                            "[OUTER] n-block exact-joint spatial: exact cost evaluation failed: {err}"
7271                        );
7272                        Ok(f64::INFINITY)
7273                    }
7274                }
7275            },
7276            |ctx: &mut &mut NBlockExactJointState<'_>, theta: &Array1<f64>| {
7277                eval_outer(
7278                    ctx,
7279                    theta,
7280                    if analytic_outer_hessian_available {
7281                        OuterEvalOrder::ValueGradientHessian
7282                    } else {
7283                        OuterEvalOrder::ValueAndGradient
7284                    },
7285                )
7286            },
7287            |ctx: &mut &mut NBlockExactJointState<'_>,
7288             theta: &Array1<f64>,
7289             order: OuterEvalOrder| { eval_outer(ctx, theta, order) },
7290            None::<fn(&mut &mut NBlockExactJointState<'_>)>,
7291            Some(
7292                |ctx: &mut &mut NBlockExactJointState<'_>, theta: &Array1<f64>| {
7293                    ctx.cache
7294                        .ensure_theta(theta)
7295                        .map_err(EstimationError::InvalidInput)?;
7296                    let design_revision = Some(ctx.cache.design_revision());
7297                    let specs = collect_specs(&ctx.cache);
7298                    let designs = collect_designs(&ctx.cache);
7299                    let t0 = std::time::Instant::now();
7300                    let eval_result = (*exact_efs_fn_cell.borrow_mut())(
7301                        theta,
7302                        &specs,
7303                        &designs,
7304                    );
7305                    let elapsed_s = t0.elapsed().as_secs_f64();
7306                    kphase_efs_calls.set(kphase_efs_calls.get() + 1);
7307                    kphase_efs_total_s.set(kphase_efs_total_s.get() + elapsed_s);
7308                    let (theta_norm, log_kappa_norm) = kphase_log_norms(theta);
7309                    log::info!(
7310                        "[KAPPA-PHASE] phase=efs call={} design_revision={:?} theta_norm={:.4e} log_kappa_norm={:.4e} elapsed_s={:.4}",
7311                        kphase_efs_calls.get(),
7312                        design_revision,
7313                        theta_norm,
7314                        log_kappa_norm,
7315                        elapsed_s,
7316                    );
7317                    let eval = eval_result.map_err(EstimationError::RemlOptimizationFailed)?;
7318                    Ok(eval)
7319                },
7320            ),
7321        );
7322        let mut obj = obj.with_seed_inner_state(
7323            move |_ctx: &mut &mut NBlockExactJointState<'_>, beta: &Array1<f64>| {
7324                (seed_inner_beta_fn)(beta)
7325            },
7326        );
7327
7328        match problem.run(&mut obj, "n-block exact-joint spatial") {
7329            Ok(result) => result,
7330            Err(e) => {
7331                let message = e.to_string();
7332                // Kappa-phase graceful degradation (gam#787/#860). The
7333                // length-scale (κ) optimizer rebuilds the spatial design at each
7334                // trial κ; a κ-driven matern penalty-topology flip (the
7335                // FrozenTransform spectral-tolerance crossing in
7336                // `build_nullspace_shrinkage_penalty`) can make the rebuilt
7337                // design's learned-penalty count disagree with the frozen
7338                // joint-setup ρ dimension, so EVERY κ seed fails startup
7339                // validation ("joint hyper rho dimension mismatch" → all seeds
7340                // rejected → "no candidate seeds passed outer startup
7341                // validation"). That is a NUMERICAL pathology of the κ search on
7342                // a structurally-well-posed design (the structural audits already
7343                // passed upstream) — NOT a reason to fail the whole fit. Fall
7344                // back to a FIXED κ (the bootstrap length-scale, skipping κ
7345                // optimization): build + freeze the joint designs at the initial
7346                // κ and fit there. We lose κ tuning but return a REAL, valid
7347                // model — graceful degradation, exactly mirroring the
7348                // `kappa_options.enabled == false` fixed-κ path above. Only the
7349                // startup-validation / mismatch class is caught; any other κ
7350                // optimizer error still propagates.
7351                if kappa_phase_failure_is_fixed_kappa_recoverable(&message) {
7352                    drop(obj);
7353                    log::warn!(
7354                        "[KAPPA-PHASE] length-scale optimization could not validate any seed \
7355                         ({message}); falling back to a FIXED bootstrap κ (skipping κ \
7356                         optimization) and fitting there — a real model at the initial \
7357                         length-scale rather than raising (gam#787/#860)."
7358                    );
7359                    let (designs, resolved_specs) =
7360                        build_term_collection_designs_and_freeze_joint(data, block_specs).map_err(
7361                            |build_err| {
7362                                format!(
7363                                    "fixed-κ fallback failed to build and freeze joint block \
7364                                     designs after κ optimization could not validate a seed \
7365                                     ({message}): {build_err}"
7366                                )
7367                            },
7368                        )?;
7369                    let fixed_theta0 = joint_setup.theta0();
7370                    let spec_refs: Vec<TermCollectionSpec> = resolved_specs.clone();
7371                    let design_refs: Vec<TermCollectionDesign> = designs.clone();
7372                    let fit = fit_fn(&fixed_theta0, &spec_refs, &design_refs)?;
7373                    return Ok(SpatialLengthScaleOptimizationResult {
7374                        resolved_specs,
7375                        designs,
7376                        fit,
7377                        timing: None,
7378                    });
7379                }
7380                return Err(message);
7381            }
7382        }
7383    }; // obj dropped here, releasing mutable borrow on state
7384
7385    // ── κ-optimization scaling summary ──
7386    //
7387    // Single line summarizing all per-call wall-clock counters
7388    // accumulated above. The bench runner / scaling-law analyzer
7389    // can pivot on this directly without parsing the per-call
7390    // [KAPPA-PHASE] markers (which remain available for
7391    // attribution).
7392    let kphase_total_s = kphase_optim_start.elapsed().as_secs_f64();
7393    log::info!(
7394        "[KAPPA-PHASE-SUMMARY] log_kappa_dim={} n_cost={} cost_total_s={:.4} n_eval={} eval_total_s={:.4} n_efs={} efs_total_s={:.4} optim_total_s={:.4}",
7395        kphase_log_kappa_dim,
7396        kphase_cost_calls.get(),
7397        kphase_cost_total_s.get(),
7398        kphase_eval_calls.get(),
7399        kphase_eval_total_s.get(),
7400        kphase_efs_calls.get(),
7401        kphase_efs_total_s.get(),
7402        kphase_total_s,
7403    );
7404    let timing = SpatialLengthScaleOptimizationTiming {
7405        log_kappa_dim: kphase_log_kappa_dim,
7406        cost_calls: kphase_cost_calls.get(),
7407        cost_total_s: kphase_cost_total_s.get(),
7408        eval_calls: kphase_eval_calls.get(),
7409        eval_total_s: kphase_eval_total_s.get(),
7410        efs_calls: kphase_efs_calls.get(),
7411        efs_total_s: kphase_efs_total_s.get(),
7412        slow_path_resets: 0,
7413        design_revision_delta: 0,
7414        nfree_miss_shape: 0,
7415        nfree_miss_value: 0,
7416        nfree_miss_gradient: 0,
7417        nfree_miss_penalty: 0,
7418        nfree_miss_revision: 0,
7419        nfree_miss_second_order: 0,
7420        nfree_miss_other: 0,
7421        optim_total_s: kphase_total_s,
7422    };
7423
7424    let theta_star = result.rho;
7425
7426    // ── P7 stage rotation ────────────────────────────────────────────────
7427    //
7428    // The optimization above ran against `current_row_set` — the pilot
7429    // subsample under `use_staged_kappa`, otherwise the full data. We
7430    // now:
7431    //
7432    // 1. If `n_total ≥ KAPPA_POLISH_TRIGGER_N`, rotate to a larger
7433    //    polish subsample and request a single value+gradient evaluation
7434    //    at `theta_star` so the family caches its polished score. This
7435    //    is the Gauss-Newton-style polish in the schedule — one step
7436    //    rather than a full re-run because the pilot has already
7437    //    consumed most of the curvature information.
7438    //
7439    // 2. Always rotate back to `RowSet::All` before the final
7440    //    coefficient fit `fit_fn(theta_star)`. The final β estimate at
7441    //    θ̂ uses the full data so no estimation accuracy is lost.
7442    if use_staged_kappa && n_total >= KAPPA_POLISH_TRIGGER_N {
7443        let polish = build_uniform_pilot_subsample(
7444            n_total,
7445            KAPPA_POLISH_K,
7446            (n_total as u64).wrapping_add(0xA5A5A5A5),
7447        );
7448        *current_row_set.borrow_mut() = gam_problem::outer_subsample::RowSet::Subsample {
7449            rows: std::sync::Arc::clone(&polish.rows),
7450            n_full: n_total,
7451        };
7452        log::info!(
7453            "[KAPPA-STAGED] rotating to polish subsample: k={} at theta_star",
7454            polish.rows.len(),
7455        );
7456        // One V+G evaluation at theta_star on the polish subsample. The
7457        // returned objective pieces must be usable; the family-side cache
7458        // update inside `exact_fn` is consumed by the final fit.
7459        state.cache.ensure_theta(&theta_star)?;
7460        let (polish_cost, polish_grad, _) = {
7461            let specs = collect_specs(&state.cache);
7462            let designs = collect_designs(&state.cache);
7463            let row_set_borrow = current_row_set.borrow();
7464            exact_fn(
7465                &theta_star,
7466                &specs,
7467                &designs,
7468                gam_solve::estimate::reml::reml_outer_engine::EvalMode::ValueAndGradient,
7469                &row_set_borrow,
7470            )?
7471        };
7472        if !polish_cost.is_finite() || polish_grad.iter().any(|value| !value.is_finite()) {
7473            return Err(
7474                "polish subsample exact-joint evaluation produced non-finite objective pieces"
7475                    .to_string(),
7476            );
7477        }
7478    }
7479    *current_row_set.borrow_mut() = gam_problem::outer_subsample::RowSet::All;
7480    if use_staged_kappa {
7481        log::info!(
7482            "[KAPPA-STAGED] rotating to full data for final coefficient fit (n={})",
7483            n_total,
7484        );
7485    }
7486
7487    state.cache.ensure_theta(&theta_star)?;
7488
7489    let resolved_specs: Vec<TermCollectionSpec> = collect_specs(&state.cache);
7490    let designs: Vec<TermCollectionDesign> = collect_designs(&state.cache);
7491
7492    let fit = fit_fn(&theta_star, &resolved_specs, &designs)?;
7493
7494    for spec in &resolved_specs {
7495        log_spatial_aniso_scales(spec);
7496    }
7497
7498    Ok(SpatialLengthScaleOptimizationResult {
7499        resolved_specs,
7500        designs,
7501        fit,
7502        timing: Some(timing),
7503    })
7504}
7505
7506fn try_exact_joint_latent_coord_optimization(
7507    data: ArrayView2<'_, f64>,
7508    y: ArrayView1<'_, f64>,
7509    weights: ArrayView1<'_, f64>,
7510    offset: ArrayView1<'_, f64>,
7511    resolvedspec: &TermCollectionSpec,
7512    best: &FittedTermCollection,
7513    family: LikelihoodSpec,
7514    options: &FitOptions,
7515    latent: &StandardLatentCoordConfig,
7516) -> Result<FittedTermCollectionWithSpec, EstimationError> {
7517    use gam_solve::rho_optimizer::OuterEvalOrder;
7518    use gam_problem::{DeclaredHessianForm, Derivative, OuterEval};
7519
7520    let rho_dim = best.fit.lambdas.len();
7521    let latent_flat_dim = latent.values.len();
7522    if latent_flat_dim == 0 {
7523        crate::bail_invalid_estim!(
7524            "latent-coordinate optimization requires a non-empty latent block"
7525        );
7526    }
7527    let direct_hypers =
7528        latent_coord_initial_direct_hypers(latent.values.id_mode(), latent.values.latent_dim())?;
7529    let analytic_rho_count = latent
7530        .analytic_penalties
7531        .as_ref()
7532        .map_or(0, |registry| registry.total_rho_count());
7533    let latent_coord_ext_dim = latent_flat_dim + analytic_rho_count + direct_hypers.len();
7534
7535    let mut theta0 = Array1::<f64>::zeros(rho_dim + latent_coord_ext_dim);
7536    theta0
7537        .slice_mut(s![..rho_dim])
7538        .assign(&best.fit.lambdas.mapv(f64::ln));
7539    theta0
7540        .slice_mut(s![rho_dim..rho_dim + latent_flat_dim])
7541        .assign(latent.values.as_flat());
7542    if !direct_hypers.is_empty() {
7543        let direct_start = rho_dim + latent_flat_dim + analytic_rho_count;
7544        theta0
7545            .slice_mut(s![direct_start..direct_start + direct_hypers.len()])
7546            .assign(&direct_hypers);
7547    }
7548
7549    let mut lower = Array1::<f64>::from_elem(theta0.len(), -12.0);
7550    let mut upper = Array1::<f64>::from_elem(theta0.len(), 12.0);
7551    let latent_bound = latent
7552        .values
7553        .as_flat()
7554        .iter()
7555        .fold(1.0_f64, |acc, &v| acc.max(v.abs()))
7556        + 10.0;
7557    for axis in rho_dim..rho_dim + latent_flat_dim {
7558        lower[axis] = -latent_bound;
7559        upper[axis] = latent_bound;
7560    }
7561
7562    struct LatentJointContext<'d> {
7563        rho_dim: usize,
7564        cache: SingleBlockLatentCoordDesignCache,
7565        evaluator: gam_solve::estimate::ExternalJointHyperEvaluator<'d>,
7566    }
7567
7568    impl<'d> LatentJointContext<'d> {
7569        fn eval_full(
7570            &mut self,
7571            theta: &Array1<f64>,
7572            order: OuterEvalOrder,
7573        ) -> Result<
7574            (
7575                f64,
7576                Array1<f64>,
7577                gam_problem::HessianResult,
7578            ),
7579            EstimationError,
7580        > {
7581            if let Some(eval) = self.cache.memoized_eval(theta) {
7582                return Ok(eval);
7583            }
7584            self.cache
7585                .ensure_theta(theta)
7586                .map_err(EstimationError::InvalidInput)?;
7587            let hyper_dirs = self
7588                .cache
7589                .hyper_dirs()
7590                .map_err(EstimationError::InvalidInput)?;
7591            let design_revision = Some(self.cache.design_revision());
7592            let registry_for_key = self.cache.analytic_penalties();
7593            self.evaluator
7594                .set_analytic_penalty_registry(registry_for_key.as_deref());
7595            let mut eval = evaluate_joint_reml_outer_eval_at_theta(
7596                &mut self.evaluator,
7597                self.cache.design(),
7598                theta,
7599                self.rho_dim,
7600                hyper_dirs,
7601                None,
7602                order,
7603                design_revision,
7604            )?;
7605            let latent = self.cache.latent().map_err(EstimationError::InvalidInput)?;
7606            if let Some(registry) = registry_for_key {
7607                let mut registry = registry.as_ref().clone();
7608                registry.apply_weight_schedules(
7609                    gam_solve::estimate::reml::outer_eval::current_outer_iter() as usize,
7610                );
7611                add_analytic_penalty_objective_to_eval(
7612                    theta,
7613                    self.rho_dim,
7614                    latent.as_ref(),
7615                    &registry,
7616                    &mut eval,
7617                )?;
7618            }
7619            add_latent_id_objective_to_eval(
7620                theta,
7621                self.rho_dim,
7622                self.cache.analytic_penalty_rho_count(),
7623                latent.as_ref(),
7624                &mut eval,
7625            )?;
7626            self.cache.store_eval(eval.clone());
7627            Ok(eval)
7628        }
7629
7630        fn eval_efs(
7631            &mut self,
7632            theta: &Array1<f64>,
7633        ) -> Result<gam_problem::EfsEval, EstimationError> {
7634            self.cache
7635                .ensure_theta(theta)
7636                .map_err(EstimationError::InvalidInput)?;
7637            let hyper_dirs = self
7638                .cache
7639                .hyper_dirs()
7640                .map_err(EstimationError::InvalidInput)?;
7641            let registry_for_key = self.cache.analytic_penalties();
7642            self.evaluator
7643                .set_analytic_penalty_registry(registry_for_key.as_deref());
7644            let mut efs = evaluate_joint_reml_efs_at_theta(
7645                &mut self.evaluator,
7646                self.cache.design(),
7647                theta,
7648                self.rho_dim,
7649                hyper_dirs,
7650                None,
7651                Some(self.cache.design_revision()),
7652            )?;
7653            if let Some(registry) = registry_for_key {
7654                let mut registry = registry.as_ref().clone();
7655                registry.apply_weight_schedules(
7656                    gam_solve::estimate::reml::outer_eval::current_outer_iter() as usize,
7657                );
7658                let latent = self.cache.latent().map_err(EstimationError::InvalidInput)?;
7659                let contribution = analytic_penalty_objective_contribution(
7660                    theta,
7661                    self.rho_dim,
7662                    latent.as_ref(),
7663                    &registry,
7664                )?;
7665                efs.cost += contribution.cost;
7666                if let (Some(psi_gradient), Some(psi_indices)) =
7667                    (efs.psi_gradient.as_mut(), efs.psi_indices.as_ref())
7668                {
7669                    if psi_gradient.len() != psi_indices.len() {
7670                        crate::bail_invalid_estim!(
7671                            "latent-coordinate analytic penalty EFS psi gradient length mismatch: gradient={}, indices={}",
7672                            psi_gradient.len(),
7673                            psi_indices.len()
7674                        );
7675                    }
7676                    for (local_idx, &theta_idx) in psi_indices.iter().enumerate() {
7677                        psi_gradient[local_idx] += contribution.gradient[theta_idx];
7678                    }
7679                }
7680            }
7681            Ok(efs)
7682        }
7683
7684        fn eval_cost(&mut self, theta: &Array1<f64>) -> f64 {
7685            if let Some(cost) = self.cache.memoized_cost(theta) {
7686                return cost;
7687            }
7688            if self.cache.ensure_theta(theta).is_err() {
7689                return f64::INFINITY;
7690            }
7691            let design_revision = Some(self.cache.design_revision());
7692            let registry_for_key = self.cache.analytic_penalties();
7693            self.evaluator
7694                .set_analytic_penalty_registry(registry_for_key.as_deref());
7695            let result = {
7696                let design = self.cache.design();
7697                self.evaluator.evaluate_cost_only(
7698                    &design.design,
7699                    &design.penalties,
7700                    &design.nullspace_dims,
7701                    design.linear_constraints.clone(),
7702                    theta,
7703                    self.rho_dim,
7704                    None,
7705                    "latent-coordinate-joint cost-only",
7706                    design_revision,
7707                )
7708            };
7709            match result {
7710                Ok(cost) => {
7711                    let latent = match self.cache.latent() {
7712                        Ok(latent) => latent,
7713                        Err(_) => return f64::INFINITY,
7714                    };
7715                    let contribution = match latent_id_objective_contribution(
7716                        theta,
7717                        self.rho_dim,
7718                        self.cache.analytic_penalty_rho_count(),
7719                        latent.as_ref(),
7720                    ) {
7721                        Ok(contribution) => contribution,
7722                        Err(_) => return f64::INFINITY,
7723                    };
7724                    let cost = cost + contribution.cost;
7725                    let cost = if let Some(registry) = registry_for_key {
7726                        let mut registry = registry.as_ref().clone();
7727                        registry.apply_weight_schedules(
7728                            gam_solve::estimate::reml::outer_eval::current_outer_iter()
7729                                as usize,
7730                        );
7731                        match analytic_penalty_objective_contribution(
7732                            theta,
7733                            self.rho_dim,
7734                            latent.as_ref(),
7735                            &registry,
7736                        ) {
7737                            Ok(contribution) => cost + contribution.cost,
7738                            Err(_) => return f64::INFINITY,
7739                        }
7740                    } else {
7741                        cost
7742                    };
7743                    self.cache.store_cost(cost);
7744                    cost
7745                }
7746                Err(_) => f64::INFINITY,
7747            }
7748        }
7749    }
7750
7751    let mut ctx = LatentJointContext {
7752        rho_dim,
7753        cache: SingleBlockLatentCoordDesignCache::new(
7754            data.to_owned(),
7755            resolvedspec.clone(),
7756            best.design.clone(),
7757            latent,
7758            rho_dim,
7759        )
7760        .map_err(EstimationError::InvalidInput)?,
7761        evaluator: gam_solve::estimate::ExternalJointHyperEvaluator::new(
7762            y,
7763            weights,
7764            &best.design.design,
7765            offset,
7766            &best.design.penalties,
7767            &external_opts_for_design(&family, &best.design, options),
7768            "latent-coordinate-joint",
7769        )?,
7770    };
7771    let registry_for_key = ctx.cache.analytic_penalties();
7772    ctx.evaluator
7773        .set_analytic_penalty_registry(registry_for_key.as_deref());
7774    ctx.evaluator
7775        .set_persistent_latent_values_fingerprint(latent.values.id_mode());
7776    if let Some(cached_t) = ctx
7777        .evaluator
7778        .load_persistent_latent_values(latent.values.n_obs(), latent.values.latent_dim())
7779    {
7780        let cached_t: Array2<f64> = cached_t;
7781        for (dst, src) in theta0
7782            .slice_mut(s![rho_dim..rho_dim + latent_flat_dim])
7783            .iter_mut()
7784            .zip(cached_t.iter())
7785        {
7786            *dst = *src;
7787        }
7788    }
7789
7790    let problem = exact_joint_multistart_outer_problem(
7791        &theta0,
7792        &lower,
7793        &upper,
7794        rho_dim,
7795        latent_coord_ext_dim,
7796        theta0.len(),
7797        Derivative::Analytic,
7798        DeclaredHessianForm::Unavailable,
7799        false,
7800        false,
7801        seed_risk_profile_for_likelihood_family(&family),
7802        options.tol,
7803        options.max_iter.max(1),
7804        Some(5.0),
7805        Some(0.5),
7806        None,
7807        // n-scaled profiled-criterion calibration (same absolute-gradient-floor
7808        // correction as the spatial paths; #1053 / #1066 / #1069).
7809        Some((data.nrows(), best.design.design.ncols().max(1))),
7810        // #1464: widen the over-smoothing ρ ceiling and seed the high-ρ probe
7811        // only when a constant-curvature curv() term is present in this fit.
7812        !constant_curvature_term_indices(resolvedspec).is_empty(),
7813    );
7814
7815    let eval_outer = |ctx: &mut &mut LatentJointContext<'_>,
7816                      theta: &Array1<f64>,
7817                      order: OuterEvalOrder|
7818     -> Result<OuterEval, EstimationError> {
7819        let (cost, gradient, hessian) = ctx.eval_full(theta, order)?;
7820        Ok(OuterEval {
7821            cost,
7822            gradient,
7823            hessian,
7824            inner_beta_hint: None,
7825        })
7826    };
7827
7828    let result = {
7829        let mut obj = problem.build_objective_with_eval_order(
7830            &mut ctx,
7831            |ctx: &mut &mut LatentJointContext<'_>, theta: &Array1<f64>| Ok(ctx.eval_cost(theta)),
7832            |ctx: &mut &mut LatentJointContext<'_>, theta: &Array1<f64>| {
7833                eval_outer(ctx, theta, OuterEvalOrder::ValueAndGradient)
7834            },
7835            |ctx: &mut &mut LatentJointContext<'_>, theta: &Array1<f64>, order: OuterEvalOrder| {
7836                eval_outer(ctx, theta, order)
7837            },
7838            Some(|ctx: &mut &mut LatentJointContext<'_>| {
7839                ctx.cache.reset();
7840            }),
7841            Some(|ctx: &mut &mut LatentJointContext<'_>, theta: &Array1<f64>| ctx.eval_efs(theta)),
7842        );
7843
7844        problem
7845            .run(&mut obj, "latent-coordinate joint REML")
7846            .map_err(|e| {
7847                EstimationError::InvalidInput(format!(
7848                    "latent-coordinate joint optimization failed after exhausting strategy fallbacks: {e}"
7849                ))
7850            })?
7851    };
7852    if !result.converged {
7853        crate::bail_invalid_estim!(
7854            "latent-coordinate joint optimization did not converge after {} iterations (final_objective={:.6e}, final_grad_norm={})",
7855            result.iterations,
7856            result.final_value,
7857            result.final_grad_norm_report(),
7858        );
7859    }
7860
7861    let theta_star = result.rho;
7862    let rho_star = theta_star.slice(s![..rho_dim]).mapv(f64::exp);
7863    let mut final_data = data.to_owned();
7864    let flat_t = theta_star
7865        .slice(s![rho_dim..rho_dim + latent_flat_dim])
7866        .to_owned();
7867    let mut fitted_latent_values =
7868        Array2::<f64>::zeros((latent.values.n_obs(), latent.values.latent_dim()));
7869    for n in 0..latent.values.n_obs() {
7870        for axis in 0..latent.values.latent_dim() {
7871            let value = flat_t[n * latent.values.latent_dim() + axis];
7872            fitted_latent_values[[n, axis]] = value;
7873            final_data[[n, latent.feature_cols[axis]]] = value;
7874        }
7875    }
7876    let optimized = fit_term_collection_forspecwith_heuristic_lambdas(
7877        final_data.view(),
7878        y,
7879        weights,
7880        offset,
7881        resolvedspec,
7882        rho_star.as_slice(),
7883        family,
7884        options,
7885    )?;
7886    ctx.evaluator
7887        .store_persistent_latent_values(&fitted_latent_values);
7888    let mut fit = optimized.fit;
7889    fit.reml_score = result.final_value;
7890    fit.penalized_objective = result.final_value;
7891    Ok(FittedTermCollectionWithSpec {
7892        fit,
7893        design: optimized.design,
7894        resolvedspec: resolvedspec.clone(),
7895        adaptive_diagnostics: optimized.adaptive_diagnostics,
7896        kappa_timing: None,
7897    })
7898}
7899
7900pub fn fit_term_collectionwith_latent_coord_optimization(
7901    data: ArrayView2<'_, f64>,
7902    y: Array1<f64>,
7903    weights: Array1<f64>,
7904    offset: Array1<f64>,
7905    spec: &TermCollectionSpec,
7906    latent: &StandardLatentCoordConfig,
7907    family: LikelihoodSpec,
7908    options: &FitOptions,
7909) -> Result<FittedTermCollectionWithSpec, EstimationError> {
7910    let n = data.nrows();
7911    if !(y.len() == n && weights.len() == n && offset.len() == n) {
7912        crate::bail_invalid_estim!(
7913            "fit_term_collectionwith_latent_coord_optimization row mismatch: n={}, y={}, weights={}, offset={}",
7914            n,
7915            y.len(),
7916            weights.len(),
7917            offset.len()
7918        );
7919    }
7920    let best = fit_term_collection_forspec(
7921        data,
7922        y.view(),
7923        weights.view(),
7924        offset.view(),
7925        spec,
7926        family.clone(),
7927        options,
7928    )?;
7929    let resolvedspec = freeze_term_collection_from_design(spec, &best.design)?;
7930    try_exact_joint_latent_coord_optimization(
7931        data,
7932        y.view(),
7933        weights.view(),
7934        offset.view(),
7935        &resolvedspec,
7936        &best,
7937        family,
7938        options,
7939        latent,
7940    )
7941}
7942
7943pub fn fit_term_collectionwith_spatial_length_scale_optimization(
7944    data: ArrayView2<'_, f64>,
7945    y: Array1<f64>,
7946    weights: Array1<f64>,
7947    offset: Array1<f64>,
7948    spec: &TermCollectionSpec,
7949    family: LikelihoodSpec,
7950    options: &FitOptions,
7951    kappa_options: &SpatialLengthScaleOptimizationOptions,
7952) -> Result<FittedTermCollectionWithSpec, EstimationError> {
7953    // Spatial hyperparameters change kernel geometry nonlinearly, so each
7954    // proposal rebuilds the spatial basis. Hybrid/isotropic terms expose a
7955    // scalar κ (= 1/length_scale); pure Duchon anisotropy exposes only
7956    // per-axis shape coordinates.
7957    //
7958    // When exact derivative information is available for the rebuilt basis and
7959    // penalty, kappa is promoted to a first-class outer hyperparameter beside
7960    // rho = log(lambda). In that mode this routine runs a joint outer solve in
7961    // theta = [rho, psi], where psi = log(kappa) = -log(length_scale), and the
7962    // optimizer is expected to consume a real joint Hessian. ARC is not meant
7963    // to run on a gradient-only surrogate here.
7964    //
7965    // Any eligible spatial smooth participates in this outer solve. If an
7966    // eligible spatial basis does not expose derivative information, that is
7967    // now a hard error.
7968    let mut resolvedspec = spec.clone();
7969    let spatial_terms = spatial_length_scale_term_indices(&resolvedspec);
7970    let n = data.nrows();
7971    if !(y.len() == n && weights.len() == n && offset.len() == n) {
7972        crate::bail_invalid_estim!(
7973            "fit_term_collectionwith_spatial_length_scale_optimization row mismatch: n={}, y={}, weights={}, offset={}",
7974            n,
7975            y.len(),
7976            weights.len(),
7977            offset.len()
7978        );
7979    }
7980    if !kappa_options.enabled || spatial_terms.is_empty() {
7981        let out = fit_term_collection_forspec(
7982            data,
7983            y.view(),
7984            weights.view(),
7985            offset.view(),
7986            &resolvedspec,
7987            family,
7988            options,
7989        )?;
7990        let resolvedspec = freeze_term_collection_from_design(&resolvedspec, &out.design)?;
7991        return Ok(FittedTermCollectionWithSpec {
7992            fit: out.fit,
7993            design: out.design,
7994            resolvedspec,
7995            adaptive_diagnostics: out.adaptive_diagnostics,
7996            kappa_timing: None,
7997        });
7998    }
7999    if kappa_options.max_outer_iter == 0 {
8000        crate::bail_invalid_estim!("spatial kappa optimization requires max_outer_iter >= 1");
8001    }
8002    if !(kappa_options.log_step.is_finite() && kappa_options.log_step > 0.0) {
8003        crate::bail_invalid_estim!("spatial kappa optimization requires log_step > 0");
8004    }
8005    if !(kappa_options.min_length_scale.is_finite()
8006        && kappa_options.max_length_scale.is_finite()
8007        && kappa_options.min_length_scale > 0.0
8008        && kappa_options.max_length_scale >= kappa_options.min_length_scale)
8009    {
8010        crate::bail_invalid_estim!(
8011            "spatial kappa optimization requires valid positive length_scale bounds"
8012        );
8013    }
8014
8015    let pilot_threshold = kappa_options.pilot_subsample_threshold;
8016    if pilot_threshold > 0 && n > pilot_threshold * 2 {
8017        log::info!(
8018            "[spatial-kappa] n={n} exceeds pilot threshold {}; using pilot geometry only for deterministic anisotropy initialization",
8019            pilot_threshold * 2,
8020        );
8021        apply_spatial_anisotropy_pilot_initializer(
8022            data,
8023            &mut resolvedspec,
8024            &spatial_terms,
8025            pilot_threshold,
8026            kappa_options,
8027        );
8028    }
8029
8030    // #1376: the geometry-only anisotropy seed (`initial_aniso_contrasts`, from
8031    // per-axis knot-coordinate spread) is blind to the response, so a signal
8032    // axis and a nuisance axis with equal coordinate spread both seed to ~0 and
8033    // the κ optimizer can stall at the symmetric point (it found a weak/flat
8034    // antisymmetric gradient, amplified by double-penalty nullspace shrinkage).
8035    // Add a bounded, response-aware per-axis nudge so the optimizer starts in
8036    // the correct basin. This runs whether or not the pilot initializer fired
8037    // (the pilot path is gated on a large-n threshold).
8038    apply_response_aware_anisotropy_seed(data, y.view(), &mut resolvedspec, &spatial_terms);
8039
8040    // #1464: pin each constant-curvature term's κ to the κ-FAIR sign-scan value
8041    // BEFORE the baseline fit. The production profiled-REML criterion
8042    // (`fixed_kappa_profiled_reml_score`) that drives BOTH the baseline geometry
8043    // and the joint solve's accept-vs-baseline gate (`joint_final_value >
8044    // baseline_score`) is SIGN-BLIND — its data-fit term decreases monotonically
8045    // toward +κ for either truth sign, so a baseline left at κ = 0 always beats a
8046    // correctly-signed-but-negative κ candidate on raw REML, and the gate discards
8047    // the right answer (hyperbolic κ̂ → 0, recovered as spherical). Only the κ-fair
8048    // scan (`constant_curvature_kappa_fair_sign_score`, which subtracts the
8049    // design's generic radial-peak-fitting power) identifies the sign; since the κ
8050    // MAGNITUDE is unidentified (raw V_p rails to a chart bound regardless), the
8051    // scan's argmin is the authoritative κ̂. Pinning the baseline there makes the
8052    // baseline, the frozen joint candidate (see the κ-PIN in
8053    // `try_exact_joint_spatial_length_scale_optimization`), and the gate all agree
8054    // on the sign-correct κ. Byte-identical for genuinely spherical data (the scan
8055    // and the raw criterion both pick the +bound there) and for non-CC spatial
8056    // terms (never entered). A scan result of κ = 0 (genuinely flat) leaves κ as-is.
8057    for term_idx in constant_curvature_term_indices(&resolvedspec) {
8058        if let Some(kappa_seed) =
8059            select_constant_curvature_kappa_sign_seed(data, y.view(), &resolvedspec, term_idx)
8060            && kappa_seed != 0.0
8061            && let Some(SmoothBasisSpec::ConstantCurvature { spec: cc, .. }) =
8062                resolvedspec.smooth_terms.get_mut(term_idx).map(|t| &mut t.basis)
8063        {
8064            log::info!(
8065                "[#1464] pinned CC term {term_idx} baseline κ to κ-fair scan value {kappa_seed} \
8066                 (raw profiled REML is sign-blind; scan is authoritative for the sign)"
8067            );
8068            cc.kappa = kappa_seed;
8069        }
8070    }
8071
8072    let baseline_options = superseded_fit_options(options);
8073    let mut best = fit_term_collection_forspec(
8074        data,
8075        y.view(),
8076        weights.view(),
8077        offset.view(),
8078        &resolvedspec,
8079        family.clone(),
8080        &baseline_options,
8081    )?;
8082    resolvedspec = freeze_term_collection_from_design(&resolvedspec, &best.design)?;
8083    // The freeze step can rewrite a term's basis variant — most notably when
8084    // `build_thin_plate_basis_with_workspace` auto-promotes an infeasible
8085    // canonical-TPS request to a pure Duchon spline (length_scale = None,
8086    // no anisotropy). The pre-fit eligibility list was computed against the
8087    // ThinPlate spec, which has length_scale set, so it included that term.
8088    // After the rewrite the same term is a *pure* Duchon basis with no free
8089    // length-scale parameter to optimize, and the downstream kappa solver
8090    // (which assumes hybrid Duchon for log-κ derivatives) errors out. Refresh
8091    // the index list so it reflects the post-freeze spec.
8092    let mut spatial_terms = spatial_length_scale_term_indices(&resolvedspec);
8093    // Sync knot-cloud-derived aniso contrasts from the basis metadata back
8094    // into the spec so the optimizer starts from the geometry-informed η values
8095    // rather than the zero sentinel from --scale-dimensions.
8096    sync_aniso_contrasts_from_metadata(&mut resolvedspec, &best.design.smooth);
8097    // #1074: kernel-range multi-start. The single midpoint seed can strand the
8098    // joint [ρ, ψ] solver in a long-range local optimum for the roughest kernels
8099    // (Matérn ν=3/2); a coarse log-κ grid restart re-seeds the spec's length
8100    // scale in the globally best-scoring basin before the joint solve refines it.
8101    // Strict-improvement-only, so a fit the midpoint already solved well is left
8102    // byte-identical. Isotropic/non-CC only (gated inside the helper).
8103    let mut prescan_improved = false;
8104    if !spatial_terms.is_empty() {
8105        let baseline_score = fit_score(&best.fit);
8106        let range_overrides = prescan_isotropic_spatial_range_seed(
8107            data,
8108            y.view(),
8109            weights.view(),
8110            offset.view(),
8111            &resolvedspec,
8112            baseline_score,
8113            &family,
8114            &baseline_options,
8115            kappa_options,
8116            &spatial_terms,
8117        )?;
8118        if !range_overrides.is_empty() {
8119            prescan_improved = true;
8120            for (term_idx, length_scale) in range_overrides {
8121                set_spatial_length_scale(&mut resolvedspec, term_idx, length_scale)?;
8122            }
8123            // Recompute the baseline (and re-freeze) at the re-seeded geometry so
8124            // the joint solver's ψ seed, ρ seed, accept/reject gate, and the
8125            // frozen-baseline fallback all start from the better basin.
8126            best = fit_term_collection_forspec(
8127                data,
8128                y.view(),
8129                weights.view(),
8130                offset.view(),
8131                &resolvedspec,
8132                family.clone(),
8133                &baseline_options,
8134            )?;
8135            resolvedspec = freeze_term_collection_from_design(&resolvedspec, &best.design)?;
8136            // A re-seeded length scale can, in rare geometries, re-trigger the
8137            // freeze-time basis promotion (ThinPlate → pure Duchon); refresh the
8138            // spatial-term index list so the joint solve sees the current spec.
8139            spatial_terms = spatial_length_scale_term_indices(&resolvedspec);
8140        }
8141    }
8142    if spatial_terms.is_empty() {
8143        let fitted = fit_term_collection_forspecwith_heuristic_lambdas(
8144            data,
8145            y.view(),
8146            weights.view(),
8147            offset.view(),
8148            &resolvedspec,
8149            best.fit.lambdas.as_slice(),
8150            family,
8151            options,
8152        )?;
8153        return Ok(FittedTermCollectionWithSpec {
8154            fit: fitted.fit,
8155            design: fitted.design,
8156            resolvedspec,
8157            adaptive_diagnostics: fitted.adaptive_diagnostics,
8158            kappa_timing: None,
8159        });
8160    }
8161    let initial_score = fit_score(&best.fit);
8162    if !initial_score.is_finite() {
8163        log::debug!("[spatial-kappa] initial profiled score is non-finite");
8164    }
8165    let joint_result = try_exact_joint_spatial_length_scale_optimization(
8166        data,
8167        y.view(),
8168        weights.view(),
8169        offset.view(),
8170        &resolvedspec,
8171        &best,
8172        family.clone(),
8173        options,
8174        kappa_options,
8175        &spatial_terms,
8176    )
8177    .map(|opt| {
8178        opt.map(|fit| {
8179            let score = fit_score(&fit.fit);
8180            (fit, score)
8181        })
8182    });
8183    // #1074: when the multi-start pre-scan already placed the seed in a good,
8184    // finite basin, a HARD joint-solve failure (e.g. a NaN covariance from κ
8185    // railing into the kernel-collapse corner during the local polish) must not
8186    // sink the whole fit — the pre-scan geometry is itself a valid κ-optimized
8187    // result (ρ profiled at the best-scoring fixed κ). Fall back to it, exactly
8188    // as the NonConverged / worsened-score gates inside the joint solver already
8189    // fall back to the frozen baseline. Only the local polish (a fraction of a
8190    // REML nat) is forgone. Scoped to the pre-scan-improved case so ordinary
8191    // joint failures keep raising as before.
8192    let exact_joint = if prescan_improved && !matches!(joint_result, Ok(Some(_))) {
8193        let reason = match &joint_result {
8194            Err(e) => format!("error: {e}"),
8195            _ => "unavailable".to_string(),
8196        };
8197        log::info!(
8198            "[spatial-kappa] #1074 joint polish yielded no usable candidate \
8199             ({reason}); returning the multi-start pre-scan geometry (REML {initial_score:.5})"
8200        );
8201        FittedTermCollectionWithSpec {
8202            fit: best.fit,
8203            design: best.design,
8204            resolvedspec,
8205            adaptive_diagnostics: best.adaptive_diagnostics,
8206            kappa_timing: None,
8207        }
8208    } else {
8209        require_successful_spatial_optimization_result(initial_score, joint_result)?
8210    };
8211    log_spatial_aniso_scales(&exact_joint.resolvedspec);
8212    Ok(exact_joint)
8213}
8214
8215/// The end-to-end curvature-as-an-estimand report for one `curv(...)` smooth:
8216/// the fitted κ̂, its profile-likelihood confidence interval, the interior
8217/// κ = 0 likelihood-ratio flatness test, and the topology-free geometry
8218/// verdict. This is the #944 headline — it turns "we chose hyperbolic space"
8219/// into "κ̂ = −1.8 (95% CI −2.6, −1.1), flat rejected at p = …".
8220#[derive(Clone, Debug)]
8221pub struct CurvatureInference {
8222    /// Smooth-term index of the `curv(...)` term this report is about.
8223    pub term_idx: usize,
8224    /// The fitted signed sectional curvature κ̂ (the outer optimiser's argmin of
8225    /// the profiled REML/LAML criterion over κ).
8226    pub kappa_hat: f64,
8227    /// Profile-likelihood CI for κ and the geometry verdict from its sign.
8228    pub ci: gam_geometry::curvature_estimand::KappaProfileCi,
8229    /// Interior-point κ = 0 likelihood-ratio flatness test (full χ²₁, no
8230    /// half-χ² boundary correction — κ = 0 is an interior point of the
8231    /// `S^d ← ℝ^d → H^d` family).
8232    pub flatness: gam_geometry::curvature_estimand::FlatnessTest,
8233}
8234
8235/// Compute the #944 curvature inference for the constant-curvature smooth at
8236/// `term_idx`, given the already-fitted resolved spec (carrying κ̂) and the same
8237/// fit inputs used to produce it.
8238///
8239/// The profiled criterion `V_p(κ) = max_{ρ} V(κ, ρ)` is evaluated as an oracle:
8240/// for each probe κ, pin the term's curvature to κ, fit with κ-optimisation
8241/// **disabled** (so only the smoothing parameters ρ are profiled), and read the
8242/// resulting `reml_score` (the negative-log-evidence the outer loop minimises,
8243/// so κ̂ is its argmin). The exact same criterion the joint κ-fit minimised —
8244/// the only difference is which coordinates move — so κ̂ is a genuine stationary
8245/// point of this oracle. The statistics (profile-CI walk, interior κ=0 LR test)
8246/// are then the principled likelihood-set / Wilks constructions in
8247/// [`gam_geometry::curvature_estimand`].
8248///
8249/// `v_pp` (the initial Wald step size) is taken from a central finite difference
8250/// of `V_p` at κ̂; the CI itself is the exact χ²₁ likelihood crossing, not the
8251/// Wald ellipsoid, so this only sizes the first bracket step.
8252pub fn curvature_inference_forspec(
8253    data: ArrayView2<'_, f64>,
8254    y: ArrayView1<'_, f64>,
8255    weights: ArrayView1<'_, f64>,
8256    offset: ArrayView1<'_, f64>,
8257    resolvedspec: &TermCollectionSpec,
8258    term_idx: usize,
8259    family: LikelihoodSpec,
8260    options: &FitOptions,
8261    level: f64,
8262) -> Result<CurvatureInference, EstimationError> {
8263    let kappa_hat = get_constant_curvature_kappa(resolvedspec, term_idx).ok_or_else(|| {
8264        EstimationError::InvalidInput(format!(
8265            "curvature_inference_forspec: term {term_idx} is not a constant-curvature smooth"
8266        ))
8267    })?;
8268    let (kappa_min, kappa_max) = constant_curvature_kappa_bounds(data, resolvedspec, term_idx);
8269
8270    // Profiled criterion oracle V_p(κ) for the CI walk and the κ = 0 flatness LR
8271    // test. This MUST be the same criterion that selected κ̂, otherwise the
8272    // statistics are inconsistent with the point estimate. For a constant-
8273    // curvature smooth κ̂ is chosen by the κ-FAIR criterion
8274    // (`constant_curvature_kappa_fair_sign_score`, #1464) — the raw
8275    // `fixed_kappa_profiled_reml_score` is sign-BLIND in κ on a generic radial
8276    // signal (the +κ chart's distance-compression is a uniformly better
8277    // interpolator regardless of the true sign, so raw V_p rails to the +chart
8278    // bound for both signs and would report `V_p(0) < V_p(κ̂)`, i.e. a flatness
8279    // p-value of 1 even for genuinely curved truth). We therefore evaluate the
8280    // CI/flatness criterion with the κ-fair score, which subtracts the design's
8281    // generic radial-peak-fitting power so only the genuine curvature-shape
8282    // signal remains and `V_fair(κ̂) < V_fair(0)` for curved truth. The κ-fair
8283    // score is the basis-level criterion; resolve this term's feature columns and
8284    // base spec so each κ-probe scores the production constant-curvature basis.
8285    // Use the κ-fair criterion for the CI/flatness ONLY when κ̂ is in the
8286    // hyperbolic basin (κ̂ < 0) — the regime where κ̂ was chosen by the κ-fair
8287    // fast-path (`constant_curvature_kappa_fair_argmin`), so the flatness LR and
8288    // CI must use the SAME criterion to be consistent (raw V_p is sign-blind and
8289    // would report `V_p(0) < V_p(κ̂)`, a flatness p-value of 1 even for genuinely
8290    // hyperbolic truth). For κ̂ ≥ 0 (spherical via the joint solver, or a genuinely
8291    // flat κ̂ ≈ 0) the raw production V_p is the right, scale-correct criterion and
8292    // already sizes flatness correctly, so we keep it — this preserves the
8293    // spherical and flat statistics unchanged.
8294    let cc_fair_inputs: Option<(Array2<f64>, gam_terms::basis::ConstantCurvatureBasisSpec)> =
8295        if kappa_hat < 0.0 {
8296            match resolvedspec.smooth_terms.get(term_idx).map(|t| &t.basis) {
8297                Some(SmoothBasisSpec::ConstantCurvature {
8298                    feature_cols, spec, ..
8299                }) => select_columns(data, feature_cols)
8300                    .ok()
8301                    .map(|x| (x, spec.clone())),
8302                _ => None,
8303            }
8304        } else {
8305            None
8306        };
8307
8308    // Memoize across κ probes. The CI walk's bracketing/bisection, the
8309    // central-difference v_pp seed, and the flatness LR test all re-evaluate
8310    // the criterion at the SAME κ, so caching by the raw bits of κ removes
8311    // redundant evaluations with no change to the statistical answer.
8312    let v_p_cache: std::cell::RefCell<std::collections::HashMap<u64, f64>> =
8313        std::cell::RefCell::new(std::collections::HashMap::new());
8314    let v_p = |kappa: f64| -> Result<f64, String> {
8315        if !kappa.is_finite() {
8316            return Err(format!("V_p probed a non-finite κ = {kappa}"));
8317        }
8318        let key = kappa.to_bits();
8319        if let Some(&cached) = v_p_cache.borrow().get(&key) {
8320            return Ok(cached);
8321        }
8322        let score = if let Some((x_term, base_spec)) = &cc_fair_inputs {
8323            let mut probe_spec = base_spec.clone();
8324            probe_spec.kappa = kappa;
8325            gam_terms::basis::constant_curvature_kappa_fair_sign_score(x_term.view(), y, &probe_spec)
8326                .map_err(|e| format!("κ-fair criterion at κ={kappa} failed: {e}"))?
8327        } else {
8328            fixed_kappa_profiled_reml_score(
8329                data,
8330                y,
8331                weights,
8332                offset,
8333                resolvedspec,
8334                term_idx,
8335                kappa,
8336                family.clone(),
8337                options,
8338            )
8339            .map_err(|e| format!("V_p fixed-κ fit at κ={kappa} failed: {e}"))?
8340        };
8341        v_p_cache.borrow_mut().insert(key, score);
8342        Ok(score)
8343    };
8344
8345    // Wald step seed: central FD of V_p at κ̂ (only sizes the first bracket; the
8346    // CI is the exact likelihood crossing). Step a small fraction of the κ
8347    // window so the FD straddles κ̂ without leaving the chart.
8348    let h = (1e-3 * (kappa_max - kappa_min)).max(1e-4);
8349    let v_pp = match (v_p(kappa_hat + h), v_p(kappa_hat), v_p(kappa_hat - h)) {
8350        (Ok(vp), Ok(v0), Ok(vm)) => (vp - 2.0 * v0 + vm) / (h * h),
8351        _ => f64::NAN, // profile_ci_walk falls back to a default step
8352    };
8353
8354    let ci = gam_geometry::curvature_estimand::profile_ci_walk(
8355        &v_p, kappa_hat, v_pp, kappa_min, kappa_max, level, 1e-4,
8356    )
8357    .map_err(EstimationError::InvalidInput)?;
8358    let flatness = gam_geometry::curvature_estimand::flatness_lr_test(&v_p, kappa_hat)
8359        .map_err(EstimationError::InvalidInput)?;
8360
8361    Ok(CurvatureInference {
8362        term_idx,
8363        kappa_hat,
8364        ci,
8365        flatness,
8366    })
8367}
8368
/// Records which statistic the reported smooth-term p-value was built from —
/// the provenance tag of the smooth-term significance correction (#1063).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SmoothLrCorrection {
    /// Per-term LR statistic corrected with the full estimated-λ Lawley factor,
    /// which includes the ρ̂-sampling-variation contribution obtained from the
    /// regularized inverse REML/LAML outer Hessian.
    LawleyLrEstimatedLambda,
    /// Per-term likelihood-ratio statistic `W = 2(ℓ_full − ℓ_null)`,
    /// Bartlett-corrected with the fixed-λ Lawley factor `c = E[W|λ]/d`
    /// (`W* = W/c`, referenced against `χ²_d`). Used only when the estimated-λ
    /// handoff is unavailable.
    LawleyLrFixedLambda,
    /// No second-order correction: either the family has no closed-form Lawley
    /// cumulant jets or the null refit did not converge, so the raw LR
    /// statistic's uncorrected `χ²_d` reference stands.
    None,
}

impl SmoothLrCorrection {
    /// Returns the serialized provenance label shown in the summary table.
    pub fn label(self) -> &'static str {
        match self {
            Self::LawleyLrEstimatedLambda => "lawley_lr_estimated_lambda",
            Self::LawleyLrFixedLambda => "lawley_lr_fixed_lambda",
            Self::None => "none",
        }
    }
}
8398
8399/// The Bartlett-corrected per-term significance report for one penalized smooth
8400/// term (#1063). Unlike the summary table's Wood rank-truncated **Wald**
8401/// statistic, this is a genuine **likelihood-ratio** statistic from a
8402/// constrained refit (the smooth dropped), so the exact Lawley LR Bartlett
8403/// factor corrects the right quantity.
8404#[derive(Clone, Debug)]
8405pub struct SmoothTermLrInference {
8406    /// Smooth-term name (matches the summary row).
8407    pub name: String,
8408    /// Smooth-term index within `resolvedspec.smooth_terms`.
8409    pub term_idx: usize,
8410    /// The uncorrected likelihood-ratio statistic `W = 2(ℓ_full − ℓ_null)`,
8411    /// floored at zero (a non-negative LR by construction).
8412    pub statistic_lr: f64,
8413    /// Reference degrees of freedom `d` (the Wood truncation `tr(F)²/tr(F²)` on
8414    /// the term's influence block, falling back to the term EDF).
8415    pub ref_df: f64,
8416    /// Lawley LR Bartlett factor `c = E[W]/d = 1 + Δε/d` when computable, else
8417    /// `1.0` (no correction).
8418    pub bartlett_factor: f64,
8419    /// Fixed-λ conditional factor `c_cond = 1 + Δε(ρ̂)/d` when the estimated-λ
8420    /// correction was applied. `None` means the applied factor was either the
8421    /// fixed-λ factor itself or no Lawley correction was available.
8422    pub bartlett_factor_conditional: Option<f64>,
8423    /// Increment in Lawley's LR mean shift due solely to ρ̂ sampling variation,
8424    /// `0.5 * tr(H_Δε Cov(ρ̂))`, when estimated-λ correction was applied.
8425    pub rho_variation_shift: Option<f64>,
8426    /// Bartlett-corrected statistic `W* = W / c`.
8427    pub statistic_corrected: f64,
8428    /// Uncorrected p-value `P(χ²_d > W)`.
8429    pub p_value_uncorrected: f64,
8430    /// Corrected p-value `P(χ²_d > W*)`; equals the uncorrected value when no
8431    /// correction was applied.
8432    pub p_value_corrected: f64,
8433    /// Whether the second-order correction is **material** (#939 deliverable 4):
8434    /// the per-test diagnostic "is `n` too small for first-order inference
8435    /// *here*?". `true` when a correction was applied and it moves the result by
8436    /// more than [`SMOOTH_LR_MATERIAL_THRESHOLD`] — measured as the larger of the
8437    /// relative Bartlett-factor distance from one `|c − 1|` and the relative
8438    /// p-value change `|p* − p| / max(p, p*, ε)`. `false` when `correction` is
8439    /// [`SmoothLrCorrection::None`] (no correction was applied).
8440    pub material: bool,
8441    /// Which statistic the corrected p-value is built from.
8442    pub correction: SmoothLrCorrection,
8443}
8444
/// Cut-off used by [`SmoothTermLrInference::material`] (#939 deliverable 4):
/// a second-order correction counts as material once it moves the result by
/// more than 10%.
pub const SMOOTH_LR_MATERIAL_THRESHOLD: f64 = 0.1;
8449
8450/// Build `S_b = lambda_b * S_b^unit` as global `p_total x p_total` matrices in
8451/// exactly the fitted rho/lambda ordering. This is the narrow handoff the
8452/// estimated-lambda Lawley correction needs: the same `design.penalties` order
8453/// already paired with `fit.lambdas`, without changing #740's outer-Hessian
8454/// algebra or the production penalty assembly.
8455fn fitted_rho_penalty_components(
8456    penalties: &[BlockwisePenalty],
8457    lambdas: &[f64],
8458    p_total: usize,
8459) -> Result<Vec<gam_terms::inference::lawley::RhoPenaltyComponent>, EstimationError> {
8460    if penalties.len() != lambdas.len() {
8461        return Err(EstimationError::InvalidInput(format!(
8462            "smooth_term_lr_inference: penalty/lambda count mismatch ({} penalties, {} lambdas)",
8463            penalties.len(),
8464            lambdas.len()
8465        )));
8466    }
8467    let mut components = Vec::with_capacity(penalties.len());
8468    for (idx, (penalty, &lambda)) in penalties.iter().zip(lambdas.iter()).enumerate() {
8469        if !(lambda.is_finite() && lambda >= 0.0) {
8470            return Err(EstimationError::InvalidInput(format!(
8471                "smooth_term_lr_inference: lambda[{idx}] is invalid: {lambda}"
8472            )));
8473        }
8474        let r = &penalty.col_range;
8475        if r.end > p_total {
8476            return Err(EstimationError::InvalidInput(format!(
8477                "smooth_term_lr_inference: penalty[{idx}] range {:?} exceeds coefficient dimension {p_total}",
8478                r
8479            )));
8480        }
8481        let mut s_component = Array2::<f64>::zeros((p_total, p_total));
8482        s_component
8483            .slice_mut(s![r.start..r.end, r.start..r.end])
8484            .scaled_add(lambda, &penalty.local);
8485        components.push(gam_terms::inference::lawley::RhoPenaltyComponent { s_component });
8486    }
8487    Ok(components)
8488}
8489
8490/// The end-to-end per-term likelihood-ratio significance report for every
8491/// penalized (shape-unconstrained) smooth term in a fitted model, magically
8492/// Bartlett-corrected when the family carries closed-form Lawley cumulant jets
8493/// (#1063, follow-up to #939).
8494///
8495/// # Why an LR statistic (not the summary Wald)
8496///
8497/// The summary table's `wood_smooth_test` is Wood's rank-truncated **Wald**
8498/// statistic `T = β̂'Σ̂⁻β̂`. Lawley's ε corrects the **likelihood-ratio**
8499/// statistic, and under penalization the Wald form is already a weighted χ²
8500/// whose second-order mean is *not* `d + Δε` — dividing `T` by the LR factor
8501/// would correct the wrong statistic. The principled route (#1063 Option 1) is
8502/// to compute a real per-term LR statistic by a constrained refit and correct
8503/// *that*:
8504///
8505/// ```text
8506/// W = 2(ℓ_full − ℓ_null),   W* = W / c,   c = 1 + Δε/d,   p = P(χ²_d > W*).
8507/// ```
8508///
8509/// # Method
8510///
8511/// 1. Fit the full model and read `ℓ_full` and the per-term coefficient ranges /
8512///    EDF / influence block. The full design's column layout fixes the tested
8513///    block for the Lawley factor.
8514/// 2. For each penalized smooth term, refit a null model with that term dropped
8515///    from the spec; `W = max(2(ℓ_full − ℓ_null), 0)`.
8516/// 3. The reference d.f. `d` is the Wood truncation `tr(F)²/tr(F²)` on the
8517///    term's influence block (falling back to the term EDF) — the same `ref_df`
8518///    the summary Wald row reports.
8519/// 4. When the family has closed-form cumulant jets, evaluate Lawley's ε at the
8520///    **null** linear predictor (an expectation evaluated at the null fit), fold
8521///    the full λ-scaled penalty `S_λ` into the information, and Bartlett-correct
8522///    `W` with [`gam_terms::inference::lawley::lawley_lr_bartlett_factor`]. The
8523///    null annihilates the tested block's penalty (`S_λ β₀ = 0` on that block),
8524///    so the penalized Lawley expansion applies verbatim.
8525/// 5. Otherwise (no closed-form jets, or a null refit that did not converge) the
8526///    uncorrected `χ²_d` stands with provenance `none` — never weakened.
8527///
8528/// Random-effect smooths and shape-constrained smooths are skipped (their tests
8529/// are not a central-χ² LR), matching the summary table's policy.
8530pub fn smooth_term_lr_inference_forspec(
8531    data: ArrayView2<'_, f64>,
8532    y: ArrayView1<'_, f64>,
8533    weights: ArrayView1<'_, f64>,
8534    offset: ArrayView1<'_, f64>,
8535    resolvedspec: &TermCollectionSpec,
8536    family: LikelihoodSpec,
8537    options: &FitOptions,
8538) -> Result<Vec<SmoothTermLrInference>, EstimationError> {
8539    use gam_terms::inference::lawley::{
8540        LAWLEY_PAIR_MATRIX_MAX_ROWS, known_scale_expected_jets_with_dispersion,
8541        lawley_lr_bartlett_factor, lawley_lr_mean_shift_with_rho_variation,
8542    };
8543
8544    let n = data.nrows();
8545    // Full fit: ℓ_full, the per-term coefficient ranges/EDF/influence, and the
8546    // full design whose column layout fixes each tested block for Lawley.
8547    let full = fit_term_collection_forspec(
8548        data,
8549        y,
8550        weights,
8551        offset,
8552        resolvedspec,
8553        family.clone(),
8554        options,
8555    )?;
8556    let ll_full = full.fit.log_likelihood;
8557    let p_total = full.design.design.ncols();
8558    let lambdas = full.fit.lambdas.as_slice().ok_or_else(|| {
8559        EstimationError::InvalidInput(
8560            "smooth_term_lr_inference: non-contiguous lambda vector".to_string(),
8561        )
8562    })?;
8563    let s_lambda = weighted_blockwise_penalty_sum(&full.design.penalties, lambdas, p_total);
8564    let rho_penalty_components =
8565        fitted_rho_penalty_components(&full.design.penalties, lambdas, p_total)?;
8566    let rho_covariance = full.fit.artifacts.rho_covariance.as_ref().filter(|cov| {
8567        cov.nrows() == rho_penalty_components.len() && cov.ncols() == rho_penalty_components.len()
8568    });
8569    // Full design as a dense n×p array for the Lawley pair-matrix reduction.
8570    let full_design_dense = full.design.design.to_dense();
8571    let influence = full.fit.coefficient_influence();
8572    let family_disp = lawley_dispersion_for_family(&family, &full.fit);
8573
8574    // The penalty-block cursor walks the same block order the summary table
8575    // uses: random-effect ranges first (skipped here), then smooth terms.
8576    let mut penalty_cursor = full.design.random_effect_ranges.len();
8577    let mut out = Vec::<SmoothTermLrInference>::new();
8578    for (term_idx, design_term) in full.design.smooth.terms.iter().enumerate() {
8579        let k = design_term.penalties_local.len();
8580        let block_start = penalty_cursor;
8581        penalty_cursor += k;
8582        // Shape-constrained smooths get no central-χ² LR (cone-projected
8583        // boundary test); the summary table skips them too.
8584        if design_term.shape != ShapeConstraint::None {
8585            continue;
8586        }
8587        let coeff_range = design_term.coeff_range.clone();
8588        if coeff_range.start >= coeff_range.end || coeff_range.end > p_total {
8589            continue;
8590        }
8591        // Per-term EDF for the χ² reference df FALLBACK (used only when the
8592        // influence matrix `F` is unavailable). Route through `per_term_edf`,
8593        // which uses the ADDITIVE per-block trace channel
8594        // (`|coeff_range| − Σ_{kk∈term} tr_kk`) and caps at the model total,
8595        // rather than the raw `edf_by_block` block-sum `Σ_{kk}(rank_kk − tr_kk)`.
8596        // For a multi-penalty term (te/ti/double-penalty) the penalties share one
8597        // coefficient range, so the rank-based block-sum OVER-COUNTS the term EDF
8598        // (Σ rank_kk > |coeff_range|) and would inflate the LR reference df,
8599        // biasing the smooth-term test conservative on large/sparse fits where `F`
8600        // is not materialised. (Same per-block over-count class as the multinomial
8601        // `edf_per_class` fix.)
8602        let edf = full.fit.per_term_edf(coeff_range.clone(), block_start, k);
8603        let ref_df = wood_reference_df(influence, &coeff_range).unwrap_or(edf.max(1e-12));
8604        if !(ref_df.is_finite() && ref_df > 0.0) {
8605            continue;
8606        }
8607
8608        // Null model: drop this smooth term from the spec and refit. The term's
8609        // name pins which spec entry to remove (design and spec share names).
8610        let mut null_spec = resolvedspec.clone();
8611        let Some(spec_pos) = null_spec
8612            .smooth_terms
8613            .iter()
8614            .position(|t| t.name == design_term.name)
8615        else {
8616            continue;
8617        };
8618        null_spec.smooth_terms.remove(spec_pos);
8619        let null_fit = fit_term_collection_forspec(
8620            data,
8621            y,
8622            weights,
8623            offset,
8624            &null_spec,
8625            family.clone(),
8626            options,
8627        );
8628        let (statistic_lr, eta_null) = match null_fit {
8629            Ok(null) if null.fit.log_likelihood.is_finite() => {
8630                let w = (2.0 * (ll_full - null.fit.log_likelihood)).max(0.0);
8631                // η at the null fit: X_null β_null + offset (per-row linear
8632                // predictor; design-layout independent — Lawley reads it on the
8633                // full design rows).
8634                let mut eta = null.design.design.dot(&null.fit.beta);
8635                eta += &offset;
8636                (w, Some(eta))
8637            }
8638            _ => (f64::NAN, None),
8639        };
8640
8641        let chi2 = statrs::distribution::ChiSquared::new(ref_df).ok();
8642        let p_uncorrected = match (chi2.as_ref(), statistic_lr.is_finite()) {
8643            (Some(dist), true) => {
8644                use statrs::distribution::ContinuousCDF;
8645                (1.0 - dist.cdf(statistic_lr)).clamp(0.0, 1.0)
8646            }
8647            _ => f64::NAN,
8648        };
8649
8650        // Magic Bartlett correction: only when the LR statistic is finite, the
8651        // family has closed-form jets, n is in the resolvable regime, and the
8652        // factor is computable. Otherwise the uncorrected χ² stands.
8653        let mut bartlett_factor = 1.0;
8654        let mut bartlett_factor_conditional = None;
8655        let mut rho_variation_shift = None;
8656        let mut statistic_corrected = statistic_lr;
8657        let mut p_corrected = p_uncorrected;
8658        let mut correction = SmoothLrCorrection::None;
8659        if let (Some(eta), true, true) = (
8660            eta_null.as_ref(),
8661            statistic_lr.is_finite(),
8662            n <= LAWLEY_PAIR_MATRIX_MAX_ROWS,
8663        ) {
8664            let kappas: Option<Vec<_>> = (0..n)
8665                .map(|i| {
8666                    known_scale_expected_jets_with_dispersion(&family, eta[i], family_disp)
8667                        .and_then(|jets| jets.kappas().ok())
8668                })
8669                .collect();
8670            if let (Some(kappas), Some(dist)) = (kappas, chi2.as_ref()) {
8671                let fixed_factor = lawley_lr_bartlett_factor(
8672                    full_design_dense.view(),
8673                    &kappas,
8674                    Some(s_lambda.view()),
8675                    coeff_range.clone(),
8676                    ref_df,
8677                );
8678                if let Ok(c_cond) = fixed_factor
8679                    && c_cond.is_finite()
8680                    && c_cond > 0.0
8681                {
8682                    let mut c_applied = c_cond;
8683                    correction = SmoothLrCorrection::LawleyLrFixedLambda;
8684                    if let Some(cov) = rho_covariance
8685                        && let Ok(total_shift) = lawley_lr_mean_shift_with_rho_variation(
8686                            full_design_dense.view(),
8687                            &kappas,
8688                            s_lambda.view(),
8689                            coeff_range.clone(),
8690                            &rho_penalty_components,
8691                            cov.view(),
8692                        )
8693                    {
8694                        let mean_w = ref_df + total_shift;
8695                        if let Some(c_est) =
8696                            gam_terms::inference::higher_order::bartlett_factor_from_mean(
8697                                mean_w, ref_df,
8698                            )
8699                            && c_est.is_finite()
8700                            && c_est > 0.0
8701                        {
8702                            let conditional_shift = (c_cond - 1.0) * ref_df;
8703                            c_applied = c_est;
8704                            bartlett_factor_conditional = Some(c_cond);
8705                            rho_variation_shift = Some(total_shift - conditional_shift);
8706                            correction = SmoothLrCorrection::LawleyLrEstimatedLambda;
8707                        }
8708                    }
8709                    use statrs::distribution::ContinuousCDF;
8710                    bartlett_factor = c_applied;
8711                    statistic_corrected = statistic_lr / c_applied;
8712                    p_corrected = (1.0 - dist.cdf(statistic_corrected)).clamp(0.0, 1.0);
8713                }
8714            }
8715        }
8716
8717        // Materiality (#939 deliverable 4): only when a correction was actually
8718        // applied, flagged when it moves the result by more than the 10%
8719        // threshold — by the Bartlett factor's distance from one OR the relative
8720        // p-value shift, whichever is larger (a factor near one can still flip a
8721        // p-value sitting on the α boundary, and vice versa).
8722        let material = match correction {
8723            SmoothLrCorrection::LawleyLrEstimatedLambda
8724            | SmoothLrCorrection::LawleyLrFixedLambda => {
8725                let factor_move = (bartlett_factor - 1.0).abs();
8726                let p_denom = p_uncorrected.max(p_corrected).max(f64::MIN_POSITIVE);
8727                let p_move = if p_uncorrected.is_finite() && p_corrected.is_finite() {
8728                    (p_corrected - p_uncorrected).abs() / p_denom
8729                } else {
8730                    0.0
8731                };
8732                factor_move > SMOOTH_LR_MATERIAL_THRESHOLD || p_move > SMOOTH_LR_MATERIAL_THRESHOLD
8733            }
8734            SmoothLrCorrection::None => false,
8735        };
8736
8737        out.push(SmoothTermLrInference {
8738            name: design_term.name.clone(),
8739            term_idx,
8740            statistic_lr,
8741            ref_df,
8742            bartlett_factor,
8743            bartlett_factor_conditional,
8744            rho_variation_shift,
8745            statistic_corrected,
8746            p_value_uncorrected: p_uncorrected,
8747            p_value_corrected: p_corrected,
8748            material,
8749            correction,
8750        });
8751    }
8752    Ok(out)
8753}
8754
8755/// The dispersion `φ` Lawley needs for the family's cumulant scaling: Gaussian
8756/// `σ̂²`, Gamma `1/shape`, and `1` for the scale-free Poisson/Binomial.
8757fn lawley_dispersion_for_family(family: &LikelihoodSpec, fit: &UnifiedFitResult) -> f64 {
8758    match family.response {
8759        gam_spec::ResponseFamily::Gaussian => {
8760            let sd = fit.standard_deviation;
8761            (sd * sd).max(f64::MIN_POSITIVE)
8762        }
8763        gam_spec::ResponseFamily::Gamma => {
8764            let shape = fit.standard_deviation;
8765            if shape.is_finite() && shape > 0.0 {
8766                1.0 / shape
8767            } else {
8768                1.0
8769            }
8770        }
8771        _ => 1.0,
8772    }
8773}
8774
8775/// Wood's rank-corrected reference d.f. `tr(F_jj)² / tr(F_jj²)` on the
8776/// coefficient-influence block `F = H⁻¹ X'WX` restricted to `coeff_range`. This
8777/// is the same reference the summary Wald row uses, so the corrected LR and the
8778/// Wald test reference the *same* `χ²_d`. Returns `None` when the influence
8779/// block is unavailable or degenerate.
8780fn wood_reference_df(influence: Option<&Array2<f64>>, coeff_range: &Range<usize>) -> Option<f64> {
8781    let f = influence?;
8782    let (start, end) = (coeff_range.start, coeff_range.end);
8783    if start >= end || end > f.nrows() || end > f.ncols() {
8784        return None;
8785    }
8786    let block = f.slice(s![start..end, start..end]);
8787    let tr = (0..block.nrows()).map(|i| block[[i, i]]).sum::<f64>();
8788    let tr2 = block.dot(&block).diag().sum();
8789    (tr.is_finite() && tr2.is_finite() && tr > 0.0 && tr2 > 0.0).then(|| (tr * tr / tr2).max(1e-12))
8790}