Skip to main content

gam_models/bms/
block_specs.rs

1use super::family::*;
2use super::gradient_paths::*;
3use super::hessian_paths::{new_cell_moment_cache_stats, new_cell_moment_lru_cache};
4use super::install_flex::validate_spec;
5use super::*;
6use gam_linalg::faer_ndarray::{FaerEigh, fast_ab, fast_atb, fast_xt_diag_x};
7use crate::marginal_slope_orthogonal::influence_absorber_log_lambda;
8use faer::Side;
9
10/// Sup-norm of the FITTED marginal linear predictor `η = X·β` at which the
11/// probit model becomes numerically degenerate. This is the decisive
12/// separation quantity — raw coefficient magnitude is NOT, because the
13/// thin-plate / Duchon marginal bases are non-orthonormal and ill-conditioned,
14/// so a smooth, bounded fitted surface can carry large-and-cancelling
15/// coefficients (e.g. `β=[60,-60]` on two collinear columns yields a bounded
16/// `Xβ`). The probit information `φ(η)²/[Φ(η)(1−Φ(η))]` only collapses once
17/// `|η|` is enormous: `Φ(−35) ≈ 1e−268` is still representable, but by `|η|≈38`
18/// the tail probability underflows to exactly 0 in `f64` and the per-row
19/// Fisher weight vanishes. We trip the guard at 35 — comfortably inside the
20/// representable range yet far beyond any legitimately fitted predictor (a
21/// converged penalized probit surface keeps `|η|` at single/low-double digits),
22/// so a true separating direction (whose `|η|→∞`) still trips it while a
23/// well-fitted ill-conditioned surface does not.
24pub(crate) const BMS_PROBIT_SEPARATION_ETA_INF: f64 = 35.0;
25
26// ── Canonical-gauge priority ladder (issue #322) ─────────────────────────────
27//
28// The priority-ordered RRQR in `canonicalize_for_identifiability` presents
29// higher-priority blocks first and routes any shared cross-block alias drop
30// into the lowest-priority block that still spans the aliased direction. The
31// values below form a single ordered ladder so the relationships that the
32// architecture depends on (anchors > parametric surfaces > flex deviations,
33// marginal > logslope, score_warp > link_dev) are expressed once here rather
34// than re-derived from comments at each `gauge_priority:` site. The ladder
35// mirrors the survival marginal-slope entry (time=200 / marginal=150 /
36// logslope=120 / score_warp=80 / link_dev=60).
37
38/// Audit-only anchor blocks sit at the top of the ladder so the candidate
39/// flex block always yields to them in the cross-block identifiability audit.
40pub(super) const GAUGE_PRIORITY_ANCHOR: u8 = 200;
41/// Marginal surface: strictly above the logslope surface so a shared affine
42/// direction is demoted out of logslope, never out of marginal.
43pub(super) const GAUGE_PRIORITY_MARGINAL: u8 = 150;
44/// Logslope surface: one rung below the marginal surface.
45pub(super) const GAUGE_PRIORITY_LOGSLOPE: u8 = 120;
46/// Candidate flex block under audit: below every parametric anchor so the
47/// audit demotes the candidate when it aliases an anchor.
48pub(super) const GAUGE_PRIORITY_CANDIDATE_FLEX: u8 = 100;
49/// `score_warp_dev`: above `link_dev` because in mixed-flex configurations
50/// link_dev is the residualised block and should yield first.
51pub(super) const GAUGE_PRIORITY_SCORE_WARP_DEV: u8 = 80;
52/// Default for any deviation auxiliary block not otherwise named; below the
53/// parametric default so shared affine directions never demote a parametric
54/// block.
55pub(super) const GAUGE_PRIORITY_DEVIATION_DEFAULT: u8 = 70;
56/// `link_dev`: lowest rung, yields first among the flex deviation blocks.
57pub(super) const GAUGE_PRIORITY_LINK_DEV: u8 = 60;
58
59/// Floor on the relative outer tolerance used by the exact-joint spatial
60/// length-scale optimiser. The user's `rel_tol` drives most of the fit, but
61/// the exact spatial outer loop is a coarse 1-D search over a smooth profiled
62/// objective; tightening it below this floor only burns cycles on noise in the
63/// inner-solve-reported objective without moving the selected length scale.
64pub(crate) const EXACT_SPATIAL_OUTER_TOL_FLOOR: f64 = 1e-6;
65
66// ── BlockEffectiveJacobian impls for BMS ─────────────────────────────────────
67//
68// BMS has a single Bernoulli output per row (n_outputs = 1). The observed η is
69//
70//   η_i = q_i · c_i + s·g_i · z_i
71//
72// where
73//   q_i   = marginal_design[i,:] · β_m + offset_m[i]      (marginal η)
74//   g_i   = logslope_design[i,:] · β_s + offset_s[i]      (log-slope η)
75//   s     = probit_frailty_scale(gaussian_frailty_sd)
76//   c_i   = sqrt(1 + (s·g_i)²)
77//
78// Per-block Jacobians ∂η_i / ∂β_block:
79//
80//   Marginal block  → ∂η_i/∂β_m = c_i · M_i
81//     (M_i = marginal_design row i; c_i is β-dependent but does not involve β_m)
82//
83//   Logslope block  → ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G_i
84//     (G_i = logslope_design row i)
85//
86// score_warp_dev and link_dev blocks use IFT-corrected η, but their
87// contribution to the identifiability audit is captured by the raw design
88// columns (the IFT correction adds a direction already in the anchor span at
89// compile time). These blocks leave jacobian_callback = None and rely on
90// effective_design (= raw design) for the flat audit.
91
92/// β-dependent Jacobian for the BMS marginal block.
93///
94/// ∂η_i/∂β_m = c_i · M[i,:]
95/// where c_i = sqrt(1 + (s · g_i)²),
96///       g_i = G[i,:] · β_s + offset_s[i],
97///       s   = state.probit_frailty_scale.
98///
99/// `probit_frailty_scale` is read from the evaluation state at call time (not
100/// captured at construction) so the callback remains correct across outer-loop
101/// σ updates without rebuilding the block spec.
102///
103/// Designs are pre-densified at construction to avoid repeated materialisation.
104pub struct BmsMarginalJacobian {
105    /// Dense marginal design: n × p_m.
106    pub marginal_dense: Arc<Array2<f64>>,
107    /// Dense logslope design: n × p_s.
108    pub logslope_dense: Arc<Array2<f64>>,
109    pub offset_m: Array1<f64>,
110    pub offset_s: Array1<f64>,
111    /// Number of marginal columns (= size of β_m slice in the full β vector).
112    pub p_marginal: usize,
113}
114
115impl BmsMarginalJacobian {
116    pub fn new(
117        marginal_dense: Arc<Array2<f64>>,
118        logslope_dense: Arc<Array2<f64>>,
119        offset_m: Array1<f64>,
120        offset_s: Array1<f64>,
121        p_marginal: usize,
122    ) -> Self {
123        Self {
124            marginal_dense,
125            logslope_dense,
126            offset_m,
127            offset_s,
128            p_marginal,
129        }
130    }
131}
132
133impl BlockEffectiveJacobian for BmsMarginalJacobian {
134    fn effective_jacobian_rows(
135        &self,
136        state: &FamilyLinearizationState<'_>,
137        rows: std::ops::Range<usize>,
138    ) -> Result<Array2<f64>, String> {
139        let beta = state.beta;
140        let s = state.probit_frailty_scale;
141        let p_m = self.p_marginal;
142        let p_s_block = self.logslope_dense.ncols();
143        let beta_s_raw = if beta.len() > p_m {
144            &beta[p_m..]
145        } else {
146            &[][..]
147        };
148        let p_s_use = p_s_block.min(beta_s_raw.len());
149        let beta_s = &beta_s_raw[..p_s_use];
150        let n = self.marginal_dense.nrows();
151        let rows = rows.start.min(n)..rows.end.min(n);
152        let p_block = self.marginal_dense.ncols();
153
154        // ∂η_i/∂β_m = c_i · M[i,:], with c_i = sqrt(1 + (s·g_i)²) and
155        //   g_i = G[i, :p_s_use] · β_s + offset_s[i].
156        //
157        // This block owns the logslope design and offset, so g_i — and hence
158        // c_i — is fully self-computable at the current β for every row,
159        // including β = 0 (where g_i = offset_s[i], which carries the fitted
160        // logslope baseline and is generically nonzero).  There is no external
161        // scalar this block cannot reconstruct, so the Jacobian is evaluated
162        // directly from owned data with no caller-supplied contract.
163        let mut out = Array2::<f64>::zeros((rows.end - rows.start, p_block));
164        for i in rows.clone() {
165            let g_i = self.offset_s[i]
166                + self
167                    .logslope_dense
168                    .row(i)
169                    .slice(ndarray::s![..p_s_use])
170                    .dot(&ArrayView1::from(beta_s));
171            let sg = s * g_i;
172            let c_i = (1.0 + sg * sg).sqrt();
173            // J[i,:] = c_i · M[i,:]
174            let m_row = self.marginal_dense.row(i);
175            out.row_mut(i - rows.start).assign(&m_row.mapv(|x| c_i * x));
176        }
177        Ok(out)
178    }
179
180    fn n_outputs(&self) -> usize {
181        1
182    }
183}
184
185/// β-dependent Jacobian for the BMS logslope block.
186///
187/// ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G[i,:]
188/// where q_i = M[i,:] · β_m + offset_m[i],
189///       g_i = G[i,:] · β_s + offset_s[i],
190///       c_i = sqrt(1 + (s·g_i)²),
191///       s   = state.probit_frailty_scale.
192///
193/// `probit_frailty_scale` is read from the evaluation state at call time.
194///
195/// Designs are pre-densified at construction to avoid repeated materialisation.
196pub struct BmsLogslopeJacobian {
197    /// Dense marginal design: n × p_m.
198    pub marginal_dense: Arc<Array2<f64>>,
199    /// Dense logslope design: n × p_s.
200    pub logslope_dense: Arc<Array2<f64>>,
201    pub offset_m: Array1<f64>,
202    pub offset_s: Array1<f64>,
203    pub z: Arc<Array1<f64>>,
204    /// Number of marginal columns (= start of β_s in the full β vector).
205    pub p_marginal: usize,
206}
207
208impl BmsLogslopeJacobian {
209    pub fn new(
210        marginal_dense: Arc<Array2<f64>>,
211        logslope_dense: Arc<Array2<f64>>,
212        offset_m: Array1<f64>,
213        offset_s: Array1<f64>,
214        z: Arc<Array1<f64>>,
215        p_marginal: usize,
216    ) -> Self {
217        Self {
218            marginal_dense,
219            logslope_dense,
220            offset_m,
221            offset_s,
222            z,
223            p_marginal,
224        }
225    }
226}
227
228impl BlockEffectiveJacobian for BmsLogslopeJacobian {
229    fn effective_jacobian_rows(
230        &self,
231        state: &FamilyLinearizationState<'_>,
232        rows: std::ops::Range<usize>,
233    ) -> Result<Array2<f64>, String> {
234        let beta = state.beta;
235        let s = state.probit_frailty_scale;
236        let p_m = self.p_marginal;
237        let p_m_use = p_m.min(beta.len());
238        let beta_m = &beta[..p_m_use];
239        let beta_s_raw = if beta.len() > p_m {
240            &beta[p_m..]
241        } else {
242            &[][..]
243        };
244        let p_s_block = self.logslope_dense.ncols();
245        let p_s_use = p_s_block.min(beta_s_raw.len());
246        let beta_s = &beta_s_raw[..p_s_use];
247        let n = self.logslope_dense.nrows();
248        let rows = rows.start.min(n)..rows.end.min(n);
249
250        // ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G[i,:] where
251        //   q_i = M[i,:] · β_m + offset_m[i],
252        //   g_i = G[i,:] · β_s + offset_s[i],
253        //   c_i = sqrt(1 + (s·g_i)²),
254        //   z_i = self.z[i].
255        //
256        // This block owns the marginal design, logslope design, both offsets,
257        // and z, so q_i, g_i, c_i, z_i are all closed-form functions of the
258        // current β and owned data — for every row at every β, including β = 0
259        // (where g_i = offset_s[i] carries the nonzero fitted logslope
260        // baseline).  The Jacobian is therefore evaluated directly from owned
261        // data with no caller-supplied scalar contract.
262        let mut out = Array2::<f64>::zeros((rows.end - rows.start, p_s_block));
263        for i in rows.clone() {
264            let q_i = self.offset_m[i]
265                + self
266                    .marginal_dense
267                    .row(i)
268                    .slice(ndarray::s![..p_m_use])
269                    .dot(&ArrayView1::from(beta_m));
270            let g_i = self.offset_s[i]
271                + self
272                    .logslope_dense
273                    .row(i)
274                    .slice(ndarray::s![..p_s_use])
275                    .dot(&ArrayView1::from(beta_s));
276            let sg = s * g_i;
277            let c_i = (1.0 + sg * sg).sqrt();
278            let z_i = self.z[i];
279            // per-row scalar factor: q_i · s²·g_i / c_i + s·z_i
280            let factor = q_i * s * s * g_i / c_i + s * z_i;
281            // J[i,:] = factor · G[i,:]
282            let g_row = self.logslope_dense.row(i);
283            out.row_mut(i - rows.start)
284                .assign(&g_row.mapv(|x| factor * x));
285        }
286        Ok(out)
287    }
288
289    fn n_outputs(&self) -> usize {
290        1
291    }
292}
293
294/// Horizontally stack the absorbed influence columns `Z̃_infl` onto the raw
295/// marginal design `M`, yielding the widened additive marginal-index design
296/// `[M | Z̃_infl]` (#461). When `influence_columns` is `None` the original
297/// dense design is returned unchanged. The influence columns shift the marginal
298/// index `α(x)` additively, so the de-nested probit kernel — which reads the
299/// marginal index from `block_states[0].eta` and reconstructs `∂q/∂β_m` from
300/// `self.marginal_design` (a matched (design, β) pair) — picks them up with no
301/// kernel-site change; the widened `p_marginal` keeps every per-row Jacobian /
302/// gradient / Hessian projection consistent.
303pub(crate) fn widen_marginal_dense_with_influence(
304    marginal_dense: &Arc<Array2<f64>>,
305    influence_columns: Option<&Array2<f64>>,
306) -> Result<Arc<Array2<f64>>, String> {
307    let Some(z_infl) = influence_columns else {
308        return Ok(Arc::clone(marginal_dense));
309    };
310    let n = marginal_dense.nrows();
311    if z_infl.nrows() != n {
312        return Err(format!(
313            "influence block: residualised columns have {} rows, marginal design has {n}",
314            z_infl.nrows()
315        ));
316    }
317    let p_m = marginal_dense.ncols();
318    let p1 = z_infl.ncols();
319    let mut widened = Array2::<f64>::zeros((n, p_m + p1));
320    widened
321        .slice_mut(s![.., ..p_m])
322        .assign(marginal_dense.as_ref());
323    widened.slice_mut(s![.., p_m..]).assign(z_infl);
324    Ok(Arc::new(widened))
325}
326
327/// Tolerance (relative to the dominant retained eigenvalue) below which a
328/// reduced-basis direction of the W-orthogonalised effective logslope Gram is
329/// treated as a confounded null direction and dropped. Directions whose
330/// effective weighted image is (near-)explained by the marginal span collapse to
331/// ~0 eigenvalue in `Gtt` (see [`build_reduced_logslope_reparam`]); this keeps
332/// the cut well above floating-point noise but well below any genuine surviving
333/// logslope curvature.
334pub(crate) const LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL: f64 = 1.0e-6;
335
336/// An exact reduced-basis reparameterization of the BMS logslope design through
337/// the family's OWN internal `logslope_design` geometry, expressed as a single
338/// linear map `T` (`p_logslope × r`, `r ≤ p_logslope`).
339///
340/// # Why a reduced basis (not a dense design swap)
341///
342/// The structural confound is that the score-weighted logslope channel
343/// `diag(factor)·G·β_s` overlaps the effective marginal channel `diag(c)·M·β_m`
344/// in the PIRLS row metric `W`, leaving the joint penalised Hessian rank-soft
345/// along the shared direction. The shared-solver primitive
346/// [`OrthogonalReparam`](gam_solve::orthogonal_reparam::OrthogonalReparam)
347/// forms `C̃ = C − M·B`, exactly W-orthogonal to `span(M)` — but `C̃` is a dense
348/// design the BMS family's row kernel does NOT consume: the family reads
349/// `η_logslope = G·β_s` from its own `logslope_design` and reconstructs the
350/// per-row Jacobian `factor_i · G_i` from that same matrix. A block-level design
351/// swap is therefore ignored by the family, and feeding a rank-deficient `C̃` at
352/// full width desynchronises the inner identifiable-subspace reduction from the
353/// stored design width.
354///
355/// This builds instead a TRUE reparameterization the family consumes: a
356/// full-rank reduced logslope design `G_reduced = G·T` (width `r`) plus the
357/// penalty projection `S_reduced = Tᵀ S T`. The map is constructed so that the
358/// directions of raw logslope coefficient space whose effective weighted image
359/// is W-explained by the marginal span are removed (they carry ~zero curvature
360/// in the W-orthogonalised effective Gram), and the surviving `r` directions are
361/// full-rank.
362///
363/// # The math
364///
365/// At the rigid pilot, the effective Jacobians are
366///
367/// ```text
368///     M_eff = diag(c) · M        (n × p_m),   c_i = sqrt(1 + (s·g_i)²)
369///     G_eff = diag(f) · G        (n × p_g),   f_i = q_i·s²·g_i/c_i + s·z_i
370/// ```
371///
372/// In the row metric `W` the component of the effective logslope design that is
373/// W-orthogonal to `span(M_eff)` has the raw-coordinate Gram
374///
375/// ```text
376///     Gtt = G_effᵀ W G_eff − (G_effᵀ W M_eff)(M_effᵀ W M_eff + εI)⁻¹(M_effᵀ W G_eff)
377/// ```
378///
379/// (a `p_g × p_g` PSD matrix in the raw logslope coefficient coordinates). Its
380/// range = the logslope directions that survive the confound removal; its null
381/// space = the confounded directions absorbed by the marginal span. The reduced
382/// transform `T` is the orthonormal eigenbasis of `Gtt` for eigenvalues above a
383/// relative tolerance; `r = rank(Gtt)`. The new design `G_reduced = G·T`, the
384/// reparameterized penalty `S_reduced = Tᵀ S T`, and the round-trip
385/// `β_logslope = T·β'` make the family's geometry consistent at width `r` and
386/// recover the original-basis logslope coefficients for prediction/reporting.
387#[derive(Debug, Clone)]
388pub(super) struct ReducedLogslopeReparam {
389    /// Reduced transform `T` (`p_logslope × r`). `G_reduced = G·T`,
390    /// `β_logslope = T·β'`, `S_reduced = Tᵀ S T`.
391    transform: Array2<f64>,
392}
393
394impl ReducedLogslopeReparam {
395    /// Original (full) logslope width `p_logslope`.
396    #[inline]
397    pub(super) fn original_cols(&self) -> usize {
398        self.transform.nrows()
399    }
400
401    /// Reduced width `r`.
402    #[inline]
403    pub(super) fn reduced_cols(&self) -> usize {
404        self.transform.ncols()
405    }
406
407    /// Map a reduced-basis logslope coefficient `β'` (length `r`) back to the
408    /// original logslope basis `β_logslope = T·β'` (length `p_logslope`), so
409    /// prediction/reporting are unchanged-in-meaning.
410    pub(super) fn recover_original_logslope_beta(
411        &self,
412        beta_reduced: &Array1<f64>,
413    ) -> Result<Array1<f64>, String> {
414        if beta_reduced.len() != self.reduced_cols() {
415            return Err(format!(
416                "reduced logslope reparam: β' length ({}) != reduced width ({})",
417                beta_reduced.len(),
418                self.reduced_cols()
419            ));
420        }
421        Ok(self.transform.dot(beta_reduced))
422    }
423}
424
425/// Build the reduced-basis logslope reparameterization (see
426/// [`ReducedLogslopeReparam`]) from the rigid-pilot EFFECTIVE Jacobian geometry,
427/// in the PIRLS row metric `W`. Extracts the dense pilot designs and delegates
428/// the geometry to [`reduced_logslope_transform_effective`]. Returns `Ok(None)`
429/// when there is no logslope/marginal span, no effective-confounded direction to
430/// remove (`r == p_g`), or the whole effective logslope image is in the effective
431/// marginal span (`r == 0`); in those cases the caller keeps the raw design.
432fn build_reduced_logslope_reparam(
433    marginal_design: &TermCollectionDesign,
434    logslope_design: &TermCollectionDesign,
435    z: &Array1<f64>,
436    row_metric: &Array1<f64>,
437    marginal_offset: &Array1<f64>,
438    logslope_offset: &Array1<f64>,
439    marginal_baseline: f64,
440    logslope_baseline: f64,
441    probit_scale: f64,
442) -> Result<Option<ReducedLogslopeReparam>, String> {
443    let marginal = marginal_design
444        .design
445        .try_to_dense_arc("build_reduced_logslope_reparam::marginal")?;
446    let logslope = logslope_design
447        .design
448        .try_to_dense_arc("build_reduced_logslope_reparam::logslope")?;
449    let n = marginal.nrows();
450    if logslope.nrows() != n
451        || z.len() != n
452        || row_metric.len() != n
453        || marginal_offset.len() != n
454        || logslope_offset.len() != n
455    {
456        return Err(format!(
457            "reduced logslope reparam row mismatch: marginal={}, logslope={}, z={}, row_metric={}, marginal_offset={}, logslope_offset={}",
458            marginal.nrows(),
459            logslope.nrows(),
460            z.len(),
461            row_metric.len(),
462            marginal_offset.len(),
463            logslope_offset.len(),
464        ));
465    }
466    let p_m = marginal.ncols();
467    let p_g = logslope.ncols();
468    if p_m == 0 || p_g == 0 {
469        return Ok(None);
470    }
471    if !marginal_baseline.is_finite()
472        || !logslope_baseline.is_finite()
473        || !probit_scale.is_finite()
474        || probit_scale <= 0.0
475        || z.iter().any(|v| !v.is_finite())
476        || row_metric.iter().any(|v| !v.is_finite() || *v < 0.0)
477        || marginal_offset.iter().any(|v| !v.is_finite())
478        || logslope_offset.iter().any(|v| !v.is_finite())
479    {
480        return Err(
481            "reduced logslope reparam requires finite pilot geometry and finite non-negative row metric"
482                .to_string(),
483        );
484    }
485
486    // The joint Hessian the inner solve factorises is built from the EFFECTIVE
487    // BMS Jacobians, not the raw design. At the rigid pilot,
488    //   ∂η_i/∂β_m = c_i · M_i,   c_i = sqrt(1 + (s·g_i)²)
489    //   ∂η_i/∂β_s = f_i · G_i,   f_i = q_i·s²·g_i/c_i + s·z_i
490    // so a raw logslope direction `v` is rank-soft in the joint Hessian iff its
491    // EFFECTIVE image `diag(f)·G·v` is W-explained by `span(diag(c)·M)` — NOT iff
492    // raw `G·v` is W-explained by `span(M)`. Auditing the raw design removes the
493    // wrong directions; the reduced basis is built from the effective Schur Gram.
494    // The pure-array geometry lives in `reduced_logslope_transform_effective` so
495    // it can be unit-tested directly against the raw-vs-effective counterexample.
496    match reduced_logslope_transform_effective(
497        marginal.view(),
498        logslope.view(),
499        z,
500        row_metric,
501        marginal_offset,
502        logslope_offset,
503        marginal_baseline,
504        logslope_baseline,
505        probit_scale,
506    )? {
507        Some(transform) => Ok(Some(ReducedLogslopeReparam { transform })),
508        None => Ok(None),
509    }
510}
511
512/// Build the reduced logslope basis `T` (p_g × r) from the EFFECTIVE BMS pilot
513/// geometry, in the PIRLS row metric `W`. `T`'s columns span the raw logslope
514/// coefficient directions whose effective image `diag(f)·G·v` is NOT W-explained
515/// by `span(diag(c)·M)` — i.e. the directions the joint Hessian retains real
516/// curvature along. Returns `Ok(None)` when there is nothing to reduce
517/// (`r == p_g`) or when the entire effective logslope image collapses into the
518/// effective marginal span (`r == 0`); in both cases the caller keeps the raw
519/// design (a zero-width reduction would silently delete the score-effect surface,
520/// which is the estimand — the REML penalty regularises any residual softness).
521///
522/// At the rigid pilot the effective Jacobians are
523///     M_eff = diag(c) · M,   c_i = sqrt(1 + (s·g_i)²)
524///     G_eff = diag(f) · G,   f_i = q_i·s²·g_i/c_i + s·z_i
525/// and the raw-coordinate Gram of the logslope component W-orthogonal to
526/// `span(M_eff)` is the Schur complement
527///     Gtt = G_effᵀ W G_eff − (G_effᵀ W M_eff)(M_effᵀ W M_eff + εI)⁻¹(M_effᵀ W G_eff).
528/// `T` is the orthonormal eigenbasis of `Gtt` for eigenvalues above a tolerance
529/// relative to the effective logslope energy scale.
530pub(crate) fn reduced_logslope_transform_effective(
531    marginal: ArrayView2<'_, f64>,
532    logslope: ArrayView2<'_, f64>,
533    z: &Array1<f64>,
534    row_metric: &Array1<f64>,
535    marginal_offset: &Array1<f64>,
536    logslope_offset: &Array1<f64>,
537    marginal_baseline: f64,
538    logslope_baseline: f64,
539    probit_scale: f64,
540) -> Result<Option<Array2<f64>>, String> {
541    let n = marginal.nrows();
542    let p_m = marginal.ncols();
543    let p_g = logslope.ncols();
544    if p_m == 0 || p_g == 0 {
545        return Ok(None);
546    }
547
548    // Effective pilot Jacobians M_eff = diag(c)·M and G_eff = diag(f)·G.
549    let mut m_eff = Array2::<f64>::zeros((n, p_m));
550    let mut g_eff = Array2::<f64>::zeros((n, p_g));
551    for i in 0..n {
552        let q_i = marginal_offset[i] + marginal_baseline;
553        let g_i = logslope_offset[i] + logslope_baseline;
554        let sg = probit_scale * g_i;
555        let c_i = (1.0 + sg * sg).sqrt();
556        let f_i = q_i * probit_scale * probit_scale * g_i / c_i + probit_scale * z[i];
557        for j in 0..p_m {
558            m_eff[[i, j]] = c_i * marginal[[i, j]];
559        }
560        for j in 0..p_g {
561            g_eff[[i, j]] = f_i * logslope[[i, j]];
562        }
563    }
564
565    // C = G_effᵀ W G_eff (raw-coordinate effective logslope Gram); its diagonal
566    // sets the energy scale for the relative kept-direction tolerance.
567    let c_gram = fast_xt_diag_x(&g_eff, row_metric);
568    let energy_scale = (0..p_g).map(|i| c_gram[[i, i]]).fold(0.0_f64, f64::max);
569    if !energy_scale.is_finite() || energy_scale <= 0.0 {
570        return Ok(None);
571    }
572
573    // A = M_effᵀ W M_eff + εI (ridge relative to the marginal effective energy so
574    // the Schur solve is well-posed even when the marginal pilot Gram is
575    // rank-soft; the ridge only under-removes, i.e. is conservative).
576    let mut a_gram = fast_xt_diag_x(&m_eff, row_metric);
577    let a_scale = (0..p_m).map(|i| a_gram[[i, i]]).fold(0.0_f64, f64::max);
578    let a_ridge = (a_scale * LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL).max(f64::EPSILON);
579    for i in 0..p_m {
580        a_gram[[i, i]] += a_ridge;
581    }
582
583    // B = M_effᵀ W G_eff (p_m × p_g);  Gtt = C − Bᵀ A⁻¹ B (p_g × p_g, PSD).
584    let b_cross = gam_linalg::faer_ndarray::fast_xt_diag_y(&m_eff, row_metric, &g_eff);
585    let a_view = gam_linalg::faer_ndarray::FaerArrayView::new(&a_gram);
586    let a_factor =
587        gam_linalg::faer_ndarray::factorize_symmetricwith_fallback(a_view.as_ref(), Side::Lower)
588            .map_err(|e| {
589                format!(
590                    "reduced logslope reparam: effective marginal Gram factorization failed: {e}"
591                )
592            })?;
593    let b_view = gam_linalg::faer_ndarray::FaerArrayView::new(&b_cross);
594    let solved = a_factor.solve(b_view.as_ref()); // A⁻¹ B  (p_m × p_g)
595    let a_inv_b = Array2::from_shape_fn((p_m, p_g), |(i, j)| solved[(i, j)]);
596    let schur = fast_atb(&b_cross, &a_inv_b); // Bᵀ A⁻¹ B  (p_g × p_g)
597    let mut stt = &c_gram - &schur;
598    stt = (&stt + &stt.t()) * 0.5;
599    if stt.iter().any(|v| !v.is_finite()) {
600        return Err(
601            "reduced logslope reparam: effective Schur Gram produced non-finite entries"
602                .to_string(),
603        );
604    }
605
606    let (evals, evecs) = stt
607        .eigh(Side::Lower)
608        .map_err(|e| format!("reduced logslope reparam: eigendecomposition failed: {e:?}"))?;
609    // A `Gtt` eigenvalue far below the effective logslope energy scale means that
610    // direction's effective logslope column is W-explained by the effective
611    // marginal span — exactly the joint-Hessian rank-soft confounded direction.
612    let tol = energy_scale * LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL;
613    let mut kept: Vec<usize> = (0..evals.len()).filter(|&i| evals[i] > tol).collect();
614    kept.sort_by(|&a, &b| {
615        evals[b]
616            .partial_cmp(&evals[a])
617            .unwrap_or(std::cmp::Ordering::Equal)
618    });
619    let r = kept.len();
620    // r == p_g: no effective-confounded direction to remove. r == 0: the whole
621    // effective logslope image is in the effective marginal span. In both cases
622    // install no transform (see fn-level doc) and keep the raw design.
623    if r == p_g || r == 0 {
624        return Ok(None);
625    }
626    let mut transform = Array2::<f64>::zeros((p_g, r));
627    for (out_col, &src) in kept.iter().enumerate() {
628        transform.column_mut(out_col).assign(&evecs.column(src));
629    }
630    if transform.iter().any(|v| !v.is_finite()) {
631        return Err(
632            "reduced logslope reparam: reduced transform produced non-finite entries".to_string(),
633        );
634    }
635    Ok(Some(transform))
636}
637
638/// Apply a [`ReducedLogslopeReparam`] to a logslope `TermCollectionDesign`,
639/// producing a new design at the reduced width `r`: the design becomes
640/// `G_reduced = G·T`, and every blockwise penalty `S` is reparameterized to
641/// `S_reduced = Tᵀ S T` over the full reduced column range `0..r`. The reduced
642/// penalty's null space is recomputed from its numerical rank so the REML
643/// log-determinant accounting stays consistent at the reduced width.
644fn reparameterize_logslope_design_reduced(
645    logslope_design: &TermCollectionDesign,
646    reparam: &ReducedLogslopeReparam,
647) -> Result<TermCollectionDesign, String> {
648    let g = logslope_design
649        .design
650        .try_to_dense_arc("reparameterize_logslope_design_reduced::logslope")?;
651    let p_g = g.ncols();
652    if p_g != reparam.original_cols() {
653        return Err(format!(
654            "reduced logslope reparam width mismatch: design has {p_g} cols, transform expects {}",
655            reparam.original_cols()
656        ));
657    }
658    let t = &reparam.transform;
659    let r = reparam.reduced_cols();
660    // G_reduced = G·T   (n × r).
661    let g_reduced = fast_ab(&g, t);
662
663    // Reparameterize each penalty: embed its local block at full width p_g, then
664    // form S_reduced = Tᵀ S T (r × r) over the whole reduced column range.
665    let mut new_penalties: Vec<gam_terms::smooth::BlockwisePenalty> =
666        Vec::with_capacity(logslope_design.penalties.len());
667    let mut new_nullspace_dims: Vec<usize> = Vec::with_capacity(logslope_design.penalties.len());
668    for bp in &logslope_design.penalties {
669        let mut full = Array2::<f64>::zeros((p_g, p_g));
670        full.slice_mut(s![bp.col_range.clone(), bp.col_range.clone()])
671            .assign(&bp.local);
672        // S_reduced = Tᵀ (S) T.
673        let st = fast_ab(&full, t); // p_g × r
674        let mut s_reduced = fast_atb(t, &st); // r × r
675        s_reduced = (&s_reduced + &s_reduced.t()) * 0.5;
676        // Null-space dimension of the reduced penalty = r − rank(S_reduced).
677        let (evals, _) = s_reduced
678            .eigh(Side::Lower)
679            .map_err(|e| format!("reduced logslope penalty eigendecomposition failed: {e:?}"))?;
680        let max_eval = evals.iter().fold(0.0_f64, |acc, &v| acc.max(v.abs()));
681        let pen_tol = (max_eval * 1.0e-12).max(f64::EPSILON);
682        let rank = evals.iter().filter(|&&v| v.abs() > pen_tol).count();
683        let nullspace_dim = r.saturating_sub(rank);
684        new_penalties.push(gam_terms::smooth::BlockwisePenalty::new(0..r, s_reduced));
685        new_nullspace_dims.push(nullspace_dim);
686    }
687
688    let new_design = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(g_reduced));
689    // The reduced logslope block is a single dense smooth-like surface over the
690    // reparameterized coordinates; it carries no parametric/random-effect/
691    // intercept structure of its own (those live in the marginal block), so the
692    // structural ranges collapse to empty and the smooth metadata is cleared.
693    // The penalties + nullspace_dims above are what the joint REML consumes.
694    Ok(TermCollectionDesign {
695        design: new_design,
696        penalties: new_penalties,
697        nullspace_dims: new_nullspace_dims,
698        penaltyinfo: Vec::new(),
699        dropped_penaltyinfo: Vec::new(),
700        coefficient_lower_bounds: None,
701        linear_constraints: None,
702        intercept_range: 0..0,
703        linear_ranges: Vec::new(),
704        random_effect_ranges: Vec::new(),
705        random_effect_levels: Vec::new(),
706        smooth: gam_terms::smooth::SmoothDesign {
707            term_designs: Vec::new(),
708            penalties: Vec::new(),
709            nullspace_dims: Vec::new(),
710            penaltyinfo: Vec::new(),
711            dropped_penaltyinfo: Vec::new(),
712            terms: Vec::new(),
713            coefficient_lower_bounds: None,
714            linear_constraints: None,
715        },
716    })
717}
718
719/// Re-embed the term-collection marginal penalties at the (possibly widened)
720/// block dimension `p_m [+ p₁]`, then append the #461 fixed-ridge absorber:
721///
722///  (#461, only with influence columns) the fixed-ridge absorber identity on
723///  the influence columns `p_m..p_m+p₁`.
724///
725/// The two former gam#754 pinned ridges — the marginal nullspace-shrinkage ridge
726/// and the marginal↔logslope overlap ridge — are DELETED: robustness is now
727/// unconditional, so the full-identifiable-span Jeffreys term (`Z_J = I`, see
728/// `jeffreys_subspace_from_penalty`) supplies automatic O(n)-scaled curvature on
729/// every under-identified direction (subsuming the nullspace ridge), and the
730/// exact orthogonal reparameterization of the logslope design (now unconditional,
731/// see `build_reduced_logslope_reparam`) resolves the marginal↔logslope confound
732/// by construction (subsuming the overlap ridge).
733///
734/// The genuine marginal smooth penalties keep their `col_range` (marginal
735/// columns stay in `0..p_m`). Returns `(penalties, nullspace_dims,
736/// initial_log_lambdas)` to install on the marginal block. Fixed ridges remain
737/// in this physical penalty layout and are removed from every REML/outer
738/// coordinate vector by [`PenaltyMatrix::Fixed`].
739pub(crate) fn marginal_penalties_with_influence_ridge(
740    design: &TermCollectionDesign,
741    rho_marginal: &Array1<f64>,
742    influence_columns: Option<&Array2<f64>>,
743    influence_ridge_log_lambda: f64,
744) -> Result<(Vec<PenaltyMatrix>, Vec<usize>, Array1<f64>), String> {
745    let p_m = design.design.ncols();
746    let p1 = influence_columns.map(|z| z.ncols()).unwrap_or(0);
747    let total_dim = p_m + p1;
748    // Re-embed each marginal penalty at the (widened) total dimension (col_range
749    // unchanged: marginal columns remain 0..p_m).
750    let mut penalties: Vec<PenaltyMatrix> = design
751        .penalties
752        .iter()
753        .map(|bp| bp.to_penalty_matrix(total_dim))
754        .collect();
755    let mut nullspace_dims = design.nullspace_dims.clone();
756    let mut log_lambdas = rho_marginal.to_vec();
757
758    // (#461) fixed-ridge absorber: identity on the influence columns only.
759    // Full rank (nullspace 0); its log λ is pinned out of REML by a degenerate
760    // ρ box.
761    if p1 > 0 {
762        penalties.push(
763            PenaltyMatrix::Blockwise {
764                local: Array2::<f64>::eye(p1),
765                col_range: p_m..total_dim,
766                total_dim,
767            }
768            .with_fixed_log_lambda(influence_ridge_log_lambda),
769        );
770        nullspace_dims.push(0);
771        log_lambdas.push(influence_ridge_log_lambda);
772    }
773
774    Ok((penalties, nullspace_dims, Array1::from_vec(log_lambdas)))
775}
776
777/// Widen an optional β warm-start hint to the influence-widened marginal
778/// dimension, zero-filling the absorber coefficients `γ` (#461).
779pub(crate) fn widen_marginal_beta_hint(
780    beta_hint: Option<Array1<f64>>,
781    p_marginal_widened: usize,
782) -> Option<Array1<f64>> {
783    beta_hint.map(|hint| {
784        if hint.len() == p_marginal_widened {
785            hint
786        } else {
787            let mut widened = Array1::<f64>::zeros(p_marginal_widened);
788            let copy = hint.len().min(p_marginal_widened);
789            widened
790                .slice_mut(s![..copy])
791                .assign(&hint.slice(s![..copy]));
792            widened
793        }
794    })
795}
796
797/// Sup-norm of the fitted marginal linear predictor `η = X·β` restricted to a
798/// subset of the marginal design's columns. The mask selects which columns of
799/// `design.design` (length `design.design.ncols()`) contribute; coefficients
800/// beyond `ncols` (the fixed-ridge influence absorber) never enter the marginal
801/// predictor and are excluded by construction. Returns `0.0` for an empty
802/// design. This is the decisive separation quantity: the probit Fisher weight
803/// collapses with `|η|`, not with `‖β‖` (an ill-conditioned non-orthonormal
804/// Duchon/thin-plate basis carries large cancelling coefficients on a smooth,
805/// bounded surface).
806fn marginal_fitted_eta_sup_norm(design: &TermCollectionDesign, masked_beta: &Array1<f64>) -> f64 {
807    let x = &design.design;
808    let n = x.nrows();
809    if n == 0 || x.ncols() == 0 {
810        return 0.0;
811    }
812    let mut sup = 0.0_f64;
813    for row in 0..n {
814        let eta = x.dot_row_view(row, masked_beta.view());
815        if eta.is_finite() {
816            sup = sup.max(eta.abs());
817        }
818    }
819    sup
820}
821
822/// Build a copy of the marginal block β truncated to `design.design.ncols()`
823/// (drops any fixed-ridge absorber tail) so it can drive `X·β`.
824fn marginal_design_beta(
825    design: &TermCollectionDesign,
826    block_beta: ArrayView1<'_, f64>,
827) -> Array1<f64> {
828    let ncols = design.design.ncols();
829    let mut masked = Array1::<f64>::zeros(ncols);
830    let copy = ncols.min(block_beta.len());
831    masked
832        .slice_mut(s![..copy])
833        .assign(&block_beta.slice(s![..copy]));
834    masked
835}
836
837/// Zero every entry of `beta` outside the parametric (penalty-nullspace)
838/// marginal columns — the intercept and the single-penalty linear terms. These
839/// are the directions an unpenalized fit can genuinely separate along (no
840/// smoothness penalty bounds them), so their fitted contribution is tested on
841/// the same η scale.
842fn mask_parametric_columns(
843    design: &TermCollectionDesign,
844    spec: &TermCollectionSpec,
845    full: &Array1<f64>,
846) -> Array1<f64> {
847    let ncols = design.design.ncols();
848    let mut masked = Array1::<f64>::zeros(ncols);
849    if design.intercept_range.len() == 1 {
850        let idx = design.intercept_range.start;
851        if idx < ncols {
852            masked[idx] = full[idx];
853        }
854    }
855    for (linear, (_, range)) in spec.linear_terms.iter().zip(design.linear_ranges.iter()) {
856        if linear.double_penalty {
857            continue;
858        }
859        for col in range.clone() {
860            if col < ncols {
861                masked[col] = full[col];
862            }
863        }
864    }
865    masked
866}
867
868/// Decide whether the converged marginal fit has genuinely separated, using the
869/// FITTED predictor sup-norm `|η|∞` (not raw `|β|∞`). Two arms share the
870/// numerical-degeneracy threshold [`BMS_PROBIT_SEPARATION_ETA_INF`]:
871///   - parametric arm: the penalty-nullspace columns' fitted contribution
872///     (an unpenalized direction can run to infinity);
873///   - full arm: the whole marginal surface's fitted predictor.
874/// The raw `|β|∞` (and its term label) is reported only as diagnostic context;
875/// it never gates the abort. When `|η|∞` is below threshold the converged
876/// penalized fit is numerically trustworthy and this returns `None` — no error,
877/// even if individual coefficients are large.
878pub(crate) fn bernoulli_marginal_slope_runaway_error_from_beta(
879    block_beta: ArrayView1<'_, f64>,
880    design: &TermCollectionDesign,
881    spec: &TermCollectionSpec,
882    inner_converged: bool,
883    eval_label: &str,
884) -> Option<String> {
885    let full_beta = marginal_design_beta(design, block_beta);
886    let parametric_beta = mask_parametric_columns(design, spec, &full_beta);
887
888    let eta_parametric = marginal_fitted_eta_sup_norm(design, &parametric_beta);
889    let eta_full = marginal_fitted_eta_sup_norm(design, &full_beta);
890
891    let (eta_inf, explanation) = if eta_parametric >= BMS_PROBIT_SEPARATION_ETA_INF {
892        (
893            eta_parametric,
894            "an unpenalized parametric marginal direction has no stable finite probit optimum and its fitted predictor has run to the probit underflow scale",
895        )
896    } else if eta_full >= BMS_PROBIT_SEPARATION_ETA_INF {
897        (
898            eta_full,
899            "a marginal direction is trading off against the logslope surface; this is the under-constrained marginal/logslope coupling that appears when the score is correlated with the shared surface covariates",
900        )
901    } else {
902        // |η|∞ is bounded: even if raw coefficients are large (ill-conditioned
903        // non-orthonormal basis with cancellation), the converged penalized
904        // probit fit is numerically trustworthy. Do NOT abort.
905        return None;
906    };
907
908    let inner_status = if inner_converged {
909        "the inner solve reached a KKT certificate at this separation-scale predictor"
910    } else {
911        "the inner solve failed while already carrying a separation-scale predictor"
912    };
913    // Raw |β|∞ context (decisive quantity is |η|∞ above).
914    let beta_abs = full_beta
915        .iter()
916        .copied()
917        .filter(|v| v.is_finite())
918        .fold(0.0_f64, |acc, v| acc.max(v.abs()));
919
920    Some(format!(
921        "bernoulli marginal-slope probit marginal/logslope runaway detected in block \
922         'marginal_surface' during {eval_label}: the fitted marginal predictor has \
923         |η|∞={eta_inf:.3e} (numerical-degeneracy threshold \
924         {BMS_PROBIT_SEPARATION_ETA_INF:.1}; raw |β|∞={beta_abs:.3e} is reported for \
925         context only and does not gate this diagnostic). The joint design is \
926         identifiable; {explanation}. {inner_status}. The robust Jeffreys curvature \
927         path is already installed for this fit, so this diagnostic means the current \
928         coupled surface still drives the linear predictor to the probit underflow \
929         scale rather than a request for an external bias-reduction prior. Reduce or \
930         reparameterize the coupled marginal/logslope surface, or use a \
931         lower-dimensional logslope interaction. This is not a \
932         Matérn/Duchon polynomial-nullspace or cross-block gauge-priority \
933         failure."
934    ))
935}
936
937pub(crate) fn bernoulli_marginal_slope_runaway_error(
938    warm_start: &CustomFamilyWarmStart,
939    design: &TermCollectionDesign,
940    spec: &TermCollectionSpec,
941    inner_converged: bool,
942    eval_label: &str,
943) -> Option<String> {
944    let block_beta = warm_start.block_beta_view(0)?;
945    bernoulli_marginal_slope_runaway_error_from_beta(
946        block_beta,
947        design,
948        spec,
949        inner_converged,
950        eval_label,
951    )
952}
953
954#[cfg(test)]
955mod runaway_tests {
956    use super::*;
957    use gam_linalg::faer_ndarray::{FaerArrayView, factorize_symmetricwith_fallback, fast_xt_diag_y};
958    use gam_terms::smooth::{LinearCoefficientGeometry, LinearTermSpec};
959
960    // The marginal↔logslope overlap penalty is no longer installed as a pinned
961    // ridge (subsumed by the now-unconditional exact logslope orthogonalisation in
962    // `build_reduced_logslope_reparam`). The geometry helper is retained here under
963    // the test module because the basis-independence/weight-orthogonality unit tests
964    // below exercise it directly as the canonical overlap-direction reference.
965    pub(crate) fn marginal_logslope_overlap_penalty(
966        marginal_design: &DesignMatrix,
967        logslope_design: &DesignMatrix,
968        z: &Array1<f64>,
969        row_metric: &Array1<f64>,
970        marginal_offset: &Array1<f64>,
971        logslope_offset: &Array1<f64>,
972        marginal_baseline: f64,
973        logslope_baseline: f64,
974        probit_scale: f64,
975    ) -> Result<Option<Array2<f64>>, String> {
976        let marginal =
977            marginal_design.try_to_dense_arc("marginal_logslope_overlap_penalty::marginal")?;
978        let logslope =
979            logslope_design.try_to_dense_arc("marginal_logslope_overlap_penalty::logslope")?;
980        let n = marginal.nrows();
981        if logslope.nrows() != n
982            || z.len() != n
983            || row_metric.len() != n
984            || marginal_offset.len() != n
985            || logslope_offset.len() != n
986        {
987            return Err(format!(
988                "marginal/logslope overlap penalty row mismatch: marginal={}, logslope={}, z={}, row_metric={}, marginal_offset={}, logslope_offset={}",
989                marginal.nrows(),
990                logslope.nrows(),
991                z.len(),
992                row_metric.len(),
993                marginal_offset.len(),
994                logslope_offset.len(),
995            ));
996        }
997        let p_m = marginal.ncols();
998        let p_g = logslope.ncols();
999        if p_m == 0 || p_g == 0 {
1000            return Ok(None);
1001        }
1002        if !marginal_baseline.is_finite()
1003            || !logslope_baseline.is_finite()
1004            || !probit_scale.is_finite()
1005            || probit_scale <= 0.0
1006            || z.iter().any(|v| !v.is_finite())
1007            || row_metric.iter().any(|v| !v.is_finite() || *v < 0.0)
1008            || marginal_offset.iter().any(|v| !v.is_finite())
1009            || logslope_offset.iter().any(|v| !v.is_finite())
1010        {
1011            return Err(
1012                "marginal/logslope overlap penalty requires finite pilot geometry and finite non-negative row metric"
1013                    .to_string(),
1014            );
1015        }
1016
1017        let mut marginal_effective = Array2::<f64>::zeros((n, p_m));
1018        let mut effective_logslope = Array2::<f64>::zeros((n, p_g));
1019        for i in 0..n {
1020            let q_i = marginal_offset[i] + marginal_baseline;
1021            let g_i = logslope_offset[i] + logslope_baseline;
1022            let sg = probit_scale * g_i;
1023            let c_i = (1.0 + sg * sg).sqrt();
1024            let logslope_factor =
1025                q_i * probit_scale * probit_scale * g_i / c_i + probit_scale * z[i];
1026            for j in 0..p_m {
1027                marginal_effective[[i, j]] = c_i * marginal[[i, j]];
1028            }
1029            for j in 0..p_g {
1030                effective_logslope[[i, j]] = logslope_factor * logslope[[i, j]];
1031            }
1032        }
1033        if effective_logslope.iter().all(|v| v.abs() <= f64::EPSILON) {
1034            return Ok(None);
1035        }
1036
1037        let mut gram = fast_xt_diag_x(&effective_logslope, row_metric);
1038        let gram_scale = gram.diag().iter().copied().fold(0.0_f64, f64::max);
1039        if !gram_scale.is_finite() || gram_scale <= 0.0 {
1040            return Ok(None);
1041        }
1042        let projection_ridge = (gram_scale * 1.0e-10).max(f64::EPSILON);
1043        for i in 0..p_g {
1044            gram[[i, i]] += projection_ridge;
1045        }
1046        let cross = fast_xt_diag_y(&effective_logslope, row_metric, &marginal_effective);
1047        let gram_view = FaerArrayView::new(&gram);
1048        let factor = factorize_symmetricwith_fallback(gram_view.as_ref(), Side::Lower)
1049            .map_err(|e| format!("marginal/logslope overlap Gram factorization failed: {e}"))?;
1050        let rhsview = FaerArrayView::new(&cross);
1051        let coeffs_mat = factor.solve(rhsview.as_ref());
1052        let coeffs = Array2::from_shape_fn((p_g, p_m), |(i, j)| coeffs_mat[(i, j)]);
1053        let projected_marginal = fast_ab(&effective_logslope, &coeffs);
1054        let mut penalty = fast_xt_diag_y(&marginal_effective, row_metric, &projected_marginal);
1055        penalty = (&penalty + &penalty.t()) * 0.5;
1056        let max_abs = penalty.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
1057        if !max_abs.is_finite() || max_abs <= 1.0e-12 {
1058            return Ok(None);
1059        }
1060        Ok(Some(penalty))
1061    }
1062
1063    // The raw-vs-effective counterexample. With q=g=0, s=1: c_i=1 (so M_eff=M)
1064    // and f_i=z_i (so G_eff=diag(z)·G). Pick M=[1,1,1]ᵀ and a two-column logslope
1065    // G whose RAW columns are both linearly independent of M (raw orthogonalising
1066    // [M|G] would keep BOTH — the old code returned None, no reduction), but whose
1067    // first EFFECTIVE column diag(z)·G[:,0] equals M_eff exactly. The effective
1068    // audit must therefore drop exactly one direction (r=1), proving it removes
1069    // the joint-Hessian rank-soft direction the raw audit could not see.
1070    #[test]
1071    pub(crate) fn effective_reduction_drops_score_weighted_confound_raw_audit_misses() {
1072        // G col0 = [1,2,3], col1 = [1,2,9]  (row-major rows: [1,1],[2,2],[3,9]).
1073        let m = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1074        let g = Array2::<f64>::from_shape_vec((3, 2), vec![1.0, 1.0, 2.0, 2.0, 3.0, 9.0]).unwrap();
1075        let z = Array1::from_vec(vec![1.0, 0.5, 1.0 / 3.0]);
1076        let w = Array1::<f64>::ones(3);
1077        let zero = Array1::<f64>::zeros(3);
1078
1079        // diag(z)·G[:,0] = [1·1, 0.5·2, (1/3)·3] = [1,1,1] = M_eff (fully aliased);
1080        // diag(z)·G[:,1] = [1·1, 0.5·2, (1/3)·9] = [1,1,3] (NOT in span([1,1,1])).
1081        let reparam = reduced_logslope_transform_effective(
1082            m.view(),
1083            g.view(),
1084            &z,
1085            &w,
1086            &zero,
1087            &zero,
1088            0.0,
1089            0.0,
1090            1.0,
1091        )
1092        .expect("effective reduction must succeed")
1093        .expect("effective audit must reduce the score-weighted confound (raw audit would not)");
1094        assert_eq!(
1095            reparam.ncols(),
1096            1,
1097            "exactly one effective-identifiable logslope direction should survive"
1098        );
1099
1100        // The surviving raw direction's EFFECTIVE image diag(z)·G·t must carry the
1101        // non-constant ([1,1,3]) content — i.e. it is the identifiable direction,
1102        // not the [1,1,1] confound. Its row variance must be clearly positive.
1103        let g_eff = {
1104            let mut e = Array2::<f64>::zeros((3, 2));
1105            for i in 0..3 {
1106                for j in 0..2 {
1107                    e[[i, j]] = z[i] * g[[i, j]];
1108                }
1109            }
1110            e
1111        };
1112        let img = g_eff.dot(&reparam.column(0));
1113        let mean = img.iter().sum::<f64>() / 3.0;
1114        let var = img.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / 3.0;
1115        assert!(
1116            var > 1.0e-6,
1117            "kept direction must be the identifiable (non-constant) effective column, var={var}"
1118        );
1119    }
1120
1121    // The single-column fully-confounded case: G=[1,2,3]ᵀ, z=[1,1/2,1/3] gives
1122    // G_eff=[1,1,1]=M_eff, so the entire effective logslope image is in the
1123    // effective marginal span (r==0). The helper returns None (keep the raw
1124    // design) rather than emitting a zero-width transform that would delete the
1125    // score-effect surface.
1126    #[test]
1127    pub(crate) fn effective_reduction_fully_confounded_single_column_returns_none() {
1128        let m = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1129        let g = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
1130        let z = Array1::from_vec(vec![1.0, 0.5, 1.0 / 3.0]);
1131        let w = Array1::<f64>::ones(3);
1132        let zero = Array1::<f64>::zeros(3);
1133        let reparam = reduced_logslope_transform_effective(
1134            m.view(),
1135            g.view(),
1136            &z,
1137            &w,
1138            &zero,
1139            &zero,
1140            0.0,
1141            0.0,
1142            1.0,
1143        )
1144        .expect("effective reduction must succeed");
1145        assert!(
1146            reparam.is_none(),
1147            "fully effective-confounded logslope must keep raw design (None), not a 0-width block"
1148        );
1149    }
1150
1151    // No effective confound: both effective logslope columns stay independent of
1152    // M_eff, so nothing is reduced (r==p_g ⇒ None) and healthy fits are untouched.
1153    #[test]
1154    pub(crate) fn effective_reduction_no_confound_returns_none() {
1155        let m = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1156        // diag(z)·col gives non-constant images for both columns under z below.
1157        let g = Array2::<f64>::from_shape_vec((3, 2), vec![1.0, 0.0, 0.0, 1.0, 0.0, 0.0]).unwrap();
1158        let z = Array1::from_vec(vec![1.0, 2.0, 3.0]);
1159        let w = Array1::<f64>::ones(3);
1160        let zero = Array1::<f64>::zeros(3);
1161        let reparam = reduced_logslope_transform_effective(
1162            m.view(),
1163            g.view(),
1164            &z,
1165            &w,
1166            &zero,
1167            &zero,
1168            0.0,
1169            0.0,
1170            1.0,
1171        )
1172        .expect("effective reduction must succeed");
1173        assert!(
1174            reparam.is_none(),
1175            "no effective confound ⇒ no reduction (raw design kept unchanged)"
1176        );
1177    }
1178
1179    #[test]
1180    pub(crate) fn spatial_joint_setup_counts_only_learned_penalties_in_rho() {
1181        let data = Array2::<f64>::zeros((3, 1));
1182        let empty_terms = TermCollectionSpec {
1183            linear_terms: Vec::new(),
1184            random_effect_terms: Vec::new(),
1185            smooth_terms: Vec::new(),
1186        };
1187        let setup = joint_setup(
1188            data.view(),
1189            &empty_terms,
1190            &empty_terms,
1191            2,
1192            3,
1193            &[0.4],
1194            &SpatialLengthScaleOptimizationOptions::default(),
1195        );
1196
1197        assert_eq!(
1198            setup.rho_dim(),
1199            6,
1200            "BMS spatial setup rho must contain only learned marginal/logslope/auxiliary penalties; fixed physical ridges are carried by PenaltyMatrix::Fixed"
1201        );
1202    }
1203
1204    #[test]
1205    pub(crate) fn overlap_penalty_targets_score_weighted_logslope_span() {
1206        let marginal = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1207            Array2::from_shape_vec((4, 1), vec![0.0, 1.0, 2.0, 3.0]).unwrap(),
1208        ));
1209        let logslope = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1210            Array2::from_shape_vec((4, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap(),
1211        ));
1212        let z = Array1::from_vec(vec![0.0, 1.0, 2.0, 3.0]);
1213        let row_metric = Array1::ones(4);
1214        let offsets = Array1::zeros(4);
1215
1216        let penalty = marginal_logslope_overlap_penalty(
1217            &marginal,
1218            &logslope,
1219            &z,
1220            &row_metric,
1221            &offsets,
1222            &offsets,
1223            0.0,
1224            0.0,
1225            1.0,
1226        )
1227        .expect("overlap penalty should build")
1228        .expect("marginal signal lies in the pilot logslope Jacobian span");
1229
1230        assert_eq!(penalty.dim(), (1, 1));
1231        assert!((penalty[[0, 0]] - 14.0).abs() < 1.0e-6);
1232    }
1233
1234    #[test]
1235    pub(crate) fn overlap_penalty_skips_weight_orthogonal_channels() {
1236        let marginal = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1237            Array2::from_shape_vec((4, 1), vec![-1.0, 1.0, -1.0, 1.0]).unwrap(),
1238        ));
1239        let logslope = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1240            Array2::from_shape_vec((4, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap(),
1241        ));
1242        let z = Array1::ones(4);
1243        let row_metric = Array1::ones(4);
1244        let offsets = Array1::zeros(4);
1245
1246        let penalty = marginal_logslope_overlap_penalty(
1247            &marginal,
1248            &logslope,
1249            &z,
1250            &row_metric,
1251            &offsets,
1252            &offsets,
1253            0.0,
1254            0.0,
1255            1.0,
1256        )
1257        .expect("overlap penalty should build");
1258
1259        assert!(penalty.is_none());
1260    }
1261
1262    // ── Fitted-η separation guard fixtures ───────────────────────────────
1263    //
1264    // The runaway guard tests the FITTED marginal predictor sup-norm
1265    // `|η|∞ = max_i |X[i,:]·β|`, not raw `|β|∞`. These helpers build minimal
1266    // `TermCollectionDesign` / `TermCollectionSpec` pairs from a dense design so
1267    // the criterion is exercised deterministically with no data files.
1268
1269    fn dense_marginal_design(
1270        x: Array2<f64>,
1271        intercept_range: std::ops::Range<usize>,
1272        linear_ranges: Vec<(String, std::ops::Range<usize>)>,
1273    ) -> TermCollectionDesign {
1274        TermCollectionDesign {
1275            design: DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(x)),
1276            penalties: Vec::new(),
1277            nullspace_dims: Vec::new(),
1278            penaltyinfo: Vec::new(),
1279            dropped_penaltyinfo: Vec::new(),
1280            coefficient_lower_bounds: None,
1281            linear_constraints: None,
1282            intercept_range,
1283            linear_ranges,
1284            random_effect_ranges: Vec::new(),
1285            random_effect_levels: Vec::new(),
1286            smooth: gam_terms::smooth::SmoothDesign {
1287                term_designs: Vec::new(),
1288                penalties: Vec::new(),
1289                nullspace_dims: Vec::new(),
1290                penaltyinfo: Vec::new(),
1291                dropped_penaltyinfo: Vec::new(),
1292                terms: Vec::new(),
1293                coefficient_lower_bounds: None,
1294                linear_constraints: None,
1295            },
1296        }
1297    }
1298
1299    fn linear_term(name: &str, feature_col: usize) -> LinearTermSpec {
1300        LinearTermSpec {
1301            name: name.to_string(),
1302            feature_col,
1303            feature_cols: vec![feature_col],
1304            categorical_levels: vec![],
1305            double_penalty: false,
1306            coefficient_geometry: LinearCoefficientGeometry::default(),
1307            coefficient_min: None,
1308            coefficient_max: None,
1309        }
1310    }
1311
1312    fn empty_spec() -> TermCollectionSpec {
1313        TermCollectionSpec {
1314            linear_terms: Vec::new(),
1315            random_effect_terms: Vec::new(),
1316            smooth_terms: Vec::new(),
1317        }
1318    }
1319
1320    /// Regression lock for the false-positive fix: an ill-conditioned
1321    /// non-orthonormal basis (two identical/collinear columns) with a large
1322    /// cancelling coefficient `β=[60,-60]` yields `Xβ ≡ 0` — a perfectly
1323    /// bounded fitted predictor. The guard MUST NOT fire even though raw
1324    /// `|β|∞=60` is far above the old `40.0` coefficient threshold. This is the
1325    /// exact pathology that aborted valid biobank fits.
1326    #[test]
1327    pub(crate) fn runaway_guard_silent_when_huge_beta_cancels_to_bounded_eta() {
1328        // Two identical columns ⇒ Xβ = (β0+β1)·col; β=[60,-60] ⇒ Xβ ≡ 0.
1329        let x = Array2::<f64>::from_shape_vec((4, 2), vec![1.0; 8]).unwrap();
1330        let design = dense_marginal_design(x, 0..0, Vec::new());
1331        let beta = Array1::from_vec(vec![60.0, -60.0]);
1332
1333        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1334            beta.view(),
1335            &design,
1336            &empty_spec(),
1337            true,
1338            "regression-fixture",
1339        );
1340        assert!(
1341            msg.is_none(),
1342            "huge cancelling β with bounded fitted η must NOT trip the runaway guard; got {msg:?}"
1343        );
1344    }
1345
1346    /// A genuinely separating full marginal surface: a single column with a
1347    /// large coefficient drives `|η|∞ = 40 ≥ 35`, so the guard fires and names
1348    /// the marginal/logslope coupling explanation.
1349    #[test]
1350    pub(crate) fn runaway_guard_fires_when_fitted_eta_exceeds_threshold() {
1351        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1352        let design = dense_marginal_design(x, 0..0, Vec::new());
1353        let beta = Array1::from_vec(vec![40.0]);
1354
1355        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1356            beta.view(),
1357            &design,
1358            &empty_spec(),
1359            true,
1360            "separation-fixture",
1361        )
1362        .expect("fitted |η|∞=40 ≥ 35 must trip the runaway guard");
1363
1364        assert!(msg.contains("marginal/logslope runaway"));
1365        assert!(msg.contains("|η|∞"));
1366        assert!(msg.contains("4.000e1"));
1367        assert!(msg.contains("score is correlated with the shared surface covariates"));
1368        assert!(msg.contains("not a Matérn/Duchon polynomial-nullspace"));
1369        assert!(msg.contains("KKT certificate"));
1370    }
1371
1372    /// An unpenalized parametric direction can genuinely separate (no smoothness
1373    /// penalty bounding it). When its fitted contribution reaches the η scale
1374    /// the parametric arm fires first and names the parametric explanation.
1375    #[test]
1376    pub(crate) fn runaway_guard_names_unpenalized_parametric_direction_via_fitted_eta() {
1377        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1378        let design = dense_marginal_design(x, 0..0, vec![("sex".to_string(), 0..1)]);
1379        let mut spec = empty_spec();
1380        spec.linear_terms.push(linear_term("sex", 0));
1381        let beta = Array1::from_vec(vec![41.0]);
1382
1383        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1384            beta.view(),
1385            &design,
1386            &spec,
1387            true,
1388            "parametric-fixture",
1389        )
1390        .expect("parametric fitted |η|∞=41 ≥ 35 must trip the runaway guard");
1391
1392        assert!(msg.contains("unpenalized parametric marginal direction"));
1393        assert!(msg.contains("|η|∞"));
1394        assert!(msg.contains("robust Jeffreys curvature path is already installed"));
1395        assert!(msg.contains("not a Matérn/Duchon polynomial-nullspace"));
1396    }
1397
1398    /// A non-converged inner solve with a BOUNDED fitted predictor must NOT
1399    /// surface the separation error — the non-convergence is reported through
1400    /// the existing downstream path, not as a runaway.
1401    #[test]
1402    pub(crate) fn runaway_guard_silent_for_nonconverged_but_bounded_eta() {
1403        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1404        let design = dense_marginal_design(x, 0..0, Vec::new());
1405        let beta = Array1::from_vec(vec![5.0]);
1406
1407        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1408            beta.view(),
1409            &design,
1410            &empty_spec(),
1411            false,
1412            "nonconverged-fixture",
1413        );
1414        assert!(
1415            msg.is_none(),
1416            "bounded fitted η must not raise the separation error even when the inner solve did not converge; got {msg:?}"
1417        );
1418    }
1419
1420    /// A genuinely separating fit that ALSO failed to converge still surfaces
1421    /// the runaway error, and reports the non-converged inner status.
1422    #[test]
1423    pub(crate) fn runaway_guard_fires_for_nonconverged_separating_eta() {
1424        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1425        let design = dense_marginal_design(x, 0..0, Vec::new());
1426        let beta = Array1::from_vec(vec![50.0]);
1427
1428        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1429            beta.view(),
1430            &design,
1431            &empty_spec(),
1432            false,
1433            "nonconverged-separating-fixture",
1434        )
1435        .expect("separating |η|∞ at non-convergence must still trip the guard");
1436
1437        assert!(msg.contains(
1438            "the inner solve failed while already carrying a separation-scale predictor"
1439        ));
1440    }
1441
1442    /// Regression lock for gam#370: the pre-fit identifiability audit evaluates
1443    /// every block's effective Jacobian at the empty/zero β with
1444    /// `family_scalars: None`. The BMS marginal and logslope blocks own the
1445    /// logslope design + offset, so `g_i = offset_s[i]` (the fitted logslope
1446    /// baseline, generically nonzero) at β = 0 — and the old code hard-errored
1447    /// ("requires BmsFamilyScalars when beta != 0 … got family_scalars: None")
1448    /// because it read `any_nonzero_g` and demanded a caller-supplied scalar it
1449    /// could in fact reconstruct itself. Both blocks must now self-compute the
1450    /// closed-form `c_i = sqrt(1 + (s·g_i)²)` (and the logslope factor) from
1451    /// owned data and return a finite Jacobian, NOT an error. This makes every
1452    /// `logslope_formula` / `linkwiggle(...)` BMS fit reachable through the
1453    /// Python `gamfit.fit` API (the audit no longer aborts before the fit).
1454    #[test]
1455    pub(crate) fn bms_block_jacobians_self_compute_at_audit_empty_beta_nonzero_logslope_baseline() {
1456        use std::sync::Arc;
1457        let n = 4usize;
1458        let marginal =
1459            Arc::new(Array2::<f64>::from_shape_vec((n, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap());
1460        let logslope =
1461            Arc::new(Array2::<f64>::from_shape_vec((n, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap());
1462        let offset_m = Array1::<f64>::zeros(n);
1463        // Nonzero logslope baseline absorbed into offset_s — the exact pooled
1464        // probit pilot value that is "essentially never exactly 0".
1465        let g_baseline = 0.3_f64;
1466        let offset_s = Array1::<f64>::from_elem(n, g_baseline);
1467        let z = Arc::new(Array1::from_vec(vec![-0.7, 0.2, 0.9, 1.4]));
1468        let s = 1.0_f64;
1469
1470        // The pre-fit audit linearization state: EMPTY β, no family scalars.
1471        let beta: Vec<f64> = Vec::new();
1472        let state = FamilyLinearizationState {
1473            beta: &beta,
1474            family_scalars: None,
1475            channel_hessian: None,
1476            probit_frailty_scale: s,
1477        };
1478
1479        let marginal_jac = BmsMarginalJacobian::new(
1480            Arc::clone(&marginal),
1481            Arc::clone(&logslope),
1482            offset_m.clone(),
1483            offset_s.clone(),
1484            1,
1485        );
1486        let j_m = marginal_jac
1487            .effective_jacobian_rows(&state, 0..n)
1488            .expect("BMS marginal Jacobian must self-compute at audit empty β (gam#370)");
1489        // ∂η_i/∂β_m = c_i · M[i,:], c_i = sqrt(1 + (s·g_baseline)²), M[i,0]=1.
1490        let c_expected = (1.0 + (s * g_baseline).powi(2)).sqrt();
1491        assert_eq!(j_m.dim(), (n, 1));
1492        for i in 0..n {
1493            assert!(
1494                (j_m[[i, 0]] - c_expected).abs() < 1e-12,
1495                "marginal J[{i}] = {} != closed-form c_i = {c_expected}",
1496                j_m[[i, 0]]
1497            );
1498        }
1499
1500        let logslope_jac = BmsLogslopeJacobian::new(
1501            Arc::clone(&marginal),
1502            Arc::clone(&logslope),
1503            offset_m,
1504            offset_s,
1505            Arc::clone(&z),
1506            1,
1507        );
1508        let j_s = logslope_jac
1509            .effective_jacobian_rows(&state, 0..n)
1510            .expect("BMS logslope Jacobian must self-compute at audit empty β (gam#370)");
1511        // ∂η_i/∂β_s = (q_i·s²·g_i/c_i + s·z_i)·G[i,:]; at empty β, q_i = 0 and
1512        // g_i = g_baseline, so the factor is s²·0·… + s·z_i = s·z_i (G[i,0]=1).
1513        assert_eq!(j_s.dim(), (n, 1));
1514        for i in 0..n {
1515            let expected = s * z[i];
1516            assert!(
1517                (j_s[[i, 0]] - expected).abs() < 1e-12,
1518                "logslope J[{i}] = {} != closed-form factor {expected}",
1519                j_s[[i, 0]]
1520            );
1521            assert!(j_s[[i, 0]].is_finite());
1522        }
1523    }
1524}
1525
1526pub(crate) fn build_marginal_blockspec_bms(
1527    design: &TermCollectionDesign,
1528    baseline: f64,
1529    offset: &Array1<f64>,
1530    rho: Array1<f64>,
1531    beta_hint: Option<Array1<f64>>,
1532    logslope_design: &TermCollectionDesign,
1533    logslope_offset: &Array1<f64>,
1534    logslope_baseline: f64,
1535    p_marginal: usize,
1536    influence_columns: Option<&Array2<f64>>,
1537    influence_ridge_log_lambda: f64,
1538) -> Result<ParameterBlockSpec, String> {
1539    let offset_m = offset + baseline;
1540    let offset_s = logslope_offset + logslope_baseline;
1541    let raw_marginal_dense = design
1542        .design
1543        .try_to_dense_arc("build_marginal_blockspec_bms::marginal")?;
1544    let marginal_dense =
1545        widen_marginal_dense_with_influence(&raw_marginal_dense, influence_columns)?;
1546    let logslope_dense = logslope_design
1547        .design
1548        .try_to_dense_arc("build_marginal_blockspec_bms::logslope")?;
1549    let callback: Arc<dyn BlockEffectiveJacobian> = Arc::new(BmsMarginalJacobian {
1550        marginal_dense: Arc::clone(&marginal_dense),
1551        logslope_dense,
1552        offset_m: offset_m.clone(),
1553        offset_s,
1554        p_marginal,
1555    });
1556    let (penalties, nullspace_dims, initial_log_lambdas) = marginal_penalties_with_influence_ridge(
1557        design,
1558        &rho,
1559        influence_columns,
1560        influence_ridge_log_lambda,
1561    )?;
1562    Ok(ParameterBlockSpec {
1563        name: "marginal_surface".to_string(),
1564        design: DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1565            (*marginal_dense).clone(),
1566        )),
1567        offset: offset_m,
1568        penalties,
1569        nullspace_dims,
1570        initial_log_lambdas,
1571        initial_beta: widen_marginal_beta_hint(beta_hint, p_marginal),
1572        // Canonical-gauge architecture (issue #322): give marginal_surface
1573        // strictly higher priority than logslope_surface so the priority-
1574        // ordered RRQR in `canonicalize_for_identifiability` presents
1575        // marginal columns first and routes any cross-block alias drop into
1576        // logslope.  Equal priorities (the previous default of 100/100)
1577        // produced a same-priority `hard_alias_pair` whenever a
1578        // high-dimensional smooth — e.g. `s(x, type=duchon, centers>=6)`
1579        // in the location block — accidentally spanned the logslope basis
1580        // direction, leaving the joint Hessian with a structural null and
1581        // the spectral Newton solve refusing to step.  The values mirror
1582        // the canonical-gauge entry for survival marginal-slope
1583        // (marginal=150, logslope=120).
1584        gauge_priority: GAUGE_PRIORITY_MARGINAL,
1585        jacobian_callback: Some(callback),
1586        stacked_design: None,
1587        stacked_offset: None,
1588    })
1589}
1590
1591pub(crate) fn build_logslope_blockspec_bms(
1592    design: &TermCollectionDesign,
1593    baseline: f64,
1594    offset: &Array1<f64>,
1595    rho: Array1<f64>,
1596    beta_hint: Option<Array1<f64>>,
1597    marginal_design: &TermCollectionDesign,
1598    marginal_offset: &Array1<f64>,
1599    marginal_baseline: f64,
1600    z: Arc<Array1<f64>>,
1601    p_marginal: usize,
1602    influence_columns: Option<&Array2<f64>>,
1603) -> Result<ParameterBlockSpec, String> {
1604    let offset_s = offset + baseline;
1605    let offset_m = marginal_offset + marginal_baseline;
1606    let raw_marginal_dense = marginal_design
1607        .design
1608        .try_to_dense_arc("build_logslope_blockspec_bms::marginal")?;
1609    // The logslope Jacobian reconstructs q_i = M·β_m + offset_m; with the
1610    // absorbed influence columns folded into the marginal index, the marginal
1611    // reference design and p_marginal MUST be the widened [M | Z̃] / (p_m+p₁)
1612    // so β_m slices to the absorber and q_i carries the Z̃·γ shift (#461).
1613    let marginal_dense =
1614        widen_marginal_dense_with_influence(&raw_marginal_dense, influence_columns)?;
1615    let logslope_dense = design
1616        .design
1617        .try_to_dense_arc("build_logslope_blockspec_bms::logslope")?;
1618    let callback: Arc<dyn BlockEffectiveJacobian> = Arc::new(BmsLogslopeJacobian {
1619        marginal_dense,
1620        logslope_dense: Arc::clone(&logslope_dense),
1621        offset_m,
1622        offset_s: offset_s.clone(),
1623        z,
1624        p_marginal,
1625    });
1626    Ok(ParameterBlockSpec {
1627        name: "logslope_surface".to_string(),
1628        design: DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1629            (*logslope_dense).clone(),
1630        )),
1631        offset: offset_s,
1632        penalties: design.penalties_as_penalty_matrix(),
1633        nullspace_dims: design.nullspace_dims.clone(),
1634        initial_log_lambdas: rho,
1635        initial_beta: beta_hint,
1636        // Canonical-gauge architecture (issue #322): logslope is strictly
1637        // lower priority than marginal so the priority-ordered RRQR in
1638        // `canonicalize_for_identifiability` demotes a shared cross-block
1639        // direction here, not in marginal.  Mirrors the survival-mgs
1640        // value (marginal=150, logslope=120).  See the matching comment
1641        // on `build_marginal_blockspec_bms` for the failure mode this
1642        // resolves.
1643        gauge_priority: GAUGE_PRIORITY_LOGSLOPE,
1644        jacobian_callback: Some(callback),
1645        stacked_design: None,
1646        stacked_offset: None,
1647    })
1648}
1649
1650pub(crate) fn build_deviation_aux_blockspec(
1651    name: &str,
1652    prepared: &DeviationPrepared,
1653    rho: Array1<f64>,
1654    beta_hint: Option<Array1<f64>>,
1655) -> Result<ParameterBlockSpec, String> {
1656    let mut block = prepared.block.clone();
1657    block.initial_log_lambdas = Some(rho);
1658    let candidate_beta = beta_hint.or_else(|| Some(Array1::<f64>::zeros(block.design.ncols())));
1659    block.initial_beta = candidate_beta
1660        .map(|beta| {
1661            let zero = Array1::<f64>::zeros(beta.len());
1662            project_monotone_feasible_beta(&prepared.runtime, &zero, &beta, name)
1663        })
1664        .transpose()?;
1665    let mut spec = block.intospec(name)?;
1666    // Deviation auxiliary blocks (score_warp_dev, link_dev, and any
1667    // future flex block routed through this builder) model pure
1668    // shape modifications on top of parametric anchors. They must
1669    // never own a shared affine direction with the parametric
1670    // (time / marginal / logslope) blocks. The canonical-gauge
1671    // selector drops shared directions from blocks with lower
1672    // gauge_priority first; assigning a value below the parametric
1673    // default (GAUGE_PRIORITY_CANDIDATE_FLEX) realises that contract
1674    // automatically.
1675    spec.gauge_priority = match name {
1676        "link_dev" => GAUGE_PRIORITY_LINK_DEV,
1677        // score_warp_dev gets a slightly higher priority than link_dev
1678        // because in mixed-flex configurations (both blocks present)
1679        // link_dev is the residualised one (orthogonalised against the
1680        // parametric anchors PLUS the already-prepared score_warp
1681        // basis at construction time); link_dev should therefore yield
1682        // first when an alias still survives into the joint design.
1683        "score_warp_dev" => GAUGE_PRIORITY_SCORE_WARP_DEV,
1684        _ => GAUGE_PRIORITY_DEVIATION_DEFAULT,
1685    };
1686    Ok(spec)
1687}
1688
1689pub(crate) fn push_deviation_aux_blockspecs(
1690    blocks: &mut Vec<ParameterBlockSpec>,
1691    rho: &Array1<f64>,
1692    cursor: &mut usize,
1693    score_warp_prepared: Option<&DeviationPrepared>,
1694    link_dev_prepared: Option<&DeviationPrepared>,
1695    score_warp_beta_hint: Option<Array1<f64>>,
1696    link_dev_beta_hint: Option<Array1<f64>>,
1697) -> Result<(), String> {
1698    if let Some(prepared) = score_warp_prepared {
1699        let rho_h = rho
1700            .slice(s![*cursor..*cursor + prepared.block.penalties.len()])
1701            .to_owned();
1702        *cursor += prepared.block.penalties.len();
1703        blocks.push(build_deviation_aux_blockspec(
1704            "score_warp_dev",
1705            prepared,
1706            rho_h,
1707            score_warp_beta_hint,
1708        )?);
1709    }
1710    if let Some(prepared) = link_dev_prepared {
1711        let rho_w = rho
1712            .slice(s![*cursor..*cursor + prepared.block.penalties.len()])
1713            .to_owned();
1714        blocks.push(build_deviation_aux_blockspec(
1715            "link_dev",
1716            prepared,
1717            rho_w,
1718            link_dev_beta_hint,
1719        )?);
1720    }
1721    Ok(())
1722}
1723
1724fn inner_fit(
1725    family: &BernoulliMarginalSlopeFamily,
1726    blocks: &[ParameterBlockSpec],
1727    options: &BlockwiseFitOptions,
1728) -> Result<UnifiedFitResult, String> {
1729    let mut options = options.clone();
1730    // BMS carries fixed physical ridge penalties that regularize coefficient
1731    // geometry but are not REML coordinates. The exact hyper-Hessian route can
1732    // stall after that projection; the family has a dedicated exact-gradient
1733    // path with full-data polish, so make it the primary nested smoother.
1734    options.use_outer_hessian = false;
1735    options.outer_tol = options.outer_tol.max(2.0e-5);
1736    fit_custom_family(family, blocks, &options).map_err(|e| e.to_string())
1737}
1738
1739pub fn fit_bernoulli_marginal_slope_terms(
1740    data: ArrayView2<'_, f64>,
1741    spec: BernoulliMarginalSlopeTermSpec,
1742    options: &BlockwiseFitOptions,
1743    kappa_options: &SpatialLengthScaleOptimizationOptions,
1744    policy: &gam_runtime::resource::ResourcePolicy,
1745) -> Result<BernoulliMarginalSlopeFitResult, String> {
1746    let mut spec = spec;
1747    let data_view = data;
1748    validate_spec(data_view, &spec)?;
1749    // Freeze the measure-jet representer length-scale dial on the coupled
1750    // marginal + log-slope surfaces (#1116). A shared mjs basis feeds BOTH
1751    // blocks; a design-moving ℓ on those shared covariates lets the outer
1752    // search reach a sharp ℓ where a marginal smooth direction trades off
1753    // against the log-slope into a separation-scale runaway (|β|→1e3). The auto
1754    // (frozen) ℓ is well-conditioned here — the pre-#1116 behavior this
1755    // restores. ℓ-learning stays on for single-surface (e.g. Gaussian) fits,
1756    // where there is no marginal/log-slope coupling to destabilize.
1757    let mjs_frozen_marginal =
1758        gam_terms::smooth::freeze_measure_jet_length_scale_learning(&mut spec.marginalspec);
1759    let mjs_frozen_logslope =
1760        gam_terms::smooth::freeze_measure_jet_length_scale_learning(&mut spec.logslopespec);
1761    if mjs_frozen_marginal + mjs_frozen_logslope > 0 {
1762        log::info!(
1763            "[BMS spatial] froze measure-jet length-scale learning on {} marginal + {} log-slope \
1764             term(s): the coupled surface keeps ℓ at its conditioned auto value (#1116)",
1765            mjs_frozen_marginal,
1766            mjs_frozen_logslope
1767        );
1768    }
1769    let mut effective_kappa_options = kappa_options.clone();
1770    // Honor explicit `length_scale=X` in the user's formula: when every
1771    // spatial term in BOTH the marginal mean and log-slope blocks carries
1772    // a user-supplied scalar length scale and no per-axis anisotropy is
1773    // requested, there is nothing for the joint-spatial outer optimizer
1774    // to do. Routing through it anyway spends ~80 outer ARC iters stalled
1775    // at the user's chosen ρ (the n-block ARC's first proposed step lands
1776    // at the box corner and never recovers), then falls through to the
1777    // ρ-only "custom family" path which is what we wanted all along.
1778    // Short-circuit straight to the ρ-only path.
1779    let kappa_locked_marginal = gam_terms::smooth::all_spatial_terms_kappa_fixed(&spec.marginalspec);
1780    let kappa_locked_logslope = gam_terms::smooth::all_spatial_terms_kappa_fixed(&spec.logslopespec);
1781    if effective_kappa_options.enabled && kappa_locked_marginal && kappa_locked_logslope {
1782        log::info!(
1783            "[BMS spatial] disabling κ/ψ optimization: every spatial term has an \
1784             explicit length_scale and no anisotropy; user-supplied kernel scale is fixed"
1785        );
1786        effective_kappa_options.enabled = false;
1787    }
1788    let flex_spatial_pilot_path = (spec.score_warp.is_some() || spec.link_dev.is_some())
1789        && spec.y.len() >= BMS_FLEX_SPATIAL_OUTER_PILOT_ROW_THRESHOLD
1790        && effective_kappa_options.enabled;
1791    if flex_spatial_pilot_path {
1792        let marginal_terms = spatial_length_scale_term_indices(&spec.marginalspec);
1793        let logslope_terms = spatial_length_scale_term_indices(&spec.logslopespec);
1794        let marginal_updates = apply_spatial_anisotropy_pilot_initializer(
1795            data_view,
1796            &mut spec.marginalspec,
1797            &marginal_terms,
1798            effective_kappa_options.pilot_subsample_threshold,
1799            &effective_kappa_options,
1800        );
1801        let logslope_updates = apply_spatial_anisotropy_pilot_initializer(
1802            data_view,
1803            &mut spec.logslopespec,
1804            &logslope_terms,
1805            effective_kappa_options.pilot_subsample_threshold,
1806            &effective_kappa_options,
1807        );
1808        effective_kappa_options.enabled = false;
1809        log::info!(
1810            "[BMS spatial] n={} flex=true pilot_geometry_updates={} iterative_spatial_outer=false reason=large-flex-spatial-pilot",
1811            spec.y.len(),
1812            marginal_updates + logslope_updates,
1813        );
1814    }
1815    let (z_standardized, z_normalization) = standardize_latent_z_with_policy(
1816        &spec.z,
1817        &spec.weights,
1818        "bernoulli-marginal-slope",
1819        &spec.latent_z_policy,
1820    )?;
1821    spec.z = z_standardized;
1822    let sigma_learnable = matches!(
1823        &spec.frailty,
1824        FrailtySpec::GaussianShift { sigma_fixed: None }
1825    );
1826    let initial_sigma = match &spec.frailty {
1827        FrailtySpec::GaussianShift {
1828            sigma_fixed: Some(s),
1829        } => Some(*s),
1830        FrailtySpec::GaussianShift { sigma_fixed: None } => Some(0.5),
1831        FrailtySpec::None => None,
1832        FrailtySpec::HazardMultiplier { .. } => {
1833            return Err(
1834                "internal: validate_spec should have rejected unsupported marginal-slope frailty"
1835                    .to_string(),
1836            );
1837        }
1838    };
1839    let probit_scale = probit_frailty_scale(initial_sigma);
1840    let (_raw_joint_designs, mut joint_specs) = build_term_collection_designs_and_freeze_joint(
1841        data_view,
1842        &[spec.marginalspec.clone(), spec.logslopespec.clone()],
1843    )
1844    .map_err(|e| e.to_string())?;
1845    let marginalspec_boot = joint_specs.remove(0);
1846    let logslopespec_boot = joint_specs.remove(0);
1847    // Rebuild the probe designs from the frozen `*_boot` specs so the probe's
1848    // penalty topology matches the topology produced by every other build path
1849    // in this optimization. The spatial optimizer's own bootstrap
1850    // (`build_term_collection_designs_and_freeze_joint(data, &[marginalspec_boot,
1851    // logslopespec_boot])` inside `optimize_spatial_length_scale_exact_joint`)
1852    // and every subsequent kappa-driven rebuild feed the basis builder the
1853    // captured `FrozenTransform` identifiability. Applying that captured
1854    // transform to the same kernel can land the structural null-space block on
1855    // the other side of `build_nullspace_shrinkage_penalty`'s spectral
1856    // tolerance, so the raw and frozen builds disagree on whether the trend
1857    // ridge survives as an active penalty candidate. Without this rebuild,
1858    // `marginal_penalty_count` / `logslope_design.penalties.len()` are taken
1859    // from the raw build but every subsequent evaluator measures the frozen
1860    // build, and `evaluate_custom_family_joint_hyper` refuses with a
1861    // "joint hyper rho dimension mismatch". Mirrors the CTN-side fix in
1862    // `fit_transformation_normal`.
1863    let (mut joint_designs, _) = build_term_collection_designs_and_freeze_joint(
1864        data_view,
1865        &[marginalspec_boot.clone(), logslopespec_boot.clone()],
1866    )
1867    .map_err(|e| format!("failed to rebuild frozen probe BMS joint designs: {e}"))?;
1868    let marginal_design = joint_designs.remove(0);
1869    let logslope_design = joint_designs.remove(0);
1870    // #905: the conditional `E[z|C]`/`Var(z|C)` Rao gate conditions on the
1871    // marginal-index span a(C) (= the marginal design columns), which is
1872    // exactly where the `b(C)·m(C)` leakage lives. It is engaged only on the
1873    // raw-z path (no CTN Stage-1 influence absorber); when an absorber is
1874    // active the conditional leakage is already absorbed (#461) and the
1875    // widened-marginal predict seam must not be perturbed by replacing z.
1876    let absorber_active = spec
1877        .score_influence_jacobian
1878        .as_ref()
1879        .is_some_and(|j| j.ncols() > 0);
1880    let conditioning_dense = if absorber_active {
1881        None
1882    } else {
1883        Some(
1884            marginal_design
1885                .design
1886                .try_to_dense_arc("bernoulli marginal-slope conditional latent-z gate")?,
1887        )
1888    };
1889    let (latent_measure, latent_z_calibration) = build_latent_measure_with_geometry(
1890        &spec.z,
1891        &spec.weights,
1892        &spec.latent_z_policy,
1893        conditioning_dense.as_ref().map(|d| d.view()),
1894    )?;
1895    if latent_measure.is_empirical() && sigma_learnable {
1896        return Err("empirical latent-measure marginal-slope calibration requires fixed GaussianShift sigma; learnable sigma derivatives must be fit under the standard-normal latent measure"
1897                    .to_string());
1898    }
1899
1900    let y = Arc::new(spec.y.clone());
1901    let weights = Arc::new(spec.weights.clone());
1902    // Apply rank-INT calibration to training z before any downstream
1903    // consumer (pooled probit baseline, term-collection designs, the
1904    // family's PIRLS loops) sees it. The calibration is persisted on the
1905    // fit result so prediction applies the identical monotone map.
1906    let z = match &latent_z_calibration {
1907        LatentMeasureCalibration::None => Arc::new(spec.z.clone()),
1908        LatentMeasureCalibration::RankInverseNormal(cal) => {
1909            Arc::new(cal.apply_to_training(&spec.z)?)
1910        }
1911        LatentMeasureCalibration::ConditionalLocationScale(cal) => {
1912            // ζ = (z − m(C))/√v(C) on the marginal-index span. The conditioning
1913            // block was built above (raw-z path only), so it is present here.
1914            let a_block = conditioning_dense.as_ref().ok_or_else(|| {
1915                "conditional latent calibration requires the marginal conditioning block"
1916                    .to_string()
1917            })?;
1918            Arc::new(cal.apply(spec.z.view(), a_block.view())?)
1919        }
1920    };
1921    let z_train = z.as_ref();
1922    let pilot_baseline = pooled_probit_baseline(&spec.y, z_train, &spec.weights)?;
1923    let baseline = (
1924        bernoulli_marginal_slope_eta_from_probability(
1925            &spec.base_link,
1926            normal_cdf(pilot_baseline.0),
1927            "bernoulli marginal-slope baseline link inversion",
1928        )?,
1929        pilot_baseline.1 / probit_scale,
1930    );
1931
1932    // Score-warp basis construction is β-independent (identifiability is
1933    // provided by the smoothness-null-space drop on the basis transform,
1934    // not by a data-distribution moment anchor at the rigid-pilot η₀), so
1935    // the standard-normal and empirical latent-measure branches build the
1936    // same block. There is no row-weight pilot to thread into the basis;
1937    // the latent-measure split is enforced upstream via the empirical
1938    // intercept solve in `build_row_exact_context_with_stats`, not in the
1939    // deviation basis.
1940    // Score-warp basis is built first, then immediately reparameterised
1941    // against the parametric span (marginal + logslope columns at the n
1942    // training rows) so its column span is orthogonal to span(X_marginal,
1943    // X_logslope) by construction. This is the first half of the joint-
1944    // design identifiability invariant; the second half (link-deviation
1945    // orthogonalised against parametric + the now-reparameterised score-
1946    // warp) runs inside the link-deviation closure below. Together they
1947    // ensure `[X_marginal | X_logslope | Φ_score_warp · T_sw |
1948    // Φ_link_dev · T_lw]` has full numerical column rank, structurally
1949    // bounding `σ_min(joint H + S) ≥ λ_min(S) > 0` regardless of how β
1950    // drifts the linear predictor distribution during PIRLS.
1951    // Cross-block W-metric pilot. The joint penalised Hessian during PIRLS
1952    // uses the probit-style data Hessian row metric
1953    //
1954    //   W_pirls[i] = spec.weights[i] · φ(η_i)² / (μ_i·(1−μ_i))
1955    //
1956    // which is the canonical IRLS row weight. The cross-block
1957    // orthogonalisation below must use this metric (not uniform
1958    // spec.weights) so that `Aᵀ W C̃ = 0` holds in the same inner product
1959    // the joint Hessian sees — otherwise A and C̃ are merely Euclidean-
1960    // orthogonal, `Aᵀ W_pirls C̃ ≠ 0`, the joint Hessian carries a near-
1961    // null direction along the W-metric alias, and REML can drive the
1962    // flex block's λ small enough that the alias direction's joint
1963    // Hessian eigenvalue collapses. β then runs away along the alias
1964    // (manifest as `rho≈2.0`, constant `step_inf`, growing `beta_inf`
1965    // during PIRLS, and the inner solve hitting `inner_max_cycles`
1966    // without satisfying the KKT residual).
1967    //
1968    // Use the rigid pooled-probit pilot η for score-warp (its basis is
1969    // β-independent in z, so the rigid pilot suffices) and the one-GN-
1970    // stepped pilot η for link-deviation (its basis is evaluated at the
1971    // same eta_pilot used here, so the orthogonalisation metric matches
1972    // the basis evaluation point exactly). Both are β-independent so the
1973    // orthogonalisation remains a one-shot construction-time step.
1974    let rigid_pilot_eta = rigid_pooled_probit_pilot_eta(
1975        &spec.base_link,
1976        z_train,
1977        &spec.marginal_offset,
1978        &spec.logslope_offset,
1979        baseline.0,
1980        baseline.1,
1981        probit_scale,
1982    )?;
1983    let cross_block_pilot_w_score_warp =
1984        pilot_irls_hessian_row_metric_at_eta(&rigid_pilot_eta, &spec.weights);
1985
1986    // Absorbed Stage-1 influence columns (#461, design §3). When the workflow
1987    // chained a CTN Stage-1 into this marginal-slope fit, `spec.score_influence_
1988    // jacobian` carries the out-of-fold `J = ∂z/∂θ₁`; the realized leakage
1989    // directions `Z_infl = diag(s_f·β̂₀)·J` are residualised against the fitted
1990    // marginal+logslope target span and appended to the additive marginal-index
1991    // block as a fixed-ridge absorber, so the joint penalised solve makes the
1992    // (α,β) score orthogonal to the remaining nuisance span without letting the
1993    // absorber compete for identifiable β(x) signal. `None` ⇒ raw z, and the
1994    // free score_warp spline below is the x-free-column fallback. β̂₀(x_i) is
1995    // the rigid-pilot logslope `baseline.1 + logslope_offset[i]`; s_f =
1996    // probit_scale.
1997    let influence_columns = if let Some(jac) = spec
1998        .score_influence_jacobian
1999        .as_ref()
2000        .filter(|j| j.ncols() > 0)
2001    {
2002        let protected_design = DesignMatrix::hstack(vec![
2003            marginal_design.design.clone(),
2004            logslope_design.design.clone(),
2005        ])
2006        .map_err(|e| {
2007            format!(
2008                "bernoulli marginal-slope influence-block protected projection stack failed to concatenate marginal + logslope design: {e}"
2009            )
2010        })?;
2011        let protected_dense_for_proj = protected_design
2012            .try_to_dense_arc("bernoulli marginal-slope influence-block protected projection")?;
2013        let protected_dense = protected_dense_for_proj.as_ref();
2014        if jac.nrows() != protected_dense.nrows() {
2015            return Err(format!(
2016                "influence block: Jacobian has {} rows, protected design has {}",
2017                jac.nrows(),
2018                protected_dense.nrows()
2019            ));
2020        }
2021        // Z̃ = residualize(diag(s_f·β̂₀)·J) against the fitted target span in
2022        // the rigid-pilot W-metric.  For BMS the absorbed columns are installed
2023        // in the same additive predictor as the marginal surface; if we protect
2024        // only M, any component of Z_infl aligned with the logslope design G can
2025        // be assigned to the fixed-ridge absorber by the joint solve, erasing
2026        // genuine β(x) heterogeneity.  Projecting out [M | G] keeps the nuisance
2027        // absorber orthogonal to both parametric target surfaces while still
2028        // absorbing Stage-1 leakage directions outside that identifiable target
2029        // span. β̂₀(x_i) = baseline.1 + logslope_offset[i]; s_f = probit_scale;
2030        // W = the rigid-pilot PIRLS row metric.
2031        let rigid_logslope_at_rows = &spec.logslope_offset + baseline.1;
2032        let residualized = crate::marginal_slope_orthogonal::residualized_influence_block(
2033            jac,
2034            z_train,
2035            &rigid_logslope_at_rows,
2036            probit_scale,
2037            protected_dense.view(),
2038            &cross_block_pilot_w_score_warp,
2039        )?;
2040        Some(residualized)
2041    } else {
2042        None
2043    };
2044    let mut cross_block_warnings: Vec<CrossBlockIdentifiabilityWarning> = Vec::new();
2045    let score_warp_prepared = if let Some(cfg) = spec.score_warp.as_ref() {
2046        use super::deviation_runtime::ParametricAnchorBlock;
2047        let mut prepared = build_score_warp_deviation_block_from_seed(z_train, cfg)?;
2048        // `install_compiled_flex_block_into_runtime` now delegates
2049        // its math body to `identifiability::families::compiler::compile` (commit
2050        // 4e20b8dc8); the prior Phase-4a shadow compile here was a
2051        // duplicate of that internal call and has been removed.
2052        let outcome = install_compiled_flex_block_into_runtime(
2053            &mut prepared,
2054            z_train,
2055            cfg,
2056            &[
2057                (&marginal_design.design, ParametricAnchorBlock::Marginal),
2058                (&logslope_design.design, ParametricAnchorBlock::Logslope),
2059            ],
2060            &[],
2061            &cross_block_pilot_w_score_warp,
2062        )?;
2063        match outcome {
2064            FlexCompileOutcome::Reparameterised => Some(prepared),
2065            FlexCompileOutcome::FullyAliased { reason } => {
2066                // Record via the structured channel. Keep the original
2067                // (non-compiled) design so the unified audit sees score_warp_dev
2068                // and attributes the drop via dropped_columns (gauge_priority=80
2069                // is below marginal=150 / logslope=120, so RRQR correctly
2070                // demotes score_warp_dev when it aliases those blocks).
2071                cross_block_warnings.push(CrossBlockIdentifiabilityWarning {
2072                    candidate_label: "score_warp",
2073                    anchor_summary: "marginal+logslope".to_string(),
2074                    reason,
2075                });
2076                Some(prepared)
2077            }
2078        }
2079    } else {
2080        None
2081    };
2082    // Build the link-deviation block. The basis lives in η-space, and at
2083    // PIRLS time `runtime.design(η_current)` is re-evaluated at the
2084    // current β-dependent η, so the basis is genuinely β-dependent during
2085    // optimisation. The construction-time seed is used only for (a) knot
2086    // placement in η-space and (b) the cross-block identifiability check
2087    // that computes the basis-space transform `T` orthogonalising the
2088    // candidate against the parametric and score-warp anchors at training
2089    // rows.
2090    //
2091    // Using the rigid pooled probit pilot directly (`q0 = a₀·√(…) + s_f·
2092    // b₀·z`) is structurally degenerate: with zero per-row offsets it is
2093    // affine in z, so a degree-3 I-spline of `q0` spans the same column
2094    // space at training rows as a degree-3 I-spline of z, and the cross-
2095    // block check finds the candidate fully aliased by the score-warp
2096    // anchor even though at any non-rigid β the link-deviation carries
2097    // PC/age structure the score-warp cannot represent.
2098    //
2099    // Instead, seed both knot placement and the orthogonalisation pivot at
2100    // a non-rigid pilot η computed via one probit Gauss-Newton step from
2101    // the rigid pilot onto the full marginal design (see
2102    // `pilot_eta_for_link_dev_orthogonalisation`). The pilot is row-varying
2103    // in PCs/age and the resulting `T` drops only directions aliased
2104    // across all β. The score-warp basis at training rows is also threaded
2105    // in as a flex anchor when active so the kept directions are jointly
2106    // orthogonal to parametric ⊕ score-warp.
2107    let link_dev_prepared = if let Some(cfg) = spec.link_dev.as_ref() {
2108        let eta_pilot = pilot_eta_for_link_dev_orthogonalisation(
2109            &spec.base_link,
2110            &spec.y,
2111            z_train,
2112            &spec.weights,
2113            &marginal_design.design,
2114            &spec.marginal_offset,
2115            &spec.logslope_offset,
2116            baseline.0,
2117            baseline.1,
2118            probit_scale,
2119        )?;
2120        let link_dev_seed = padded_deviation_seed(&eta_pilot, 1.0, 0.5);
2121        let mut prepared = build_link_deviation_block_from_knots_design_seed_and_weights(
2122            &link_dev_seed,
2123            &eta_pilot,
2124            cfg,
2125        )?;
2126        // Cross-block identifiability for the link-deviation basis. The
2127        // anchor union covers BOTH possible aliasing channels:
2128        //
2129        //  - Parametric: location and logslope designs evaluated at the n
2130        //    training rows. Columns of `Φ_link_dev(q0)` that reproduce
2131        //    parametric features become null-direction targets in the
2132        //    joint penalised Hessian since `S_link_dev` has no mass on
2133        //    them.
2134        //
2135        //  - Score-warp (when active): the now-reparameterised score-warp
2136        //    basis, also evaluated at training rows. Both flex bases are
2137        //    cubic I-spline cubic combinations of an η-pilot scalar, and
2138        //    even with each block's own smoothness-null-space drop their
2139        //    column spans can still overlap inside the orthogonal
2140        //    complement of `{1, η_pilot}`.
2141        //
2142        // After the orthogonalisation, `[X_marginal | X_logslope |
2143        // Φ_score_warp · T_sw | Φ_link_dev · T_lw]` has full numerical
2144        // column rank at training rows, so `σ_min(joint H+S) ≥ λ_min(S)
2145        // > 0` for every β. This is the standard GAM `gam.side`
2146        // convention generalised to multi-anchor unions (mgcv applies it
2147        // sequentially across smooths sharing a covariate).
2148        // When `install_compiled_flex_block_into_runtime`
2149        // reparameterised the score-warp runtime against the parametric
2150        // anchor union (marginal + logslope), it installed an
2151        // `anchor_residual` and cached the training-row parametric
2152        // anchor matrix on the runtime. `runtime.design()` on a
2153        // residualised runtime returns the *raw* basis evaluation,
2154        // which `assert`s the caller hasn't conflated with the
2155        // reparameterised basis — we want the reparameterised one
2156        // here, so go through `design_at_training_with_residual` so
2157        // the cached anchor rows are folded in. For score-warp
2158        // configurations where reparameterisation was a no-op (no
2159        // residual installed) the same call falls back to the raw
2160        // `design()` path, so the residual-vs-no-residual branches
2161        // converge on the right matrix.
2162        let score_warp_anchor_design = score_warp_prepared
2163            .as_ref()
2164            .map(|sw| sw.runtime.design_at_training_with_residual(z_train))
2165            .transpose()?;
2166        use super::deviation_runtime::ParametricAnchorBlock;
2167        let parametric_anchors: [(&DesignMatrix, ParametricAnchorBlock); 2] = [
2168            (&marginal_design.design, ParametricAnchorBlock::Marginal),
2169            (&logslope_design.design, ParametricAnchorBlock::Logslope),
2170        ];
2171        let flex_anchor_slot: Option<&Array2<f64>> = score_warp_anchor_design.as_ref();
2172        let flex_anchors: Vec<&Array2<f64>> = flex_anchor_slot.into_iter().collect();
2173        // W-metric for link-deviation orthogonalisation: same IRLS-style
2174        // probit Hessian row weight as the score-warp path, but evaluated at
2175        // `eta_pilot` (the one-GN-stepped pilot at which the link-dev basis
2176        // itself is anchored).
2177        let cross_block_pilot_w_link_dev =
2178            pilot_irls_hessian_row_metric_at_eta(&eta_pilot, &spec.weights);
2179        let outcome = install_compiled_flex_block_into_runtime(
2180            &mut prepared,
2181            &eta_pilot,
2182            cfg,
2183            &parametric_anchors,
2184            &flex_anchors,
2185            &cross_block_pilot_w_link_dev,
2186        )?;
2187        match outcome {
2188            FlexCompileOutcome::Reparameterised => Some(prepared),
2189            FlexCompileOutcome::FullyAliased { reason } => {
2190                // Record via the structured channel. Keep the original
2191                // (non-compiled) design so the unified audit sees link_dev
2192                // and attributes the drop via dropped_columns (gauge_priority=60
2193                // is below all parametric blocks so RRQR correctly demotes
2194                // link_dev when it aliases marginal / logslope / score_warp).
2195                cross_block_warnings.push(CrossBlockIdentifiabilityWarning {
2196                    candidate_label: "link_deviation",
2197                    anchor_summary: "marginal+logslope+score_warp".to_string(),
2198                    reason,
2199                });
2200                Some(prepared)
2201            }
2202        }
2203    } else {
2204        None
2205    };
2206    let extra_rho0 = {
2207        let mut out = Vec::new();
2208        if let Some(ref prepared) = score_warp_prepared {
2209            out.extend(std::iter::repeat_n(0.0, prepared.block.penalties.len()));
2210        }
2211        if let Some(ref prepared) = link_dev_prepared {
2212            out.extend(std::iter::repeat_n(0.0, prepared.block.penalties.len()));
2213        }
2214        out
2215    };
2216    // Reduced-basis orthogonalisation of the logslope design through the BMS
2217    // family's OWN internal `logslope_design` geometry (robust cure for the
2218    // marginal↔logslope structural confound). Robustness is unconditional, so we
2219    // always reparameterize the logslope coordinate space to a full-rank reduced
2220    // basis `T` whose effective weighted columns are W-orthogonal to the marginal
2221    // span at the rigid pilot — removing the rank-soft confounded direction the
2222    // former pinned overlap ridge merely penalised. The transform is
2223    // β/ρ-independent (pilot geometry only), so it is a one-shot construction-
2224    // time map applied to every per-iteration logslope design inside
2225    // `build_blocks` / `make_family`, and inverted at fit-result assembly so the
2226    // reported logslope β is in the original basis. `None` ⇒ nothing to reduce
2227    // (no rank-soft confounded direction) ⇒ raw design used everywhere.
2228    let logslope_reduced_reparam: Option<ReducedLogslopeReparam> = build_reduced_logslope_reparam(
2229        &marginal_design,
2230        &logslope_design,
2231        z.as_ref(),
2232        &cross_block_pilot_w_score_warp,
2233        &spec.marginal_offset,
2234        &spec.logslope_offset,
2235        baseline.0,
2236        baseline.1,
2237        probit_scale,
2238    )?;
2239    // Apply the reduced reparam to a logslope `TermCollectionDesign`, or return
2240    // the raw design clone when the reparam is absent (flag off / nothing to
2241    // reduce). Used by both `build_blocks` and `make_family` so the family's
2242    // internal design, the block design, β width, jacobian, penalty, and the
2243    // `validate_exact_block_state_shapes` check all agree at the reduced width.
2244    let reduce_logslope_design =
2245        |logslope_design: &TermCollectionDesign| -> Result<TermCollectionDesign, String> {
2246            match logslope_reduced_reparam.as_ref() {
2247                Some(reparam) => reparameterize_logslope_design_reduced(logslope_design, reparam),
2248                None => Ok(logslope_design.clone()),
2249            }
2250        };
2251
2252    let marginal_penalty_count = marginal_design.penalties.len();
2253    let setup = joint_setup(
2254        data_view,
2255        &marginalspec_boot,
2256        &logslopespec_boot,
2257        marginal_penalty_count,
2258        logslope_design.penalties.len(),
2259        &extra_rho0,
2260        &effective_kappa_options,
2261    );
2262    let setup = if sigma_learnable {
2263        setup.with_auxiliary(
2264            Array1::from_vec(vec![initial_sigma.expect("learnable sigma seed").ln()]),
2265            Array1::from_vec(vec![0.01_f64.ln()]),
2266            Array1::from_vec(vec![5.0_f64.ln()]),
2267        )
2268    } else {
2269        setup
2270    };
2271    let final_sigma_cell = std::cell::Cell::new(initial_sigma);
2272    let exact_warm_start = RefCell::new(None::<CustomFamilyWarmStart>);
2273    let runaway_error = RefCell::new(None::<String>);
2274    // Outer ρ-cache β-seed staging slot. On a cache hit the spatial-joint
2275    // optimizer invokes `seed_inner_beta_fn` before the first eval at the
2276    // restored ρ: per-block column widths aren't known until the first
2277    // `build_blocks(rho, …)` runs, so we stash the flat β here and the eval
2278    // closures promote it into `exact_warm_start` (the slot the inner
2279    // PIRLS / Newton solve actually consumes) on their first invocation.
2280    let pending_beta_seed = RefCell::new(None::<Array1<f64>>);
2281    let hints = RefCell::new(ThetaHints::default());
2282    let score_warp_runtime = score_warp_prepared.as_ref().map(|p| p.runtime.clone());
2283    let link_dev_runtime = link_dev_prepared.as_ref().map(|p| p.runtime.clone());
2284
2285    let build_blocks = |rho: &Array1<f64>,
2286                        marginal_design: &TermCollectionDesign,
2287                        logslope_design: &TermCollectionDesign|
2288     -> Result<Vec<ParameterBlockSpec>, String> {
2289        let hints = hints.borrow();
2290        let mut cursor = 0usize;
2291        // Reduced-basis orthogonalisation: replace the per-iteration logslope
2292        // design with its full-rank reduced reparameterization `G·T` (flag ON);
2293        // a no-op clone when off. The reduced design carries the SAME number of
2294        // penalties (each S → Tᵀ S T), so the `rho_logslope` slice width below
2295        // is unchanged. Every consumer (marginal jacobian's c_i, logslope
2296        // blockspec design/β/penalty/jacobian) now agrees at the reduced width.
2297        let logslope_design_reduced = reduce_logslope_design(logslope_design)?;
2298        let logslope_design = &logslope_design_reduced;
2299        // Fixed #754/#461 ridges are appended inside
2300        // `marginal_penalties_with_influence_ridge` as physical penalties and
2301        // are excluded from `rho`; only genuine REML-learned smooth penalties
2302        // appear in the spatial joint setup.
2303        let rho_marginal = rho
2304            .slice(s![cursor..cursor + marginal_design.penalties.len()])
2305            .to_owned();
2306        cursor += marginal_design.penalties.len();
2307        let rho_logslope = rho
2308            .slice(s![cursor..cursor + logslope_design.penalties.len()])
2309            .to_owned();
2310        cursor += logslope_design.penalties.len();
2311        let p_m = marginal_design.design.ncols()
2312            + influence_columns.as_ref().map(|z| z.ncols()).unwrap_or(0);
2313        let mut blocks = vec![
2314            build_marginal_blockspec_bms(
2315                marginal_design,
2316                baseline.0,
2317                &spec.marginal_offset,
2318                rho_marginal,
2319                hints.marginal_beta.clone(),
2320                logslope_design,
2321                &spec.logslope_offset,
2322                baseline.1,
2323                p_m,
2324                influence_columns.as_ref(),
2325                influence_absorber_log_lambda(spec.z.len()),
2326            )?,
2327            build_logslope_blockspec_bms(
2328                logslope_design,
2329                baseline.1,
2330                &spec.logslope_offset,
2331                rho_logslope,
2332                hints.logslope_beta.clone(),
2333                marginal_design,
2334                &spec.marginal_offset,
2335                baseline.0,
2336                Arc::clone(&z),
2337                p_m,
2338                influence_columns.as_ref(),
2339            )?,
2340        ];
2341        push_deviation_aux_blockspecs(
2342            &mut blocks,
2343            rho,
2344            &mut cursor,
2345            score_warp_prepared.as_ref(),
2346            link_dev_prepared.as_ref(),
2347            hints.score_warp_beta.clone(),
2348            hints.link_dev_beta.clone(),
2349        )?;
2350        Ok(blocks)
2351    };
2352
2353    let intercept_warm_starts = new_intercept_warm_start_cache(y.len());
2354    let cell_moment_lru = new_cell_moment_lru_cache(policy);
2355    let cell_moment_cache_stats = new_cell_moment_cache_stats();
2356    let make_family = |marginal_design: &TermCollectionDesign,
2357                       logslope_design: &TermCollectionDesign,
2358                       sigma: Option<f64>|
2359     -> BernoulliMarginalSlopeFamily {
2360        // The kernel reads the marginal index from a matched (self.marginal_
2361        // design, β_m) pair. When the Stage-1 influence absorber is active the
2362        // marginal β is widened to [β_m; γ], so the family's marginal design
2363        // MUST be the widened [M | Z̃] for every per-row projection to slice
2364        // correctly (#461). With no absorber it is the raw design unchanged.
2365        let kernel_marginal_design = match influence_columns.as_ref() {
2366            Some(z_infl) => {
2367                let raw = marginal_design
2368                    .design
2369                    .try_to_dense_arc("make_family::widened-marginal")
2370                    .expect("dense marginal design for influence widening");
2371                let widened = widen_marginal_dense_with_influence(&raw, Some(z_infl))
2372                    .expect("widen marginal design with influence columns");
2373                DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from((*widened).clone()))
2374            }
2375            None => marginal_design.design.clone(),
2376        };
2377        // The family's row kernel reconstructs η_logslope = G·β_s and the
2378        // logslope Jacobian factor_i·G_i from this matched (logslope_design,
2379        // β_s) pair, so it MUST be the SAME reduced design `G·T` the block specs
2380        // fit against — otherwise β_s (reduced width) and the family design
2381        // (full width) desync. A no-op clone when the reparam is absent.
2382        let kernel_logslope_design = reduce_logslope_design(logslope_design)
2383            .expect("reduce logslope design for family construction")
2384            .design;
2385        BernoulliMarginalSlopeFamily {
2386            y: Arc::clone(&y),
2387            weights: Arc::clone(&weights),
2388            z: Arc::clone(&z),
2389            latent_measure: latent_measure.clone(),
2390            gaussian_frailty_sd: sigma,
2391            base_link: spec.base_link.clone(),
2392            marginal_design: kernel_marginal_design,
2393            logslope_design: kernel_logslope_design,
2394            score_warp: score_warp_runtime.clone(),
2395            link_dev: link_dev_runtime.clone(),
2396            policy: policy.clone(),
2397            cell_moment_lru: Arc::clone(&cell_moment_lru),
2398            cell_moment_cache_stats: Arc::clone(&cell_moment_cache_stats),
2399            intercept_warm_starts: Some(Arc::clone(&intercept_warm_starts)),
2400            auto_subsample_phase_counter: Arc::new(std::sync::atomic::AtomicUsize::new(0)),
2401            auto_subsample_last_rho: Arc::new(Mutex::new(None)),
2402        }
2403    };
2404
2405    let marginal_terms = spatial_length_scale_term_indices(&marginalspec_boot);
2406    let logslope_terms = spatial_length_scale_term_indices(&logslopespec_boot);
2407    let marginal_has_spatial = !marginal_terms.is_empty();
2408    let logslope_has_spatial = !logslope_terms.is_empty();
2409    let analytic_joint_derivatives_available =
2410        marginal_has_spatial || logslope_has_spatial || setup.log_kappa_dim() == 0;
2411    if setup.log_kappa_dim() > 0 && !analytic_joint_derivatives_available {
2412        return Err("exact bernoulli marginal-slope spatial optimization requires analytic joint psi derivatives"
2413                    .to_string());
2414    }
2415    let initial_rho = setup.theta0().slice(s![..setup.rho_dim()]).to_owned();
2416    let initial_blocks = build_blocks(&initial_rho, &marginal_design, &logslope_design)?;
2417    let initial_family = make_family(&marginal_design, &logslope_design, initial_sigma);
2418    let (joint_gradient, joint_hessian) =
2419        custom_family_outer_derivatives(&initial_family, &initial_blocks, options);
2420    let analytic_joint_gradient_available = analytic_joint_derivatives_available
2421        && matches!(joint_gradient, gam_problem::Derivative::Analytic);
2422    // Keep the analytic outer Hessian advertised at large scale. The
2423    // row-tensor terms below are represented through block-local
2424    // `HyperOperator`s and cached exact-Hessian workspaces, so ARC/trust-region
2425    // can consume exact HVPs without falling back to BFGS merely because the
2426    // realized problem is large.
2427    let analytic_joint_hessian_available =
2428        analytic_joint_derivatives_available && joint_hessian.is_analytic();
2429    let kappa_options_ref: &SpatialLengthScaleOptimizationOptions = &effective_kappa_options;
2430    let sigma_from_theta = |theta: &Array1<f64>| -> Option<f64> {
2431        if sigma_learnable {
2432            Some(theta[setup.rho_dim() + setup.log_kappa_dim()].exp())
2433        } else {
2434            initial_sigma
2435        }
2436    };
2437    let derivative_block_cache = RefCell::new(
2438        None::<(
2439            Array1<f64>,
2440            Arc<Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>>,
2441        )>,
2442    );
2443    let theta_matches = |left: &Array1<f64>, right: &Array1<f64>| -> bool {
2444        left.len() == right.len()
2445            && left
2446                .iter()
2447                .zip(right.iter())
2448                .all(|(lhs, rhs)| (*lhs - *rhs).abs() <= 1e-12 * (1.0 + lhs.abs().max(rhs.abs())))
2449    };
2450    let get_derivative_blocks = |theta: &Array1<f64>,
2451                                 specs: &[TermCollectionSpec],
2452                                 designs: &[TermCollectionDesign]|
2453     -> Result<
2454        Arc<Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>>,
2455        String,
2456    > {
2457        if let Some((cached_theta, cached_blocks)) = derivative_block_cache.borrow().as_ref()
2458            && theta_matches(cached_theta, theta)
2459        {
2460            return Ok(Arc::clone(cached_blocks));
2461        }
2462
2463        let built = |specs: &[TermCollectionSpec],
2464                     designs: &[TermCollectionDesign]|
2465         -> Result<
2466            Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>,
2467            String,
2468        > {
2469            let marginal_psi_derivs = if marginal_has_spatial {
2470                build_block_spatial_psi_derivatives(data_view, &specs[0], &designs[0])?.ok_or_else(
2471                    || {
2472                        "bernoulli marginal-slope: marginal block has spatial terms \
2473                         but spatial psi derivatives are unavailable"
2474                            .to_string()
2475                    },
2476                )?
2477            } else {
2478                Vec::new()
2479            };
2480            let logslope_psi_derivs = if logslope_has_spatial {
2481                build_block_spatial_psi_derivatives(data_view, &specs[1], &designs[1])?.ok_or_else(
2482                    || {
2483                        "bernoulli marginal-slope: logslope block has spatial terms \
2484                         but spatial psi derivatives are unavailable"
2485                            .to_string()
2486                    },
2487                )?
2488            } else {
2489                Vec::new()
2490            };
2491            let mut derivative_blocks = vec![marginal_psi_derivs, logslope_psi_derivs];
2492            if score_warp_runtime.is_some() {
2493                derivative_blocks.push(Vec::new());
2494            }
2495            if link_dev_runtime.is_some() {
2496                derivative_blocks.push(Vec::new());
2497            }
2498            if sigma_learnable {
2499                derivative_blocks
2500                    .last_mut()
2501                    .expect("bernoulli derivative block list is non-empty")
2502                    .push(crate::custom_family::CustomFamilyBlockPsiDerivative::new(
2503                        None,
2504                        Array2::zeros((0, 0)),
2505                        Array2::zeros((0, 0)),
2506                        None,
2507                        None,
2508                        None,
2509                        None,
2510                    ));
2511            }
2512            Ok(derivative_blocks)
2513        }(specs, designs)?;
2514        let built = Arc::new(built);
2515        derivative_block_cache.replace(Some((theta.clone(), Arc::clone(&built))));
2516        Ok(built)
2517    };
2518
2519    // Bernoulli marginal-slope is a multi-block family with β-dependent
2520    // joint Hessian: EFS/HybridEFS fixed-point structural invariant fails,
2521    // so we disable fixed-point at plan time rather than burning cycles on
2522    // a stalled first attempt that silently falls back.
2523    let outer_policy = {
2524        let psi_dim = setup.theta0().len() - setup.rho_dim();
2525        initial_family.outer_derivative_policy(&initial_blocks, psi_dim, options)
2526    };
2527    let exact_spatial_outer_tol = kappa_options_ref.rel_tol.max(EXACT_SPATIAL_OUTER_TOL_FLOOR);
2528    let solved = optimize_spatial_length_scale_exact_joint(
2529        data_view,
2530        &[marginalspec_boot.clone(), logslopespec_boot.clone()],
2531        &[marginal_terms.clone(), logslope_terms.clone()],
2532        kappa_options_ref,
2533        &setup,
2534        gam_solve::seeding::SeedRiskProfile::GeneralizedLinear,
2535        analytic_joint_gradient_available,
2536        analytic_joint_hessian_available,
2537        true,
2538        None,
2539        outer_policy,
2540        |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
2541            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
2542                return Err(err);
2543            }
2544            assert_eq!(
2545                specs.len(),
2546                designs.len(),
2547                "spatial joint optimizer must supply one spec per design",
2548            );
2549            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
2550            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
2551            let sigma = sigma_from_theta(theta);
2552            final_sigma_cell.set(sigma);
2553            let family = make_family(&designs[0], &designs[1], sigma);
2554            let fit = inner_fit(&family, &blocks, options)?;
2555            if let Some(block) = fit.block_states.first()
2556                && let Some(err) = bernoulli_marginal_slope_runaway_error_from_beta(
2557                    block.beta.view(),
2558                    &designs[0],
2559                    &specs[0],
2560                    fit.outer_converged,
2561                    "final fit",
2562                )
2563            {
2564                runaway_error.replace(Some(err.clone()));
2565                return Err(err);
2566            }
2567            let mut hints_mut = hints.borrow_mut();
2568            let mut bidx = 0usize;
2569            if let Some(block) = fit.block_states.get(bidx) {
2570                hints_mut.marginal_beta = Some(block.beta.clone());
2571            }
2572            bidx += 1;
2573            if let Some(block) = fit.block_states.get(bidx) {
2574                hints_mut.logslope_beta = Some(block.beta.clone());
2575            }
2576            bidx += 1;
2577            if score_warp_prepared.is_some() {
2578                if let Some(block) = fit.block_states.get(bidx) {
2579                    hints_mut.score_warp_beta = Some(block.beta.clone());
2580                }
2581                bidx += 1;
2582            }
2583            if link_dev_prepared.is_some()
2584                && let Some(block) = fit.block_states.get(bidx)
2585            {
2586                hints_mut.link_dev_beta = Some(block.beta.clone());
2587            }
2588            Ok(fit)
2589        },
2590        |theta,
2591         specs: &[TermCollectionSpec],
2592         designs: &[TermCollectionDesign],
2593         eval_mode,
2594         row_set: &crate::row_kernel::RowSet| {
2595            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
2596                return Err(err);
2597            }
2598            use gam_problem::EvalMode;
2599            // One-shot row-measure waypoint. This closure runs on EVERY outer
2600            // objective evaluation (value/gradient/Hessian probes, line-search
2601            // cost-only probes, EFS evals), so an unconditional per-eval line
2602            // floods the biobank fit log with thousands of near-identical
2603            // entries. The bridge already emits a timed `[STAGE] outer eval`
2604            // marker per eval; this one records the row-measure exactly once.
2605            static BMS_OUTER_EVAL_ROWSET_LOGGED: std::sync::Once = std::sync::Once::new();
2606            BMS_OUTER_EVAL_ROWSET_LOGGED.call_once(|| {
2607                let row_set_rows = match row_set {
2608                    crate::row_kernel::RowSet::All => spec.y.len(),
2609                    crate::row_kernel::RowSet::Subsample { rows, .. } => rows.len(),
2610                };
2611                log::debug!(
2612                    "[BMS exact outer eval] mode={eval_mode:?} row_set_rows={row_set_rows}"
2613                );
2614            });
2615            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
2616            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
2617            // Promote a staged β seed (deposited by the outer ρ-cache hit
2618            // before any eval ran) into the family warm-start slot now that
2619            // we know the per-block widths from the freshly built blocks.
2620            if let Some(beta_seed) = pending_beta_seed.borrow_mut().take() {
2621                let widths: Vec<usize> = blocks.iter().map(|b| b.design.ncols()).collect();
2622                match CustomFamilyWarmStart::from_cached_beta(&widths, &beta_seed) {
2623                    Ok(ws) => {
2624                        exact_warm_start.replace(Some(ws));
2625                    }
2626                    Err(e) => {
2627                        log::warn!(
2628                            "[BMS] outer ρ-cache β-warm-start rejected: {e}; falling back to cold β"
2629                        );
2630                    }
2631                }
2632            }
2633            let sigma = sigma_from_theta(theta);
2634            final_sigma_cell.set(sigma);
2635            let family = make_family(&designs[0], &designs[1], sigma);
2636            let derivative_blocks = get_derivative_blocks(theta, specs, designs)?;
2637            // Downgrade to ValueAndGradient when the caller asks for a
2638            // Hessian we can't provide; preserve ValueOnly probes for
2639            // line-search cost-only evaluation.
2640            let effective_mode = match eval_mode {
2641                EvalMode::ValueGradientHessian if !analytic_joint_hessian_available => {
2642                    EvalMode::ValueAndGradient
2643                }
2644                other => other,
2645            };
2646            let mut eval_options =
2647                joint_hyper_options_for_outer_tolerance(options, exact_spatial_outer_tol);
2648            if let crate::row_kernel::RowSet::Subsample { rows, n_full } = row_set {
2649                let subsample = crate::outer_subsample::OuterScoreSubsample::from_weighted_rows(
2650                    rows.as_ref().clone(),
2651                    *n_full,
2652                    0,
2653                );
2654                eval_options.outer_score_subsample = Some(Arc::new(subsample));
2655                eval_options.auto_outer_subsample = false;
2656            }
2657            let eval = evaluate_custom_family_joint_hyper_shared(
2658                &family,
2659                &blocks,
2660                &eval_options,
2661                &rho,
2662                derivative_blocks,
2663                exact_warm_start.borrow().as_ref(),
2664                effective_mode,
2665            )?;
2666            if let Some(err) = bernoulli_marginal_slope_runaway_error(
2667                &eval.warm_start,
2668                &designs[0],
2669                &specs[0],
2670                eval.inner_converged,
2671                "exact outer evaluation",
2672            ) {
2673                runaway_error.replace(Some(err.clone()));
2674                return Err(err);
2675            }
2676            exact_warm_start.replace(Some(eval.warm_start.clone()));
2677            if !eval.inner_converged {
2678                return Err(
2679                    "exact bernoulli marginal-slope inner solve did not converge".to_string(),
2680                );
2681            }
2682            if matches!(eval_mode, EvalMode::ValueGradientHessian)
2683                && analytic_joint_hessian_available
2684                && !eval.outer_hessian.is_analytic()
2685            {
2686                return Err("exact bernoulli marginal-slope joint [rho, psi] objective did not return an outer Hessian"
2687                            .to_string());
2688            }
2689            Ok((eval.objective, eval.gradient, eval.outer_hessian))
2690        },
2691        |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
2692            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
2693                return Err(err);
2694            }
2695            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
2696            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
2697            if let Some(beta_seed) = pending_beta_seed.borrow_mut().take() {
2698                let widths: Vec<usize> = blocks.iter().map(|b| b.design.ncols()).collect();
2699                match CustomFamilyWarmStart::from_cached_beta(&widths, &beta_seed) {
2700                    Ok(ws) => {
2701                        exact_warm_start.replace(Some(ws));
2702                    }
2703                    Err(e) => {
2704                        log::warn!(
2705                            "[BMS] outer ρ-cache β-warm-start rejected (efs): {e}; falling back to cold β"
2706                        );
2707                    }
2708                }
2709            }
2710            let sigma = sigma_from_theta(theta);
2711            final_sigma_cell.set(sigma);
2712            let family = make_family(&designs[0], &designs[1], sigma);
2713            let derivative_blocks = get_derivative_blocks(theta, specs, designs)?;
2714            let eval = evaluate_custom_family_joint_hyper_efs_shared(
2715                &family,
2716                &blocks,
2717                &joint_hyper_options_for_outer_tolerance(options, exact_spatial_outer_tol),
2718                &rho,
2719                derivative_blocks,
2720                exact_warm_start.borrow().as_ref(),
2721            )?;
2722            if let Some(err) = bernoulli_marginal_slope_runaway_error(
2723                &eval.warm_start,
2724                &designs[0],
2725                &specs[0],
2726                eval.inner_converged,
2727                "EFS outer evaluation",
2728            ) {
2729                runaway_error.replace(Some(err.clone()));
2730                return Err(err);
2731            }
2732            exact_warm_start.replace(Some(eval.warm_start.clone()));
2733            if !eval.inner_converged {
2734                return Err(
2735                    "exact bernoulli marginal-slope EFS inner solve did not converge".to_string(),
2736                );
2737            }
2738            Ok(eval.efs_eval)
2739        },
2740        crate::marginal_slope_shared::make_beta_seed_validator(&pending_beta_seed),
2741    )?;
2742
2743    let mut resolved_specs = solved.resolved_specs;
2744    let mut designs = solved.designs;
2745    // Reduced-basis round-trip (robust cure). When the logslope design was
2746    // orthogonalised to a reduced basis `G·T`, the fitted logslope coefficient
2747    // `β'` lives in the reduced coordinates (width `r`). The returned
2748    // `logslope_design` / `logslopespec_resolved` are the ORIGINAL full-width
2749    // basis (prediction rebuilds full `G` from the resolved spec), so map the
2750    // reported logslope coefficients back to the original basis `β_logslope =
2751    // T·β'` (predictor-identical: `G·(T·β') = (G·T)·β'`). The marginal block,
2752    // aux blocks, and the internal reduced-width flat β/geometry are untouched;
2753    // only the per-block reported logslope coefficients (blocks[1] and
2754    // block_states[1]) — which prediction/reporting consume against the full
2755    // design — are lifted to full width.
2756    let mut solved_fit = solved.fit;
2757    if let Some(reparam) = logslope_reduced_reparam.as_ref() {
2758        let r = reparam.reduced_cols();
2759        if let Some(block) = solved_fit.blocks.get_mut(1)
2760            && block.beta.len() == r
2761        {
2762            block.beta = reparam.recover_original_logslope_beta(&block.beta)?;
2763        }
2764        if let Some(state) = solved_fit.block_states.get_mut(1)
2765            && state.beta.len() == r
2766        {
2767            state.beta = reparam.recover_original_logslope_beta(&state.beta)?;
2768        }
2769    }
2770    // #905 GENERATED-REGRESSOR (Murphy–Topel) SEAM. When the conditional
2771    // location-scale gate fired, the slope fit above treated the calibrated
2772    // score `ζ = (z − m̂(C))/√v̂(C)` as KNOWN, so `solved_fit.beta_covariance()`
2773    // is the naive second-stage covariance `V_β^naive = H_β⁻¹` that ignores the
2774    // first-stage estimation error in `θ₁ = (mean_coeffs, var_coeffs)`. The
2775    // honest two-stage covariance is
2776    //   `V_β = V_β^naive + (H_β⁻¹ G) V₁ (H_β⁻¹ G)ᵀ`,  `G = ∂(score_β)/∂θ₁`.
2777    // The closed-form first-stage covariance `V₁` and the per-row chain-rule
2778    // sensitivity `∂ζ_i/∂θ₁` are computed and stored on the calibration at fit
2779    // time (see `LatentZConditionalCalibration::{theta1_covariance,
2780    // zeta_theta1_jacobian_row, generated_regressor_term}`), so the correction
2781    // is consumable wherever the slope information `G` (the per-row
2782    // `∂score_β/∂ζ_i` of the marginal/logslope blocks) is available.
2783    //
2784    // ASSEMBLY READY, ONE ENGINE QUANTITY OUTSTANDING. The full correction is
2785    // assembled by `LatentZConditionalCalibration::generated_regressor_correction`
2786    // (mod.rs): given the per-row reduced-frame slope-score sensitivity to the
2787    // calibrated score `s_i = ∂score_β,i/∂ζ_i` (an `n × p_β` matrix), it
2788    //   1. builds `J_zeta` row-by-row via `zeta_theta1_jacobian_row` (exact-zero
2789    //      on floored rows, so `G`'s support is the gate-fired rows),
2790    //   2. accumulates `G = Σ_i s_i ⊗ (∂ζ_i/∂θ₁)` (`p_β × dim θ₁`),
2791    //   3. forms `Vb·G = solved_fit.beta_covariance()·G` (the naive reduced-frame
2792    //      covariance IS `H_β⁻¹`, so `H_β⁻¹ G = Vb·G`), and
2793    //   4. returns `(Vb·G)·V₁·(Vb·G)ᵀ` (PSD ⇒ corrected slope SE strictly ≥
2794    //      naive whenever the gate fires).
2795    // So `V₁`, `∂ζ/∂θ₁`, the `Vb` frame, and the whole congruence are all
2796    // available HERE — the only quantity the seam still lacks is `s_i`.
2797    //
2798    // `s_i = ∂²ℓ_i/∂β∂ζ_i = J_iᵀ·(∂²ℓ_i/∂η_i∂ζ_i)` is the mixed `(β, ζ)` second
2799    // derivative of the row kernel contracted through the slope Jacobian `J_i`.
2800    // The conditional location-scale gate ALWAYS selects the rigid standard-normal
2801    // measure (`build_latent_measure_with_geometry` returns
2802    // `LatentMeasureKind::StandardNormal` for `ConditionalLocationScale`), so the
2803    // per-row kernel is the closed-form `rigid_standard_normal` tower
2804    // `η = q·c(g) + g·(s·ζ)`. The mixed 2-vector `∂²ℓ_i/∂(q,g)∂ζ_i` is read off the
2805    // SAME `Tower4` the value/grad/Hessian path uses (#932 row-jet machinery) by
2806    // seeding `ζ` as a third jet axis
2807    // (`rigid_standard_normal_mixed_z_sensitivity`); contracting it through the
2808    // marginal+logslope design rows (the `J_iᵀ` the row kernel exposes via
2809    // `jacobian_transpose_action`) yields `s_i` in the SAME reduced frame as
2810    // `covariance_conditional` (`rigid_standard_normal_score_zeta_sensitivity`).
2811    let (latent_z_rank_int_calibration, latent_z_conditional_calibration) =
2812        match latent_z_calibration {
2813            LatentMeasureCalibration::None => (None, None),
2814            LatentMeasureCalibration::RankInverseNormal(cal) => (Some(cal), None),
2815            LatentMeasureCalibration::ConditionalLocationScale(cal) => (None, Some(cal)),
2816        };
2817    // #905/#1028: apply the Murphy–Topel generated-regressor correction now that
2818    // `s_i` is available. `covariance_conditional` (Vb) and `covariance_corrected`
2819    // (Vp) are in the reduced logslope frame (`p_m + r`), exactly the frame
2820    // `s_i`'s reduced-logslope contraction lives in, so add the PSD term
2821    // `(Vb·G)·V₁·(Vb·G)ᵀ` to each. Applied only for the canonical (non-flex)
2822    // standard-normal kernel: the rigid tower carries no score_warp/link_dev
2823    // z-dependence, so when aux deviation blocks widen β beyond `p_m + r` the
2824    // correction's deviation columns are not yet derived and the term is skipped
2825    // (the conditional gate's intended kernel has no such blocks).
2826    if let Some(cal) = latent_z_conditional_calibration.as_ref()
2827        && let Some(vb) = solved_fit.covariance_conditional.clone()
2828    {
2829        let p_beta = vb.nrows();
2830        let marginal_dense = marginal_design
2831            .design
2832            .try_to_dense_arc("bms generated-regressor marginal design")?;
2833        let logslope_reduced = reduce_logslope_design(&logslope_design)?;
2834        let logslope_reduced_dense = logslope_reduced
2835            .design
2836            .try_to_dense_arc("bms generated-regressor reduced logslope design")?;
2837        let p_m = marginal_dense.ncols();
2838        let r = logslope_reduced_dense.ncols();
2839        if p_beta != vb.ncols() {
2840            return Err(format!(
2841                "bms generated-regressor: covariance_conditional must be square, got {}×{}",
2842                vb.nrows(),
2843                vb.ncols()
2844            ));
2845        }
2846        // Skip when aux deviation (score_warp / link_dev) blocks are present:
2847        // β is wider than the marginal+reduced-logslope frame the rigid kernel's
2848        // z-channel covers. Equality ⇒ the canonical non-flex gate kernel.
2849        if p_beta == p_m + r {
2850            let marginal_eta = &solved_fit.block_states[0].eta;
2851            let slope_eta = &solved_fit.block_states[1].eta;
2852            let probit_scale = probit_frailty_scale(final_sigma_cell.get());
2853            let s = rigid_standard_normal_score_zeta_sensitivity(
2854                &spec.base_link,
2855                marginal_eta,
2856                slope_eta,
2857                z.as_ref(),
2858                y.as_ref(),
2859                weights.as_ref(),
2860                probit_scale,
2861                marginal_dense.view(),
2862                logslope_reduced_dense.view(),
2863                p_beta,
2864            )?;
2865            // `generated_regressor_correction` re-derives `∂ζ_i/∂θ₁` via
2866            // `zeta_theta1_jacobian_row(z_i, a_row)`, which expects the RAW
2867            // normalized latent score `z_i` (it recomputes `ζ_i = (z_i − m)/√v`
2868            // internally), and conditions on the marginal-index span
2869            // `a(C_i)` = the RAW marginal design rows (the basis the gate was fit
2870            // on). Feed `spec.z` (the standardized raw score, NOT the calibrated
2871            // ζ the kernel consumed) and the raw marginal dense design.
2872            let correction = cal.generated_regressor_correction(
2873                s.view(),
2874                spec.z.view(),
2875                marginal_dense.view(),
2876                vb.view(),
2877            )?;
2878            if let Some(cov) = solved_fit.covariance_conditional.as_mut() {
2879                *cov = &*cov + &correction;
2880            }
2881            if let Some(cov) = solved_fit.covariance_corrected.as_mut() {
2882                *cov = &*cov + &correction;
2883            }
2884            log::info!(
2885                "[BMS latent-z] Murphy–Topel generated-regressor SE correction applied: \
2886                 p_beta={p_beta} theta1_dim={} max_diag_inflation={:.3e}",
2887                cal.theta1_dim(),
2888                (0..p_beta)
2889                    .map(|i| correction[[i, i]])
2890                    .fold(0.0_f64, f64::max),
2891            );
2892        } else {
2893            log::info!(
2894                "[BMS latent-z] Murphy–Topel generated-regressor SE correction skipped: \
2895                 aux deviation blocks present (p_beta={p_beta} > marginal({p_m})+logslope({r})); \
2896                 rigid-kernel z-channel does not yet cover score_warp/link_dev deviations"
2897            );
2898        }
2899    }
2900    // #461: PREDICT SEAM — when the Stage-1 influence absorber is active
2901    // (spec.score_influence_jacobian.is_some()), `fit.block_states[0].beta` is
2902    // the WIDENED marginal coefficient `[β_m; γ]` (length p_m + p₁), but
2903    // `marginal_design` below is the RAW term-collection design (p_m columns):
2904    // the absorbed influence columns Z̃_infl are a TRAINING-only leakage
2905    // absorber and do NOT exist at predict rows (no Stage-1 fold there). The
2906    // orthogonalized β̂_m is a property of the training fit, so prediction must
2907    // use ONLY the first p_m entries of block_states[0].beta against this raw
2908    // marginal_design and DROP the trailing γ. The model-payload / predict
2909    // builder (src/main.rs run_fit_bernoulli_marginal_slope → inference) owns
2910    // that truncation; it must record p_m (= marginal_design.design.ncols())
2911    // and slice the persisted marginal β to it. Survival mirrors this seam.
2912    Ok(BernoulliMarginalSlopeFitResult {
2913        fit: solved_fit,
2914        marginalspec_resolved: resolved_specs.remove(0),
2915        logslopespec_resolved: resolved_specs.remove(0),
2916        marginal_design: designs.remove(0),
2917        logslope_design: designs.remove(0),
2918        baseline_marginal: baseline.0,
2919        baseline_logslope: baseline.1,
2920        z_normalization,
2921        latent_measure,
2922        score_warp_runtime,
2923        link_dev_runtime,
2924        gaussian_frailty_sd: final_sigma_cell.get(),
2925        cross_block_warnings,
2926        latent_z_rank_int_calibration,
2927        latent_z_conditional_calibration,
2928    })
2929}