Skip to main content

gam_models/bms/
block_specs.rs

1use super::family::*;
2use super::gradient_paths::*;
3use super::hessian_paths::{new_cell_moment_cache_stats, new_cell_moment_lru_cache};
4use super::install_flex::validate_spec;
5use super::*;
6use gam_linalg::faer_ndarray::{FaerEigh, fast_ab, fast_atb, fast_xt_diag_x};
7use crate::marginal_slope_orthogonal::influence_absorber_log_lambda;
8use faer::Side;
9
10/// Sup-norm of the FITTED marginal linear predictor `η = X·β` at which the
11/// probit model becomes numerically degenerate. This is the decisive
12/// separation quantity — raw coefficient magnitude is NOT, because the
13/// thin-plate / Duchon marginal bases are non-orthonormal and ill-conditioned,
14/// so a smooth, bounded fitted surface can carry large-and-cancelling
15/// coefficients (e.g. `β=[60,-60]` on two collinear columns yields a bounded
16/// `Xβ`). The probit information `φ(η)²/[Φ(η)(1−Φ(η))]` only collapses once
17/// `|η|` is enormous: `Φ(−35) ≈ 1e−268` is still representable, but by `|η|≈38`
18/// the tail probability underflows to exactly 0 in `f64` and the per-row
19/// Fisher weight vanishes. We trip the guard at 35 — comfortably inside the
20/// representable range yet far beyond any legitimately fitted predictor (a
21/// converged penalized probit surface keeps `|η|` at single/low-double digits),
22/// so a true separating direction (whose `|η|→∞`) still trips it while a
23/// well-fitted ill-conditioned surface does not.
24pub(crate) const BMS_PROBIT_SEPARATION_ETA_INF: f64 = 35.0;
25
26// ── Canonical-gauge priority ladder (issue #322) ─────────────────────────────
27//
28// The priority-ordered RRQR in `canonicalize_for_identifiability` presents
29// higher-priority blocks first and routes any shared cross-block alias drop
30// into the lowest-priority block that still spans the aliased direction. The
31// values below form a single ordered ladder so the relationships that the
32// architecture depends on (anchors > parametric surfaces > flex deviations,
33// marginal > logslope, score_warp > link_dev) are expressed once here rather
34// than re-derived from comments at each `gauge_priority:` site. The ladder
35// mirrors the survival marginal-slope entry (time=200 / marginal=150 /
36// logslope=120 / score_warp=80 / link_dev=60).
37
38/// Audit-only anchor blocks sit at the top of the ladder so the candidate
39/// flex block always yields to them in the cross-block identifiability audit.
40pub(super) const GAUGE_PRIORITY_ANCHOR: u8 = 200;
41/// Marginal surface: strictly above the logslope surface so a shared affine
42/// direction is demoted out of logslope, never out of marginal.
43pub(super) const GAUGE_PRIORITY_MARGINAL: u8 = 150;
44/// Logslope surface: one rung below the marginal surface.
45pub(super) const GAUGE_PRIORITY_LOGSLOPE: u8 = 120;
46/// Candidate flex block under audit: below every parametric anchor so the
47/// audit demotes the candidate when it aliases an anchor.
48pub(super) const GAUGE_PRIORITY_CANDIDATE_FLEX: u8 = 100;
49/// `score_warp_dev`: above `link_dev` because in mixed-flex configurations
50/// link_dev is the residualised block and should yield first.
51pub(super) const GAUGE_PRIORITY_SCORE_WARP_DEV: u8 = 80;
52/// Default for any deviation auxiliary block not otherwise named; below the
53/// parametric default so shared affine directions never demote a parametric
54/// block.
55pub(super) const GAUGE_PRIORITY_DEVIATION_DEFAULT: u8 = 70;
56/// `link_dev`: lowest rung, yields first among the flex deviation blocks.
57pub(super) const GAUGE_PRIORITY_LINK_DEV: u8 = 60;
58
59/// Floor on the relative outer tolerance used by the exact-joint spatial
60/// length-scale optimiser. The user's `rel_tol` drives most of the fit, but
61/// the exact spatial outer loop is a coarse 1-D search over a smooth profiled
62/// objective; tightening it below this floor only burns cycles on noise in the
63/// inner-solve-reported objective without moving the selected length scale.
64pub(crate) const EXACT_SPATIAL_OUTER_TOL_FLOOR: f64 = 1e-6;
65
66// ── BlockEffectiveJacobian impls for BMS ─────────────────────────────────────
67//
68// BMS has a single Bernoulli output per row (n_outputs = 1). The observed η is
69//
70//   η_i = q_i · c_i + s·g_i · z_i
71//
72// where
73//   q_i   = marginal_design[i,:] · β_m + offset_m[i]      (marginal η)
74//   g_i   = logslope_design[i,:] · β_s + offset_s[i]      (log-slope η)
75//   s     = probit_frailty_scale(gaussian_frailty_sd)
76//   c_i   = sqrt(1 + (s·g_i)²)
77//
78// Per-block Jacobians ∂η_i / ∂β_block:
79//
80//   Marginal block  → ∂η_i/∂β_m = c_i · M_i
81//     (M_i = marginal_design row i; c_i is β-dependent but does not involve β_m)
82//
83//   Logslope block  → ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G_i
84//     (G_i = logslope_design row i)
85//
86// score_warp_dev and link_dev blocks use IFT-corrected η, but their
87// contribution to the identifiability audit is captured by the raw design
88// columns (the IFT correction adds a direction already in the anchor span at
89// compile time). These blocks leave jacobian_callback = None and rely on
90// effective_design (= raw design) for the flat audit.
91
92/// β-dependent Jacobian for the BMS marginal block.
93///
94/// ∂η_i/∂β_m = c_i · M[i,:]
95/// where c_i = sqrt(1 + (s · g_i)²),
96///       g_i = G[i,:] · β_s + offset_s[i],
97///       s   = state.probit_frailty_scale.
98///
99/// `probit_frailty_scale` is read from the evaluation state at call time (not
100/// captured at construction) so the callback remains correct across outer-loop
101/// σ updates without rebuilding the block spec.
102///
103/// Designs are pre-densified at construction to avoid repeated materialisation.
104pub struct BmsMarginalJacobian {
105    /// Dense marginal design: n × p_m.
106    pub marginal_dense: Arc<Array2<f64>>,
107    /// Dense logslope design: n × p_s.
108    pub logslope_dense: Arc<Array2<f64>>,
109    pub offset_m: Array1<f64>,
110    pub offset_s: Array1<f64>,
111    /// Number of marginal columns (= size of β_m slice in the full β vector).
112    pub p_marginal: usize,
113}
114
115impl BmsMarginalJacobian {
116    pub fn new(
117        marginal_dense: Arc<Array2<f64>>,
118        logslope_dense: Arc<Array2<f64>>,
119        offset_m: Array1<f64>,
120        offset_s: Array1<f64>,
121        p_marginal: usize,
122    ) -> Self {
123        Self {
124            marginal_dense,
125            logslope_dense,
126            offset_m,
127            offset_s,
128            p_marginal,
129        }
130    }
131}
132
133impl BlockEffectiveJacobian for BmsMarginalJacobian {
134    fn effective_jacobian_rows(
135        &self,
136        state: &FamilyLinearizationState<'_>,
137        rows: std::ops::Range<usize>,
138    ) -> Result<Array2<f64>, String> {
139        let beta = state.beta;
140        let s = state.probit_frailty_scale;
141        let p_m = self.p_marginal;
142        let p_s_block = self.logslope_dense.ncols();
143        let beta_s_raw = if beta.len() > p_m {
144            &beta[p_m..]
145        } else {
146            &[][..]
147        };
148        let p_s_use = p_s_block.min(beta_s_raw.len());
149        let beta_s = &beta_s_raw[..p_s_use];
150        let n = self.marginal_dense.nrows();
151        let rows = rows.start.min(n)..rows.end.min(n);
152        let p_block = self.marginal_dense.ncols();
153
154        // ∂η_i/∂β_m = c_i · M[i,:], with c_i = sqrt(1 + (s·g_i)²) and
155        //   g_i = G[i, :p_s_use] · β_s + offset_s[i].
156        //
157        // This block owns the logslope design and offset, so g_i — and hence
158        // c_i — is fully self-computable at the current β for every row,
159        // including β = 0 (where g_i = offset_s[i], which carries the fitted
160        // logslope baseline and is generically nonzero).  There is no external
161        // scalar this block cannot reconstruct, so the Jacobian is evaluated
162        // directly from owned data with no caller-supplied contract.
163        let mut out = Array2::<f64>::zeros((rows.end - rows.start, p_block));
164        for i in rows.clone() {
165            let g_i = self.offset_s[i]
166                + self
167                    .logslope_dense
168                    .row(i)
169                    .slice(ndarray::s![..p_s_use])
170                    .dot(&ArrayView1::from(beta_s));
171            let sg = s * g_i;
172            let c_i = (1.0 + sg * sg).sqrt();
173            // J[i,:] = c_i · M[i,:]
174            let m_row = self.marginal_dense.row(i);
175            out.row_mut(i - rows.start).assign(&m_row.mapv(|x| c_i * x));
176        }
177        Ok(out)
178    }
179
180    fn n_outputs(&self) -> usize {
181        1
182    }
183
184    fn locks_raw_width_reduction(&self) -> bool {
185        // The BMS family reads its raw-width internal `marginal_design` and
186        // `validate_exact_block_state_shapes` asserts `beta.len() ==
187        // marginal_design.ncols()`. The block's spec carries the real (non
188        // zero-placeholder) marginal design, so the `#933` gauge-composed
189        // reduction cannot silently absorb a dropped column into the callback —
190        // a reduced β would desynchronise the raw-width layout the family reads.
191        // Keep it at raw width and let the robust Jeffreys curvature regularise
192        // any weak cross-block direction (mirrors the survival marginal-slope
193        // time-wiggle block).
194        true
195    }
196}
197
198/// β-dependent Jacobian for the BMS logslope block.
199///
200/// ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G[i,:]
201/// where q_i = M[i,:] · β_m + offset_m[i],
202///       g_i = G[i,:] · β_s + offset_s[i],
203///       c_i = sqrt(1 + (s·g_i)²),
204///       s   = state.probit_frailty_scale.
205///
206/// `probit_frailty_scale` is read from the evaluation state at call time.
207///
208/// Designs are pre-densified at construction to avoid repeated materialisation.
209pub struct BmsLogslopeJacobian {
210    /// Dense marginal design: n × p_m.
211    pub marginal_dense: Arc<Array2<f64>>,
212    /// Dense logslope design: n × p_s.
213    pub logslope_dense: Arc<Array2<f64>>,
214    pub offset_m: Array1<f64>,
215    pub offset_s: Array1<f64>,
216    pub z: Arc<Array1<f64>>,
217    /// Number of marginal columns (= start of β_s in the full β vector).
218    pub p_marginal: usize,
219}
220
221impl BmsLogslopeJacobian {
222    pub fn new(
223        marginal_dense: Arc<Array2<f64>>,
224        logslope_dense: Arc<Array2<f64>>,
225        offset_m: Array1<f64>,
226        offset_s: Array1<f64>,
227        z: Arc<Array1<f64>>,
228        p_marginal: usize,
229    ) -> Self {
230        Self {
231            marginal_dense,
232            logslope_dense,
233            offset_m,
234            offset_s,
235            z,
236            p_marginal,
237        }
238    }
239}
240
241impl BlockEffectiveJacobian for BmsLogslopeJacobian {
242    fn effective_jacobian_rows(
243        &self,
244        state: &FamilyLinearizationState<'_>,
245        rows: std::ops::Range<usize>,
246    ) -> Result<Array2<f64>, String> {
247        let beta = state.beta;
248        let s = state.probit_frailty_scale;
249        let p_m = self.p_marginal;
250        let p_m_use = p_m.min(beta.len());
251        let beta_m = &beta[..p_m_use];
252        let beta_s_raw = if beta.len() > p_m {
253            &beta[p_m..]
254        } else {
255            &[][..]
256        };
257        let p_s_block = self.logslope_dense.ncols();
258        let p_s_use = p_s_block.min(beta_s_raw.len());
259        let beta_s = &beta_s_raw[..p_s_use];
260        let n = self.logslope_dense.nrows();
261        let rows = rows.start.min(n)..rows.end.min(n);
262
263        // ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G[i,:] where
264        //   q_i = M[i,:] · β_m + offset_m[i],
265        //   g_i = G[i,:] · β_s + offset_s[i],
266        //   c_i = sqrt(1 + (s·g_i)²),
267        //   z_i = self.z[i].
268        //
269        // This block owns the marginal design, logslope design, both offsets,
270        // and z, so q_i, g_i, c_i, z_i are all closed-form functions of the
271        // current β and owned data — for every row at every β, including β = 0
272        // (where g_i = offset_s[i] carries the nonzero fitted logslope
273        // baseline).  The Jacobian is therefore evaluated directly from owned
274        // data with no caller-supplied scalar contract.
275        let mut out = Array2::<f64>::zeros((rows.end - rows.start, p_s_block));
276        for i in rows.clone() {
277            let q_i = self.offset_m[i]
278                + self
279                    .marginal_dense
280                    .row(i)
281                    .slice(ndarray::s![..p_m_use])
282                    .dot(&ArrayView1::from(beta_m));
283            let g_i = self.offset_s[i]
284                + self
285                    .logslope_dense
286                    .row(i)
287                    .slice(ndarray::s![..p_s_use])
288                    .dot(&ArrayView1::from(beta_s));
289            let sg = s * g_i;
290            let c_i = (1.0 + sg * sg).sqrt();
291            let z_i = self.z[i];
292            // per-row scalar factor: q_i · s²·g_i / c_i + s·z_i
293            let factor = q_i * s * s * g_i / c_i + s * z_i;
294            // J[i,:] = factor · G[i,:]
295            let g_row = self.logslope_dense.row(i);
296            out.row_mut(i - rows.start)
297                .assign(&g_row.mapv(|x| factor * x));
298        }
299        Ok(out)
300    }
301
302    fn n_outputs(&self) -> usize {
303        1
304    }
305
306    fn locks_raw_width_reduction(&self) -> bool {
307        // The BMS family reads its raw-width internal `logslope_design` and
308        // `validate_exact_block_state_shapes` asserts `beta.len() ==
309        // logslope_design.ncols()`. An intercept-only logslope (`logslope_formula
310        // = "1"`) is a constant column perfectly aliased with the marginal
311        // intercept; the effective reduced-reparam leaves a width-1 block at raw
312        // width (a zero-width reduction would delete the score-effect surface), so
313        // the canonicaliser would otherwise column-reduce this block to width 0
314        // and hand the family a length-0 β against a width-1 design. Lock it to
315        // raw width — the robust Jeffreys curvature regularises the weak aliased
316        // direction (mirrors the survival marginal-slope time-wiggle block).
317        true
318    }
319}
320
321/// Horizontally stack the absorbed influence columns `Z̃_infl` onto the raw
322/// marginal design `M`, yielding the widened additive marginal-index design
323/// `[M | Z̃_infl]` (#461). When `influence_columns` is `None` the original
324/// dense design is returned unchanged. The influence columns shift the marginal
325/// index `α(x)` additively, so the de-nested probit kernel — which reads the
326/// marginal index from `block_states[0].eta` and reconstructs `∂q/∂β_m` from
327/// `self.marginal_design` (a matched (design, β) pair) — picks them up with no
328/// kernel-site change; the widened `p_marginal` keeps every per-row Jacobian /
329/// gradient / Hessian projection consistent.
330pub(crate) fn widen_marginal_dense_with_influence(
331    marginal_dense: &Arc<Array2<f64>>,
332    influence_columns: Option<&Array2<f64>>,
333) -> Result<Arc<Array2<f64>>, String> {
334    let Some(z_infl) = influence_columns else {
335        return Ok(Arc::clone(marginal_dense));
336    };
337    let n = marginal_dense.nrows();
338    if z_infl.nrows() != n {
339        return Err(format!(
340            "influence block: residualised columns have {} rows, marginal design has {n}",
341            z_infl.nrows()
342        ));
343    }
344    let p_m = marginal_dense.ncols();
345    let p1 = z_infl.ncols();
346    let mut widened = Array2::<f64>::zeros((n, p_m + p1));
347    widened
348        .slice_mut(s![.., ..p_m])
349        .assign(marginal_dense.as_ref());
350    widened.slice_mut(s![.., p_m..]).assign(z_infl);
351    Ok(Arc::new(widened))
352}
353
354/// Tolerance (relative to the dominant retained eigenvalue) below which a
355/// reduced-basis direction of the W-orthogonalised effective logslope Gram is
356/// treated as a confounded null direction and dropped. Directions whose
357/// effective weighted image is (near-)explained by the marginal span collapse to
358/// ~0 eigenvalue in `Gtt` (see [`build_reduced_logslope_reparam`]); this keeps
359/// the cut well above floating-point noise but well below any genuine surviving
360/// logslope curvature.
361pub(crate) const LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL: f64 = 1.0e-6;
362
363/// An exact reduced-basis reparameterization of the BMS logslope design through
364/// the family's OWN internal `logslope_design` geometry, expressed as a single
365/// linear map `T` (`p_logslope × r`, `r ≤ p_logslope`).
366///
367/// # Why a reduced basis (not a dense design swap)
368///
369/// The structural confound is that the score-weighted logslope channel
370/// `diag(factor)·G·β_s` overlaps the effective marginal channel `diag(c)·M·β_m`
371/// in the PIRLS row metric `W`, leaving the joint penalised Hessian rank-soft
372/// along the shared direction. The shared-solver primitive
373/// [`OrthogonalReparam`](gam_solve::orthogonal_reparam::OrthogonalReparam)
374/// forms `C̃ = C − M·B`, exactly W-orthogonal to `span(M)` — but `C̃` is a dense
375/// design the BMS family's row kernel does NOT consume: the family reads
376/// `η_logslope = G·β_s` from its own `logslope_design` and reconstructs the
377/// per-row Jacobian `factor_i · G_i` from that same matrix. A block-level design
378/// swap is therefore ignored by the family, and feeding a rank-deficient `C̃` at
379/// full width desynchronises the inner identifiable-subspace reduction from the
380/// stored design width.
381///
382/// This builds instead a TRUE reparameterization the family consumes: a
383/// full-rank reduced logslope design `G_reduced = G·T` (width `r`) plus the
384/// penalty projection `S_reduced = Tᵀ S T`. The map is constructed so that the
385/// directions of raw logslope coefficient space whose effective weighted image
386/// is W-explained by the marginal span are removed (they carry ~zero curvature
387/// in the W-orthogonalised effective Gram), and the surviving `r` directions are
388/// full-rank.
389///
390/// # The math
391///
392/// At the rigid pilot, the effective Jacobians are
393///
394/// ```text
395///     M_eff = diag(c) · M        (n × p_m),   c_i = sqrt(1 + (s·g_i)²)
396///     G_eff = diag(f) · G        (n × p_g),   f_i = q_i·s²·g_i/c_i + s·z_i
397/// ```
398///
399/// In the row metric `W` the component of the effective logslope design that is
400/// W-orthogonal to `span(M_eff)` has the raw-coordinate Gram
401///
402/// ```text
403///     Gtt = G_effᵀ W G_eff − (G_effᵀ W M_eff)(M_effᵀ W M_eff + εI)⁻¹(M_effᵀ W G_eff)
404/// ```
405///
406/// (a `p_g × p_g` PSD matrix in the raw logslope coefficient coordinates). Its
407/// range = the logslope directions that survive the confound removal; its null
408/// space = the confounded directions absorbed by the marginal span. The reduced
409/// transform `T` is the orthonormal eigenbasis of `Gtt` for eigenvalues above a
410/// relative tolerance; `r = rank(Gtt)`. The new design `G_reduced = G·T`, the
411/// reparameterized penalty `S_reduced = Tᵀ S T`, and the round-trip
412/// `β_logslope = T·β'` make the family's geometry consistent at width `r` and
413/// recover the original-basis logslope coefficients for prediction/reporting.
414#[derive(Debug, Clone)]
415pub(super) struct ReducedLogslopeReparam {
416    /// Reduced transform `T` (`p_logslope × r`). `G_reduced = G·T`,
417    /// `β_logslope = T·β'`, `S_reduced = Tᵀ S T`.
418    transform: Array2<f64>,
419}
420
421impl ReducedLogslopeReparam {
422    /// Original (full) logslope width `p_logslope`.
423    #[inline]
424    pub(super) fn original_cols(&self) -> usize {
425        self.transform.nrows()
426    }
427
428    /// Reduced width `r`.
429    #[inline]
430    pub(super) fn reduced_cols(&self) -> usize {
431        self.transform.ncols()
432    }
433
434    /// Map a reduced-basis logslope coefficient `β'` (length `r`) back to the
435    /// original logslope basis `β_logslope = T·β'` (length `p_logslope`), so
436    /// prediction/reporting are unchanged-in-meaning.
437    pub(super) fn recover_original_logslope_beta(
438        &self,
439        beta_reduced: &Array1<f64>,
440    ) -> Result<Array1<f64>, String> {
441        if beta_reduced.len() != self.reduced_cols() {
442            return Err(format!(
443                "reduced logslope reparam: β' length ({}) != reduced width ({})",
444                beta_reduced.len(),
445                self.reduced_cols()
446            ));
447        }
448        Ok(self.transform.dot(beta_reduced))
449    }
450}
451
452/// Build the reduced-basis logslope reparameterization (see
453/// [`ReducedLogslopeReparam`]) from the rigid-pilot EFFECTIVE Jacobian geometry,
454/// in the PIRLS row metric `W`. Extracts the dense pilot designs and delegates
455/// the geometry to [`reduced_logslope_transform_effective`]. Returns `Ok(None)`
456/// when there is no logslope/marginal span, no effective-confounded direction to
457/// remove (`r == p_g`), or the whole effective logslope image is in the effective
458/// marginal span (`r == 0`); in those cases the caller keeps the raw design.
459fn build_reduced_logslope_reparam(
460    marginal_design: &TermCollectionDesign,
461    logslope_design: &TermCollectionDesign,
462    z: &Array1<f64>,
463    row_metric: &Array1<f64>,
464    marginal_offset: &Array1<f64>,
465    logslope_offset: &Array1<f64>,
466    marginal_baseline: f64,
467    logslope_baseline: f64,
468    probit_scale: f64,
469) -> Result<Option<ReducedLogslopeReparam>, String> {
470    let marginal = marginal_design
471        .design
472        .try_to_dense_arc("build_reduced_logslope_reparam::marginal")?;
473    let logslope = logslope_design
474        .design
475        .try_to_dense_arc("build_reduced_logslope_reparam::logslope")?;
476    let n = marginal.nrows();
477    if logslope.nrows() != n
478        || z.len() != n
479        || row_metric.len() != n
480        || marginal_offset.len() != n
481        || logslope_offset.len() != n
482    {
483        return Err(format!(
484            "reduced logslope reparam row mismatch: marginal={}, logslope={}, z={}, row_metric={}, marginal_offset={}, logslope_offset={}",
485            marginal.nrows(),
486            logslope.nrows(),
487            z.len(),
488            row_metric.len(),
489            marginal_offset.len(),
490            logslope_offset.len(),
491        ));
492    }
493    let p_m = marginal.ncols();
494    let p_g = logslope.ncols();
495    if p_m == 0 || p_g == 0 {
496        return Ok(None);
497    }
498    if !marginal_baseline.is_finite()
499        || !logslope_baseline.is_finite()
500        || !probit_scale.is_finite()
501        || probit_scale <= 0.0
502        || z.iter().any(|v| !v.is_finite())
503        || row_metric.iter().any(|v| !v.is_finite() || *v < 0.0)
504        || marginal_offset.iter().any(|v| !v.is_finite())
505        || logslope_offset.iter().any(|v| !v.is_finite())
506    {
507        return Err(
508            "reduced logslope reparam requires finite pilot geometry and finite non-negative row metric"
509                .to_string(),
510        );
511    }
512
513    // The joint Hessian the inner solve factorises is built from the EFFECTIVE
514    // BMS Jacobians, not the raw design. At the rigid pilot,
515    //   ∂η_i/∂β_m = c_i · M_i,   c_i = sqrt(1 + (s·g_i)²)
516    //   ∂η_i/∂β_s = f_i · G_i,   f_i = q_i·s²·g_i/c_i + s·z_i
517    // so a raw logslope direction `v` is rank-soft in the joint Hessian iff its
518    // EFFECTIVE image `diag(f)·G·v` is W-explained by `span(diag(c)·M)` — NOT iff
519    // raw `G·v` is W-explained by `span(M)`. Auditing the raw design removes the
520    // wrong directions; the reduced basis is built from the effective Schur Gram.
521    // The pure-array geometry lives in `reduced_logslope_transform_effective` so
522    // it can be unit-tested directly against the raw-vs-effective counterexample.
523    match reduced_logslope_transform_effective(
524        marginal.view(),
525        logslope.view(),
526        z,
527        row_metric,
528        marginal_offset,
529        logslope_offset,
530        marginal_baseline,
531        logslope_baseline,
532        probit_scale,
533    )? {
534        Some(transform) => Ok(Some(ReducedLogslopeReparam { transform })),
535        None => Ok(None),
536    }
537}
538
539/// Build the reduced logslope basis `T` (p_g × r) from the EFFECTIVE BMS pilot
540/// geometry, in the PIRLS row metric `W`. `T`'s columns span the raw logslope
541/// coefficient directions whose effective image `diag(f)·G·v` is NOT W-explained
542/// by `span(diag(c)·M)` — i.e. the directions the joint Hessian retains real
543/// curvature along. Returns `Ok(None)` when there is nothing to reduce
544/// (`r == p_g`) or when the entire effective logslope image collapses into the
545/// effective marginal span (`r == 0`); in both cases the caller keeps the raw
546/// design (a zero-width reduction would silently delete the score-effect surface,
547/// which is the estimand — the REML penalty regularises any residual softness).
548///
549/// At the rigid pilot the effective Jacobians are
550///     M_eff = diag(c) · M,   c_i = sqrt(1 + (s·g_i)²)
551///     G_eff = diag(f) · G,   f_i = q_i·s²·g_i/c_i + s·z_i
552/// and the raw-coordinate Gram of the logslope component W-orthogonal to
553/// `span(M_eff)` is the Schur complement
554///     Gtt = G_effᵀ W G_eff − (G_effᵀ W M_eff)(M_effᵀ W M_eff + εI)⁻¹(M_effᵀ W G_eff).
555/// `T` is the orthonormal eigenbasis of `Gtt` for eigenvalues above a tolerance
556/// relative to the effective logslope energy scale.
557pub(crate) fn reduced_logslope_transform_effective(
558    marginal: ArrayView2<'_, f64>,
559    logslope: ArrayView2<'_, f64>,
560    z: &Array1<f64>,
561    row_metric: &Array1<f64>,
562    marginal_offset: &Array1<f64>,
563    logslope_offset: &Array1<f64>,
564    marginal_baseline: f64,
565    logslope_baseline: f64,
566    probit_scale: f64,
567) -> Result<Option<Array2<f64>>, String> {
568    let n = marginal.nrows();
569    let p_m = marginal.ncols();
570    let p_g = logslope.ncols();
571    if p_m == 0 || p_g == 0 {
572        return Ok(None);
573    }
574
575    // Effective pilot Jacobians M_eff = diag(c)·M and G_eff = diag(f)·G.
576    let mut m_eff = Array2::<f64>::zeros((n, p_m));
577    let mut g_eff = Array2::<f64>::zeros((n, p_g));
578    for i in 0..n {
579        let q_i = marginal_offset[i] + marginal_baseline;
580        let g_i = logslope_offset[i] + logslope_baseline;
581        let sg = probit_scale * g_i;
582        let c_i = (1.0 + sg * sg).sqrt();
583        let f_i = q_i * probit_scale * probit_scale * g_i / c_i + probit_scale * z[i];
584        for j in 0..p_m {
585            m_eff[[i, j]] = c_i * marginal[[i, j]];
586        }
587        for j in 0..p_g {
588            g_eff[[i, j]] = f_i * logslope[[i, j]];
589        }
590    }
591
592    // C = G_effᵀ W G_eff (raw-coordinate effective logslope Gram); its diagonal
593    // sets the energy scale for the relative kept-direction tolerance.
594    let c_gram = fast_xt_diag_x(&g_eff, row_metric);
595    let energy_scale = (0..p_g).map(|i| c_gram[[i, i]]).fold(0.0_f64, f64::max);
596    if !energy_scale.is_finite() || energy_scale <= 0.0 {
597        return Ok(None);
598    }
599
600    // A = M_effᵀ W M_eff + εI (ridge relative to the marginal effective energy so
601    // the Schur solve is well-posed even when the marginal pilot Gram is
602    // rank-soft; the ridge only under-removes, i.e. is conservative).
603    let mut a_gram = fast_xt_diag_x(&m_eff, row_metric);
604    let a_scale = (0..p_m).map(|i| a_gram[[i, i]]).fold(0.0_f64, f64::max);
605    let a_ridge = (a_scale * LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL).max(f64::EPSILON);
606    for i in 0..p_m {
607        a_gram[[i, i]] += a_ridge;
608    }
609
610    // B = M_effᵀ W G_eff (p_m × p_g);  Gtt = C − Bᵀ A⁻¹ B (p_g × p_g, PSD).
611    let b_cross = gam_linalg::faer_ndarray::fast_xt_diag_y(&m_eff, row_metric, &g_eff);
612    let a_view = gam_linalg::faer_ndarray::FaerArrayView::new(&a_gram);
613    let a_factor =
614        gam_linalg::faer_ndarray::factorize_symmetricwith_fallback(a_view.as_ref(), Side::Lower)
615            .map_err(|e| {
616                format!(
617                    "reduced logslope reparam: effective marginal Gram factorization failed: {e}"
618                )
619            })?;
620    let b_view = gam_linalg::faer_ndarray::FaerArrayView::new(&b_cross);
621    let solved = a_factor.solve(b_view.as_ref()); // A⁻¹ B  (p_m × p_g)
622    let a_inv_b = Array2::from_shape_fn((p_m, p_g), |(i, j)| solved[(i, j)]);
623    let schur = fast_atb(&b_cross, &a_inv_b); // Bᵀ A⁻¹ B  (p_g × p_g)
624    let mut stt = &c_gram - &schur;
625    stt = (&stt + &stt.t()) * 0.5;
626    if stt.iter().any(|v| !v.is_finite()) {
627        return Err(
628            "reduced logslope reparam: effective Schur Gram produced non-finite entries"
629                .to_string(),
630        );
631    }
632
633    let (evals, evecs) = stt
634        .eigh(Side::Lower)
635        .map_err(|e| format!("reduced logslope reparam: eigendecomposition failed: {e:?}"))?;
636    // A `Gtt` eigenvalue far below the effective logslope energy scale means that
637    // direction's effective logslope column is W-explained by the effective
638    // marginal span — exactly the joint-Hessian rank-soft confounded direction.
639    let tol = energy_scale * LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL;
640    let mut kept: Vec<usize> = (0..evals.len()).filter(|&i| evals[i] > tol).collect();
641    kept.sort_by(|&a, &b| {
642        evals[b]
643            .partial_cmp(&evals[a])
644            .unwrap_or(std::cmp::Ordering::Equal)
645    });
646    let r = kept.len();
647    // r == p_g: no effective-confounded direction to remove. r == 0: the whole
648    // effective logslope image is in the effective marginal span. In both cases
649    // install no transform (see fn-level doc) and keep the raw design.
650    if r == p_g || r == 0 {
651        return Ok(None);
652    }
653    let mut transform = Array2::<f64>::zeros((p_g, r));
654    for (out_col, &src) in kept.iter().enumerate() {
655        transform.column_mut(out_col).assign(&evecs.column(src));
656    }
657    if transform.iter().any(|v| !v.is_finite()) {
658        return Err(
659            "reduced logslope reparam: reduced transform produced non-finite entries".to_string(),
660        );
661    }
662    Ok(Some(transform))
663}
664
665/// Apply a [`ReducedLogslopeReparam`] to a logslope `TermCollectionDesign`,
666/// producing a new design at the reduced width `r`: the design becomes
667/// `G_reduced = G·T`, and every blockwise penalty `S` is reparameterized to
668/// `S_reduced = Tᵀ S T` over the full reduced column range `0..r`. The reduced
669/// penalty's null space is recomputed from its numerical rank so the REML
670/// log-determinant accounting stays consistent at the reduced width.
671fn reparameterize_logslope_design_reduced(
672    logslope_design: &TermCollectionDesign,
673    reparam: &ReducedLogslopeReparam,
674) -> Result<TermCollectionDesign, String> {
675    let g = logslope_design
676        .design
677        .try_to_dense_arc("reparameterize_logslope_design_reduced::logslope")?;
678    let p_g = g.ncols();
679    if p_g != reparam.original_cols() {
680        return Err(format!(
681            "reduced logslope reparam width mismatch: design has {p_g} cols, transform expects {}",
682            reparam.original_cols()
683        ));
684    }
685    let t = &reparam.transform;
686    let r = reparam.reduced_cols();
687    // G_reduced = G·T   (n × r).
688    let g_reduced = fast_ab(&g, t);
689
690    // Reparameterize each penalty: embed its local block at full width p_g, then
691    // form S_reduced = Tᵀ S T (r × r) over the whole reduced column range.
692    let mut new_penalties: Vec<gam_terms::smooth::BlockwisePenalty> =
693        Vec::with_capacity(logslope_design.penalties.len());
694    let mut new_nullspace_dims: Vec<usize> = Vec::with_capacity(logslope_design.penalties.len());
695    for bp in &logslope_design.penalties {
696        let mut full = Array2::<f64>::zeros((p_g, p_g));
697        full.slice_mut(s![bp.col_range.clone(), bp.col_range.clone()])
698            .assign(&bp.local);
699        // S_reduced = Tᵀ (S) T.
700        let st = fast_ab(&full, t); // p_g × r
701        let mut s_reduced = fast_atb(t, &st); // r × r
702        s_reduced = (&s_reduced + &s_reduced.t()) * 0.5;
703        // Null-space dimension of the reduced penalty = r − rank(S_reduced).
704        let (evals, _) = s_reduced
705            .eigh(Side::Lower)
706            .map_err(|e| format!("reduced logslope penalty eigendecomposition failed: {e:?}"))?;
707        let max_eval = evals.iter().fold(0.0_f64, |acc, &v| acc.max(v.abs()));
708        let pen_tol = (max_eval * 1.0e-12).max(f64::EPSILON);
709        let rank = evals.iter().filter(|&&v| v.abs() > pen_tol).count();
710        let nullspace_dim = r.saturating_sub(rank);
711        new_penalties.push(gam_terms::smooth::BlockwisePenalty::new(0..r, s_reduced));
712        new_nullspace_dims.push(nullspace_dim);
713    }
714
715    let new_design = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(g_reduced));
716    // The reduced logslope block is a single dense smooth-like surface over the
717    // reparameterized coordinates; it carries no parametric/random-effect/
718    // intercept structure of its own (those live in the marginal block), so the
719    // structural ranges collapse to empty and the smooth metadata is cleared.
720    // The penalties + nullspace_dims above are what the joint REML consumes.
721    Ok(TermCollectionDesign {
722        design: new_design,
723        penalties: new_penalties,
724        nullspace_dims: new_nullspace_dims,
725        penaltyinfo: Vec::new(),
726        dropped_penaltyinfo: Vec::new(),
727        coefficient_lower_bounds: None,
728        linear_constraints: None,
729        intercept_range: 0..0,
730        linear_ranges: Vec::new(),
731        random_effect_ranges: Vec::new(),
732        random_effect_levels: Vec::new(),
733        smooth: gam_terms::smooth::SmoothDesign {
734            term_designs: Vec::new(),
735            penalties: Vec::new(),
736            nullspace_dims: Vec::new(),
737            penaltyinfo: Vec::new(),
738            dropped_penaltyinfo: Vec::new(),
739            terms: Vec::new(),
740            coefficient_lower_bounds: None,
741            linear_constraints: None,
742        },
743    })
744}
745
746/// Re-embed the term-collection marginal penalties at the (possibly widened)
747/// block dimension `p_m [+ p₁]`, then append the #461 fixed-ridge absorber:
748///
749///  (#461, only with influence columns) the fixed-ridge absorber identity on
750///  the influence columns `p_m..p_m+p₁`.
751///
752/// The two former gam#754 pinned ridges — the marginal nullspace-shrinkage ridge
753/// and the marginal↔logslope overlap ridge — are DELETED: robustness is now
754/// unconditional, so the full-identifiable-span Jeffreys term (`Z_J = I`, see
755/// `jeffreys_subspace_from_penalty`) supplies automatic O(n)-scaled curvature on
756/// every under-identified direction (subsuming the nullspace ridge), and the
757/// exact orthogonal reparameterization of the logslope design (now unconditional,
758/// see `build_reduced_logslope_reparam`) resolves the marginal↔logslope confound
759/// by construction (subsuming the overlap ridge).
760///
761/// The genuine marginal smooth penalties keep their `col_range` (marginal
762/// columns stay in `0..p_m`). Returns `(penalties, nullspace_dims,
763/// initial_log_lambdas)` to install on the marginal block. Fixed ridges remain
764/// in this physical penalty layout and are removed from every REML/outer
765/// coordinate vector by [`PenaltyMatrix::Fixed`].
766pub(crate) fn marginal_penalties_with_influence_ridge(
767    design: &TermCollectionDesign,
768    rho_marginal: &Array1<f64>,
769    influence_columns: Option<&Array2<f64>>,
770    influence_ridge_log_lambda: f64,
771) -> Result<(Vec<PenaltyMatrix>, Vec<usize>, Array1<f64>), String> {
772    let p_m = design.design.ncols();
773    let p1 = influence_columns.map(|z| z.ncols()).unwrap_or(0);
774    let total_dim = p_m + p1;
775    // Re-embed each marginal penalty at the (widened) total dimension (col_range
776    // unchanged: marginal columns remain 0..p_m).
777    let mut penalties: Vec<PenaltyMatrix> = design
778        .penalties
779        .iter()
780        .map(|bp| bp.to_penalty_matrix(total_dim))
781        .collect();
782    let mut nullspace_dims = design.nullspace_dims.clone();
783    let mut log_lambdas = rho_marginal.to_vec();
784
785    // (#461) fixed-ridge absorber: identity on the influence columns only.
786    // Full rank (nullspace 0); its log λ is pinned out of REML by a degenerate
787    // ρ box.
788    if p1 > 0 {
789        penalties.push(
790            PenaltyMatrix::Blockwise {
791                local: Array2::<f64>::eye(p1),
792                col_range: p_m..total_dim,
793                total_dim,
794            }
795            .with_fixed_log_lambda(influence_ridge_log_lambda),
796        );
797        nullspace_dims.push(0);
798        log_lambdas.push(influence_ridge_log_lambda);
799    }
800
801    Ok((penalties, nullspace_dims, Array1::from_vec(log_lambdas)))
802}
803
804/// Widen an optional β warm-start hint to the influence-widened marginal
805/// dimension, zero-filling the absorber coefficients `γ` (#461).
806pub(crate) fn widen_marginal_beta_hint(
807    beta_hint: Option<Array1<f64>>,
808    p_marginal_widened: usize,
809) -> Option<Array1<f64>> {
810    beta_hint.map(|hint| {
811        if hint.len() == p_marginal_widened {
812            hint
813        } else {
814            let mut widened = Array1::<f64>::zeros(p_marginal_widened);
815            let copy = hint.len().min(p_marginal_widened);
816            widened
817                .slice_mut(s![..copy])
818                .assign(&hint.slice(s![..copy]));
819            widened
820        }
821    })
822}
823
824/// Sup-norm of the fitted marginal linear predictor `η = X·β` restricted to a
825/// subset of the marginal design's columns. The mask selects which columns of
826/// `design.design` (length `design.design.ncols()`) contribute; coefficients
827/// beyond `ncols` (the fixed-ridge influence absorber) never enter the marginal
828/// predictor and are excluded by construction. Returns `0.0` for an empty
829/// design. This is the decisive separation quantity: the probit Fisher weight
830/// collapses with `|η|`, not with `‖β‖` (an ill-conditioned non-orthonormal
831/// Duchon/thin-plate basis carries large cancelling coefficients on a smooth,
832/// bounded surface).
833fn marginal_fitted_eta_sup_norm(design: &TermCollectionDesign, masked_beta: &Array1<f64>) -> f64 {
834    let x = &design.design;
835    let n = x.nrows();
836    if n == 0 || x.ncols() == 0 {
837        return 0.0;
838    }
839    let mut sup = 0.0_f64;
840    for row in 0..n {
841        let eta = x.dot_row_view(row, masked_beta.view());
842        if eta.is_finite() {
843            sup = sup.max(eta.abs());
844        }
845    }
846    sup
847}
848
849/// Build a copy of the marginal block β truncated to `design.design.ncols()`
850/// (drops any fixed-ridge absorber tail) so it can drive `X·β`.
851fn marginal_design_beta(
852    design: &TermCollectionDesign,
853    block_beta: ArrayView1<'_, f64>,
854) -> Array1<f64> {
855    let ncols = design.design.ncols();
856    let mut masked = Array1::<f64>::zeros(ncols);
857    let copy = ncols.min(block_beta.len());
858    masked
859        .slice_mut(s![..copy])
860        .assign(&block_beta.slice(s![..copy]));
861    masked
862}
863
864/// Zero every entry of `beta` outside the parametric (penalty-nullspace)
865/// marginal columns — the intercept and the single-penalty linear terms. These
866/// are the directions an unpenalized fit can genuinely separate along (no
867/// smoothness penalty bounds them), so their fitted contribution is tested on
868/// the same η scale.
869fn mask_parametric_columns(
870    design: &TermCollectionDesign,
871    spec: &TermCollectionSpec,
872    full: &Array1<f64>,
873) -> Array1<f64> {
874    let ncols = design.design.ncols();
875    let mut masked = Array1::<f64>::zeros(ncols);
876    if design.intercept_range.len() == 1 {
877        let idx = design.intercept_range.start;
878        if idx < ncols {
879            masked[idx] = full[idx];
880        }
881    }
882    for (linear, (_, range)) in spec.linear_terms.iter().zip(design.linear_ranges.iter()) {
883        if linear.double_penalty {
884            continue;
885        }
886        for col in range.clone() {
887            if col < ncols {
888                masked[col] = full[col];
889            }
890        }
891    }
892    masked
893}
894
895/// Decide whether the converged marginal fit has genuinely separated, using the
896/// FITTED predictor sup-norm `|η|∞` (not raw `|β|∞`). Two arms share the
897/// numerical-degeneracy threshold [`BMS_PROBIT_SEPARATION_ETA_INF`]:
898///   - parametric arm: the penalty-nullspace columns' fitted contribution
899///     (an unpenalized direction can run to infinity);
900///   - full arm: the whole marginal surface's fitted predictor.
901/// The raw `|β|∞` (and its term label) is reported only as diagnostic context;
902/// it never gates the abort. When `|η|∞` is below threshold the converged
903/// penalized fit is numerically trustworthy and this returns `None` — no error,
904/// even if individual coefficients are large.
905pub(crate) fn bernoulli_marginal_slope_runaway_error_from_beta(
906    block_beta: ArrayView1<'_, f64>,
907    design: &TermCollectionDesign,
908    spec: &TermCollectionSpec,
909    inner_converged: bool,
910    eval_label: &str,
911) -> Option<String> {
912    let full_beta = marginal_design_beta(design, block_beta);
913    let parametric_beta = mask_parametric_columns(design, spec, &full_beta);
914
915    let eta_parametric = marginal_fitted_eta_sup_norm(design, &parametric_beta);
916    let eta_full = marginal_fitted_eta_sup_norm(design, &full_beta);
917
918    let (eta_inf, explanation) = if eta_parametric >= BMS_PROBIT_SEPARATION_ETA_INF {
919        (
920            eta_parametric,
921            "an unpenalized parametric marginal direction has no stable finite probit optimum and its fitted predictor has run to the probit underflow scale",
922        )
923    } else if eta_full >= BMS_PROBIT_SEPARATION_ETA_INF {
924        (
925            eta_full,
926            "a marginal direction is trading off against the logslope surface; this is the under-constrained marginal/logslope coupling that appears when the score is correlated with the shared surface covariates",
927        )
928    } else {
929        // |η|∞ is bounded: even if raw coefficients are large (ill-conditioned
930        // non-orthonormal basis with cancellation), the converged penalized
931        // probit fit is numerically trustworthy. Do NOT abort.
932        return None;
933    };
934
935    let inner_status = if inner_converged {
936        "the inner solve reached a KKT certificate at this separation-scale predictor"
937    } else {
938        "the inner solve failed while already carrying a separation-scale predictor"
939    };
940    // Raw |β|∞ context (decisive quantity is |η|∞ above).
941    let beta_abs = full_beta
942        .iter()
943        .copied()
944        .filter(|v| v.is_finite())
945        .fold(0.0_f64, |acc, v| acc.max(v.abs()));
946
947    Some(format!(
948        "bernoulli marginal-slope probit marginal/logslope runaway detected in block \
949         'marginal_surface' during {eval_label}: the fitted marginal predictor has \
950         |η|∞={eta_inf:.3e} (numerical-degeneracy threshold \
951         {BMS_PROBIT_SEPARATION_ETA_INF:.1}; raw |β|∞={beta_abs:.3e} is reported for \
952         context only and does not gate this diagnostic). The joint design is \
953         identifiable; {explanation}. {inner_status}. The robust Jeffreys curvature \
954         path is already installed for this fit, so this diagnostic means the current \
955         coupled surface still drives the linear predictor to the probit underflow \
956         scale rather than a request for an external bias-reduction prior. Reduce or \
957         reparameterize the coupled marginal/logslope surface, or use a \
958         lower-dimensional logslope interaction. This is not a \
959         Matérn/Duchon polynomial-nullspace or cross-block gauge-priority \
960         failure."
961    ))
962}
963
964pub(crate) fn bernoulli_marginal_slope_runaway_error(
965    warm_start: &CustomFamilyWarmStart,
966    design: &TermCollectionDesign,
967    spec: &TermCollectionSpec,
968    inner_converged: bool,
969    eval_label: &str,
970) -> Option<String> {
971    let block_beta = warm_start.block_beta_view(0)?;
972    bernoulli_marginal_slope_runaway_error_from_beta(
973        block_beta,
974        design,
975        spec,
976        inner_converged,
977        eval_label,
978    )
979}
980
981#[cfg(test)]
982mod runaway_tests {
983    use super::*;
984    use gam_linalg::faer_ndarray::{FaerArrayView, factorize_symmetricwith_fallback, fast_xt_diag_y};
985    use gam_terms::smooth::{LinearCoefficientGeometry, LinearTermSpec};
986
987    // The marginal↔logslope overlap penalty is no longer installed as a pinned
988    // ridge (subsumed by the now-unconditional exact logslope orthogonalisation in
989    // `build_reduced_logslope_reparam`). The geometry helper is retained here under
990    // the test module because the basis-independence/weight-orthogonality unit tests
991    // below exercise it directly as the canonical overlap-direction reference.
992    pub(crate) fn marginal_logslope_overlap_penalty(
993        marginal_design: &DesignMatrix,
994        logslope_design: &DesignMatrix,
995        z: &Array1<f64>,
996        row_metric: &Array1<f64>,
997        marginal_offset: &Array1<f64>,
998        logslope_offset: &Array1<f64>,
999        marginal_baseline: f64,
1000        logslope_baseline: f64,
1001        probit_scale: f64,
1002    ) -> Result<Option<Array2<f64>>, String> {
1003        let marginal =
1004            marginal_design.try_to_dense_arc("marginal_logslope_overlap_penalty::marginal")?;
1005        let logslope =
1006            logslope_design.try_to_dense_arc("marginal_logslope_overlap_penalty::logslope")?;
1007        let n = marginal.nrows();
1008        if logslope.nrows() != n
1009            || z.len() != n
1010            || row_metric.len() != n
1011            || marginal_offset.len() != n
1012            || logslope_offset.len() != n
1013        {
1014            return Err(format!(
1015                "marginal/logslope overlap penalty row mismatch: marginal={}, logslope={}, z={}, row_metric={}, marginal_offset={}, logslope_offset={}",
1016                marginal.nrows(),
1017                logslope.nrows(),
1018                z.len(),
1019                row_metric.len(),
1020                marginal_offset.len(),
1021                logslope_offset.len(),
1022            ));
1023        }
1024        let p_m = marginal.ncols();
1025        let p_g = logslope.ncols();
1026        if p_m == 0 || p_g == 0 {
1027            return Ok(None);
1028        }
1029        if !marginal_baseline.is_finite()
1030            || !logslope_baseline.is_finite()
1031            || !probit_scale.is_finite()
1032            || probit_scale <= 0.0
1033            || z.iter().any(|v| !v.is_finite())
1034            || row_metric.iter().any(|v| !v.is_finite() || *v < 0.0)
1035            || marginal_offset.iter().any(|v| !v.is_finite())
1036            || logslope_offset.iter().any(|v| !v.is_finite())
1037        {
1038            return Err(
1039                "marginal/logslope overlap penalty requires finite pilot geometry and finite non-negative row metric"
1040                    .to_string(),
1041            );
1042        }
1043
1044        let mut marginal_effective = Array2::<f64>::zeros((n, p_m));
1045        let mut effective_logslope = Array2::<f64>::zeros((n, p_g));
1046        for i in 0..n {
1047            let q_i = marginal_offset[i] + marginal_baseline;
1048            let g_i = logslope_offset[i] + logslope_baseline;
1049            let sg = probit_scale * g_i;
1050            let c_i = (1.0 + sg * sg).sqrt();
1051            let logslope_factor =
1052                q_i * probit_scale * probit_scale * g_i / c_i + probit_scale * z[i];
1053            for j in 0..p_m {
1054                marginal_effective[[i, j]] = c_i * marginal[[i, j]];
1055            }
1056            for j in 0..p_g {
1057                effective_logslope[[i, j]] = logslope_factor * logslope[[i, j]];
1058            }
1059        }
1060        if effective_logslope.iter().all(|v| v.abs() <= f64::EPSILON) {
1061            return Ok(None);
1062        }
1063
1064        let mut gram = fast_xt_diag_x(&effective_logslope, row_metric);
1065        let gram_scale = gram.diag().iter().copied().fold(0.0_f64, f64::max);
1066        if !gram_scale.is_finite() || gram_scale <= 0.0 {
1067            return Ok(None);
1068        }
1069        let projection_ridge = (gram_scale * 1.0e-10).max(f64::EPSILON);
1070        for i in 0..p_g {
1071            gram[[i, i]] += projection_ridge;
1072        }
1073        let cross = fast_xt_diag_y(&effective_logslope, row_metric, &marginal_effective);
1074        let gram_view = FaerArrayView::new(&gram);
1075        let factor = factorize_symmetricwith_fallback(gram_view.as_ref(), Side::Lower)
1076            .map_err(|e| format!("marginal/logslope overlap Gram factorization failed: {e}"))?;
1077        let rhsview = FaerArrayView::new(&cross);
1078        let coeffs_mat = factor.solve(rhsview.as_ref());
1079        let coeffs = Array2::from_shape_fn((p_g, p_m), |(i, j)| coeffs_mat[(i, j)]);
1080        let projected_marginal = fast_ab(&effective_logslope, &coeffs);
1081        let mut penalty = fast_xt_diag_y(&marginal_effective, row_metric, &projected_marginal);
1082        penalty = (&penalty + &penalty.t()) * 0.5;
1083        let max_abs = penalty.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
1084        if !max_abs.is_finite() || max_abs <= 1.0e-12 {
1085            return Ok(None);
1086        }
1087        Ok(Some(penalty))
1088    }
1089
1090    // The raw-vs-effective counterexample. With q=g=0, s=1: c_i=1 (so M_eff=M)
1091    // and f_i=z_i (so G_eff=diag(z)·G). Pick M=[1,1,1]ᵀ and a two-column logslope
1092    // G whose RAW columns are both linearly independent of M (raw orthogonalising
1093    // [M|G] would keep BOTH — the old code returned None, no reduction), but whose
1094    // first EFFECTIVE column diag(z)·G[:,0] equals M_eff exactly. The effective
1095    // audit must therefore drop exactly one direction (r=1), proving it removes
1096    // the joint-Hessian rank-soft direction the raw audit could not see.
1097    #[test]
1098    pub(crate) fn effective_reduction_drops_score_weighted_confound_raw_audit_misses() {
1099        // G col0 = [1,2,3], col1 = [1,2,9]  (row-major rows: [1,1],[2,2],[3,9]).
1100        let m = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1101        let g = Array2::<f64>::from_shape_vec((3, 2), vec![1.0, 1.0, 2.0, 2.0, 3.0, 9.0]).unwrap();
1102        let z = Array1::from_vec(vec![1.0, 0.5, 1.0 / 3.0]);
1103        let w = Array1::<f64>::ones(3);
1104        let zero = Array1::<f64>::zeros(3);
1105
1106        // diag(z)·G[:,0] = [1·1, 0.5·2, (1/3)·3] = [1,1,1] = M_eff (fully aliased);
1107        // diag(z)·G[:,1] = [1·1, 0.5·2, (1/3)·9] = [1,1,3] (NOT in span([1,1,1])).
1108        let reparam = reduced_logslope_transform_effective(
1109            m.view(),
1110            g.view(),
1111            &z,
1112            &w,
1113            &zero,
1114            &zero,
1115            0.0,
1116            0.0,
1117            1.0,
1118        )
1119        .expect("effective reduction must succeed")
1120        .expect("effective audit must reduce the score-weighted confound (raw audit would not)");
1121        assert_eq!(
1122            reparam.ncols(),
1123            1,
1124            "exactly one effective-identifiable logslope direction should survive"
1125        );
1126
1127        // The surviving raw direction's EFFECTIVE image diag(z)·G·t must carry the
1128        // non-constant ([1,1,3]) content — i.e. it is the identifiable direction,
1129        // not the [1,1,1] confound. Its row variance must be clearly positive.
1130        let g_eff = {
1131            let mut e = Array2::<f64>::zeros((3, 2));
1132            for i in 0..3 {
1133                for j in 0..2 {
1134                    e[[i, j]] = z[i] * g[[i, j]];
1135                }
1136            }
1137            e
1138        };
1139        let img = g_eff.dot(&reparam.column(0));
1140        let mean = img.iter().sum::<f64>() / 3.0;
1141        let var = img.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / 3.0;
1142        assert!(
1143            var > 1.0e-6,
1144            "kept direction must be the identifiable (non-constant) effective column, var={var}"
1145        );
1146    }
1147
1148    // The single-column fully-confounded case: G=[1,2,3]ᵀ, z=[1,1/2,1/3] gives
1149    // G_eff=[1,1,1]=M_eff, so the entire effective logslope image is in the
1150    // effective marginal span (r==0). The helper returns None (keep the raw
1151    // design) rather than emitting a zero-width transform that would delete the
1152    // score-effect surface.
1153    #[test]
1154    pub(crate) fn effective_reduction_fully_confounded_single_column_returns_none() {
1155        let m = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1156        let g = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
1157        let z = Array1::from_vec(vec![1.0, 0.5, 1.0 / 3.0]);
1158        let w = Array1::<f64>::ones(3);
1159        let zero = Array1::<f64>::zeros(3);
1160        let reparam = reduced_logslope_transform_effective(
1161            m.view(),
1162            g.view(),
1163            &z,
1164            &w,
1165            &zero,
1166            &zero,
1167            0.0,
1168            0.0,
1169            1.0,
1170        )
1171        .expect("effective reduction must succeed");
1172        assert!(
1173            reparam.is_none(),
1174            "fully effective-confounded logslope must keep raw design (None), not a 0-width block"
1175        );
1176    }
1177
1178    // No effective confound: both effective logslope columns stay independent of
1179    // M_eff, so nothing is reduced (r==p_g ⇒ None) and healthy fits are untouched.
1180    #[test]
1181    pub(crate) fn effective_reduction_no_confound_returns_none() {
1182        let m = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1183        // diag(z)·col gives non-constant images for both columns under z below.
1184        let g = Array2::<f64>::from_shape_vec((3, 2), vec![1.0, 0.0, 0.0, 1.0, 0.0, 0.0]).unwrap();
1185        let z = Array1::from_vec(vec![1.0, 2.0, 3.0]);
1186        let w = Array1::<f64>::ones(3);
1187        let zero = Array1::<f64>::zeros(3);
1188        let reparam = reduced_logslope_transform_effective(
1189            m.view(),
1190            g.view(),
1191            &z,
1192            &w,
1193            &zero,
1194            &zero,
1195            0.0,
1196            0.0,
1197            1.0,
1198        )
1199        .expect("effective reduction must succeed");
1200        assert!(
1201            reparam.is_none(),
1202            "no effective confound ⇒ no reduction (raw design kept unchanged)"
1203        );
1204    }
1205
1206    #[test]
1207    pub(crate) fn spatial_joint_setup_counts_only_learned_penalties_in_rho() {
1208        let data = Array2::<f64>::zeros((3, 1));
1209        let empty_terms = TermCollectionSpec {
1210            linear_terms: Vec::new(),
1211            random_effect_terms: Vec::new(),
1212            smooth_terms: Vec::new(),
1213        };
1214        let setup = joint_setup(
1215            data.view(),
1216            &empty_terms,
1217            &empty_terms,
1218            2,
1219            3,
1220            &[0.4],
1221            &SpatialLengthScaleOptimizationOptions::default(),
1222        );
1223
1224        assert_eq!(
1225            setup.rho_dim(),
1226            6,
1227            "BMS spatial setup rho must contain only learned marginal/logslope/auxiliary penalties; fixed physical ridges are carried by PenaltyMatrix::Fixed"
1228        );
1229    }
1230
1231    #[test]
1232    pub(crate) fn overlap_penalty_targets_score_weighted_logslope_span() {
1233        let marginal = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1234            Array2::from_shape_vec((4, 1), vec![0.0, 1.0, 2.0, 3.0]).unwrap(),
1235        ));
1236        let logslope = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1237            Array2::from_shape_vec((4, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap(),
1238        ));
1239        let z = Array1::from_vec(vec![0.0, 1.0, 2.0, 3.0]);
1240        let row_metric = Array1::ones(4);
1241        let offsets = Array1::zeros(4);
1242
1243        let penalty = marginal_logslope_overlap_penalty(
1244            &marginal,
1245            &logslope,
1246            &z,
1247            &row_metric,
1248            &offsets,
1249            &offsets,
1250            0.0,
1251            0.0,
1252            1.0,
1253        )
1254        .expect("overlap penalty should build")
1255        .expect("marginal signal lies in the pilot logslope Jacobian span");
1256
1257        assert_eq!(penalty.dim(), (1, 1));
1258        assert!((penalty[[0, 0]] - 14.0).abs() < 1.0e-6);
1259    }
1260
1261    #[test]
1262    pub(crate) fn overlap_penalty_skips_weight_orthogonal_channels() {
1263        let marginal = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1264            Array2::from_shape_vec((4, 1), vec![-1.0, 1.0, -1.0, 1.0]).unwrap(),
1265        ));
1266        let logslope = DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1267            Array2::from_shape_vec((4, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap(),
1268        ));
1269        let z = Array1::ones(4);
1270        let row_metric = Array1::ones(4);
1271        let offsets = Array1::zeros(4);
1272
1273        let penalty = marginal_logslope_overlap_penalty(
1274            &marginal,
1275            &logslope,
1276            &z,
1277            &row_metric,
1278            &offsets,
1279            &offsets,
1280            0.0,
1281            0.0,
1282            1.0,
1283        )
1284        .expect("overlap penalty should build");
1285
1286        assert!(penalty.is_none());
1287    }
1288
1289    // ── Fitted-η separation guard fixtures ───────────────────────────────
1290    //
1291    // The runaway guard tests the FITTED marginal predictor sup-norm
1292    // `|η|∞ = max_i |X[i,:]·β|`, not raw `|β|∞`. These helpers build minimal
1293    // `TermCollectionDesign` / `TermCollectionSpec` pairs from a dense design so
1294    // the criterion is exercised deterministically with no data files.
1295
1296    fn dense_marginal_design(
1297        x: Array2<f64>,
1298        intercept_range: std::ops::Range<usize>,
1299        linear_ranges: Vec<(String, std::ops::Range<usize>)>,
1300    ) -> TermCollectionDesign {
1301        TermCollectionDesign {
1302            design: DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(x)),
1303            penalties: Vec::new(),
1304            nullspace_dims: Vec::new(),
1305            penaltyinfo: Vec::new(),
1306            dropped_penaltyinfo: Vec::new(),
1307            coefficient_lower_bounds: None,
1308            linear_constraints: None,
1309            intercept_range,
1310            linear_ranges,
1311            random_effect_ranges: Vec::new(),
1312            random_effect_levels: Vec::new(),
1313            smooth: gam_terms::smooth::SmoothDesign {
1314                term_designs: Vec::new(),
1315                penalties: Vec::new(),
1316                nullspace_dims: Vec::new(),
1317                penaltyinfo: Vec::new(),
1318                dropped_penaltyinfo: Vec::new(),
1319                terms: Vec::new(),
1320                coefficient_lower_bounds: None,
1321                linear_constraints: None,
1322            },
1323        }
1324    }
1325
1326    fn linear_term(name: &str, feature_col: usize) -> LinearTermSpec {
1327        LinearTermSpec {
1328            name: name.to_string(),
1329            feature_col,
1330            feature_cols: vec![feature_col],
1331            categorical_levels: vec![],
1332            double_penalty: false,
1333            coefficient_geometry: LinearCoefficientGeometry::default(),
1334            coefficient_min: None,
1335            coefficient_max: None,
1336        }
1337    }
1338
1339    fn empty_spec() -> TermCollectionSpec {
1340        TermCollectionSpec {
1341            linear_terms: Vec::new(),
1342            random_effect_terms: Vec::new(),
1343            smooth_terms: Vec::new(),
1344        }
1345    }
1346
1347    /// Regression lock for the false-positive fix: an ill-conditioned
1348    /// non-orthonormal basis (two identical/collinear columns) with a large
1349    /// cancelling coefficient `β=[60,-60]` yields `Xβ ≡ 0` — a perfectly
1350    /// bounded fitted predictor. The guard MUST NOT fire even though raw
1351    /// `|β|∞=60` is far above the old `40.0` coefficient threshold. This is the
1352    /// exact pathology that aborted valid biobank fits.
1353    #[test]
1354    pub(crate) fn runaway_guard_silent_when_huge_beta_cancels_to_bounded_eta() {
1355        // Two identical columns ⇒ Xβ = (β0+β1)·col; β=[60,-60] ⇒ Xβ ≡ 0.
1356        let x = Array2::<f64>::from_shape_vec((4, 2), vec![1.0; 8]).unwrap();
1357        let design = dense_marginal_design(x, 0..0, Vec::new());
1358        let beta = Array1::from_vec(vec![60.0, -60.0]);
1359
1360        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1361            beta.view(),
1362            &design,
1363            &empty_spec(),
1364            true,
1365            "regression-fixture",
1366        );
1367        assert!(
1368            msg.is_none(),
1369            "huge cancelling β with bounded fitted η must NOT trip the runaway guard; got {msg:?}"
1370        );
1371    }
1372
1373    /// A genuinely separating full marginal surface: a single column with a
1374    /// large coefficient drives `|η|∞ = 40 ≥ 35`, so the guard fires and names
1375    /// the marginal/logslope coupling explanation.
1376    #[test]
1377    pub(crate) fn runaway_guard_fires_when_fitted_eta_exceeds_threshold() {
1378        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1379        let design = dense_marginal_design(x, 0..0, Vec::new());
1380        let beta = Array1::from_vec(vec![40.0]);
1381
1382        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1383            beta.view(),
1384            &design,
1385            &empty_spec(),
1386            true,
1387            "separation-fixture",
1388        )
1389        .expect("fitted |η|∞=40 ≥ 35 must trip the runaway guard");
1390
1391        assert!(msg.contains("marginal/logslope runaway"));
1392        assert!(msg.contains("|η|∞"));
1393        assert!(msg.contains("4.000e1"));
1394        assert!(msg.contains("score is correlated with the shared surface covariates"));
1395        assert!(msg.contains("not a Matérn/Duchon polynomial-nullspace"));
1396        assert!(msg.contains("KKT certificate"));
1397    }
1398
1399    /// An unpenalized parametric direction can genuinely separate (no smoothness
1400    /// penalty bounding it). When its fitted contribution reaches the η scale
1401    /// the parametric arm fires first and names the parametric explanation.
1402    #[test]
1403    pub(crate) fn runaway_guard_names_unpenalized_parametric_direction_via_fitted_eta() {
1404        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1405        let design = dense_marginal_design(x, 0..0, vec![("sex".to_string(), 0..1)]);
1406        let mut spec = empty_spec();
1407        spec.linear_terms.push(linear_term("sex", 0));
1408        let beta = Array1::from_vec(vec![41.0]);
1409
1410        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1411            beta.view(),
1412            &design,
1413            &spec,
1414            true,
1415            "parametric-fixture",
1416        )
1417        .expect("parametric fitted |η|∞=41 ≥ 35 must trip the runaway guard");
1418
1419        assert!(msg.contains("unpenalized parametric marginal direction"));
1420        assert!(msg.contains("|η|∞"));
1421        assert!(msg.contains("robust Jeffreys curvature path is already installed"));
1422        assert!(msg.contains("not a Matérn/Duchon polynomial-nullspace"));
1423    }
1424
1425    /// A non-converged inner solve with a BOUNDED fitted predictor must NOT
1426    /// surface the separation error — the non-convergence is reported through
1427    /// the existing downstream path, not as a runaway.
1428    #[test]
1429    pub(crate) fn runaway_guard_silent_for_nonconverged_but_bounded_eta() {
1430        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1431        let design = dense_marginal_design(x, 0..0, Vec::new());
1432        let beta = Array1::from_vec(vec![5.0]);
1433
1434        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1435            beta.view(),
1436            &design,
1437            &empty_spec(),
1438            false,
1439            "nonconverged-fixture",
1440        );
1441        assert!(
1442            msg.is_none(),
1443            "bounded fitted η must not raise the separation error even when the inner solve did not converge; got {msg:?}"
1444        );
1445    }
1446
1447    /// A genuinely separating fit that ALSO failed to converge still surfaces
1448    /// the runaway error, and reports the non-converged inner status.
1449    #[test]
1450    pub(crate) fn runaway_guard_fires_for_nonconverged_separating_eta() {
1451        let x = Array2::<f64>::from_shape_vec((3, 1), vec![1.0, 1.0, 1.0]).unwrap();
1452        let design = dense_marginal_design(x, 0..0, Vec::new());
1453        let beta = Array1::from_vec(vec![50.0]);
1454
1455        let msg = bernoulli_marginal_slope_runaway_error_from_beta(
1456            beta.view(),
1457            &design,
1458            &empty_spec(),
1459            false,
1460            "nonconverged-separating-fixture",
1461        )
1462        .expect("separating |η|∞ at non-convergence must still trip the guard");
1463
1464        assert!(msg.contains(
1465            "the inner solve failed while already carrying a separation-scale predictor"
1466        ));
1467    }
1468
1469    /// Regression lock for gam#370: the pre-fit identifiability audit evaluates
1470    /// every block's effective Jacobian at the empty/zero β with
1471    /// `family_scalars: None`. The BMS marginal and logslope blocks own the
1472    /// logslope design + offset, so `g_i = offset_s[i]` (the fitted logslope
1473    /// baseline, generically nonzero) at β = 0 — and the old code hard-errored
1474    /// ("requires BmsFamilyScalars when beta != 0 … got family_scalars: None")
1475    /// because it read `any_nonzero_g` and demanded a caller-supplied scalar it
1476    /// could in fact reconstruct itself. Both blocks must now self-compute the
1477    /// closed-form `c_i = sqrt(1 + (s·g_i)²)` (and the logslope factor) from
1478    /// owned data and return a finite Jacobian, NOT an error. This makes every
1479    /// `logslope_formula` / `linkwiggle(...)` BMS fit reachable through the
1480    /// Python `gamfit.fit` API (the audit no longer aborts before the fit).
1481    #[test]
1482    pub(crate) fn bms_block_jacobians_self_compute_at_audit_empty_beta_nonzero_logslope_baseline() {
1483        use std::sync::Arc;
1484        let n = 4usize;
1485        let marginal =
1486            Arc::new(Array2::<f64>::from_shape_vec((n, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap());
1487        let logslope =
1488            Arc::new(Array2::<f64>::from_shape_vec((n, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap());
1489        let offset_m = Array1::<f64>::zeros(n);
1490        // Nonzero logslope baseline absorbed into offset_s — the exact pooled
1491        // probit pilot value that is "essentially never exactly 0".
1492        let g_baseline = 0.3_f64;
1493        let offset_s = Array1::<f64>::from_elem(n, g_baseline);
1494        let z = Arc::new(Array1::from_vec(vec![-0.7, 0.2, 0.9, 1.4]));
1495        let s = 1.0_f64;
1496
1497        // The pre-fit audit linearization state: EMPTY β, no family scalars.
1498        let beta: Vec<f64> = Vec::new();
1499        let state = FamilyLinearizationState {
1500            beta: &beta,
1501            family_scalars: None,
1502            channel_hessian: None,
1503            probit_frailty_scale: s,
1504        };
1505
1506        let marginal_jac = BmsMarginalJacobian::new(
1507            Arc::clone(&marginal),
1508            Arc::clone(&logslope),
1509            offset_m.clone(),
1510            offset_s.clone(),
1511            1,
1512        );
1513        let j_m = marginal_jac
1514            .effective_jacobian_rows(&state, 0..n)
1515            .expect("BMS marginal Jacobian must self-compute at audit empty β (gam#370)");
1516        // ∂η_i/∂β_m = c_i · M[i,:], c_i = sqrt(1 + (s·g_baseline)²), M[i,0]=1.
1517        let c_expected = (1.0 + (s * g_baseline).powi(2)).sqrt();
1518        assert_eq!(j_m.dim(), (n, 1));
1519        for i in 0..n {
1520            assert!(
1521                (j_m[[i, 0]] - c_expected).abs() < 1e-12,
1522                "marginal J[{i}] = {} != closed-form c_i = {c_expected}",
1523                j_m[[i, 0]]
1524            );
1525        }
1526
1527        let logslope_jac = BmsLogslopeJacobian::new(
1528            Arc::clone(&marginal),
1529            Arc::clone(&logslope),
1530            offset_m,
1531            offset_s,
1532            Arc::clone(&z),
1533            1,
1534        );
1535        let j_s = logslope_jac
1536            .effective_jacobian_rows(&state, 0..n)
1537            .expect("BMS logslope Jacobian must self-compute at audit empty β (gam#370)");
1538        // ∂η_i/∂β_s = (q_i·s²·g_i/c_i + s·z_i)·G[i,:]; at empty β, q_i = 0 and
1539        // g_i = g_baseline, so the factor is s²·0·… + s·z_i = s·z_i (G[i,0]=1).
1540        assert_eq!(j_s.dim(), (n, 1));
1541        for i in 0..n {
1542            let expected = s * z[i];
1543            assert!(
1544                (j_s[[i, 0]] - expected).abs() < 1e-12,
1545                "logslope J[{i}] = {} != closed-form factor {expected}",
1546                j_s[[i, 0]]
1547            );
1548            assert!(j_s[[i, 0]].is_finite());
1549        }
1550    }
1551}
1552
1553pub(crate) fn build_marginal_blockspec_bms(
1554    design: &TermCollectionDesign,
1555    baseline: f64,
1556    offset: &Array1<f64>,
1557    rho: Array1<f64>,
1558    beta_hint: Option<Array1<f64>>,
1559    logslope_design: &TermCollectionDesign,
1560    logslope_offset: &Array1<f64>,
1561    logslope_baseline: f64,
1562    p_marginal: usize,
1563    influence_columns: Option<&Array2<f64>>,
1564    influence_ridge_log_lambda: f64,
1565) -> Result<ParameterBlockSpec, String> {
1566    let offset_m = offset + baseline;
1567    let offset_s = logslope_offset + logslope_baseline;
1568    let raw_marginal_dense = design
1569        .design
1570        .try_to_dense_arc("build_marginal_blockspec_bms::marginal")?;
1571    let marginal_dense =
1572        widen_marginal_dense_with_influence(&raw_marginal_dense, influence_columns)?;
1573    let logslope_dense = logslope_design
1574        .design
1575        .try_to_dense_arc("build_marginal_blockspec_bms::logslope")?;
1576    let callback: Arc<dyn BlockEffectiveJacobian> = Arc::new(BmsMarginalJacobian {
1577        marginal_dense: Arc::clone(&marginal_dense),
1578        logslope_dense,
1579        offset_m: offset_m.clone(),
1580        offset_s,
1581        p_marginal,
1582    });
1583    let (penalties, nullspace_dims, initial_log_lambdas) = marginal_penalties_with_influence_ridge(
1584        design,
1585        &rho,
1586        influence_columns,
1587        influence_ridge_log_lambda,
1588    )?;
1589    Ok(ParameterBlockSpec {
1590        name: "marginal_surface".to_string(),
1591        design: DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1592            (*marginal_dense).clone(),
1593        )),
1594        offset: offset_m,
1595        penalties,
1596        nullspace_dims,
1597        initial_log_lambdas,
1598        initial_beta: widen_marginal_beta_hint(beta_hint, p_marginal),
1599        // Canonical-gauge architecture (issue #322): give marginal_surface
1600        // strictly higher priority than logslope_surface so the priority-
1601        // ordered RRQR in `canonicalize_for_identifiability` presents
1602        // marginal columns first and routes any cross-block alias drop into
1603        // logslope.  Equal priorities (the previous default of 100/100)
1604        // produced a same-priority `hard_alias_pair` whenever a
1605        // high-dimensional smooth — e.g. `s(x, type=duchon, centers>=6)`
1606        // in the location block — accidentally spanned the logslope basis
1607        // direction, leaving the joint Hessian with a structural null and
1608        // the spectral Newton solve refusing to step.  The values mirror
1609        // the canonical-gauge entry for survival marginal-slope
1610        // (marginal=150, logslope=120).
1611        gauge_priority: GAUGE_PRIORITY_MARGINAL,
1612        jacobian_callback: Some(callback),
1613        stacked_design: None,
1614        stacked_offset: None,
1615    })
1616}
1617
1618pub(crate) fn build_logslope_blockspec_bms(
1619    design: &TermCollectionDesign,
1620    baseline: f64,
1621    offset: &Array1<f64>,
1622    rho: Array1<f64>,
1623    beta_hint: Option<Array1<f64>>,
1624    marginal_design: &TermCollectionDesign,
1625    marginal_offset: &Array1<f64>,
1626    marginal_baseline: f64,
1627    z: Arc<Array1<f64>>,
1628    p_marginal: usize,
1629    influence_columns: Option<&Array2<f64>>,
1630) -> Result<ParameterBlockSpec, String> {
1631    let offset_s = offset + baseline;
1632    let offset_m = marginal_offset + marginal_baseline;
1633    let raw_marginal_dense = marginal_design
1634        .design
1635        .try_to_dense_arc("build_logslope_blockspec_bms::marginal")?;
1636    // The logslope Jacobian reconstructs q_i = M·β_m + offset_m; with the
1637    // absorbed influence columns folded into the marginal index, the marginal
1638    // reference design and p_marginal MUST be the widened [M | Z̃] / (p_m+p₁)
1639    // so β_m slices to the absorber and q_i carries the Z̃·γ shift (#461).
1640    let marginal_dense =
1641        widen_marginal_dense_with_influence(&raw_marginal_dense, influence_columns)?;
1642    let logslope_dense = design
1643        .design
1644        .try_to_dense_arc("build_logslope_blockspec_bms::logslope")?;
1645    let callback: Arc<dyn BlockEffectiveJacobian> = Arc::new(BmsLogslopeJacobian {
1646        marginal_dense,
1647        logslope_dense: Arc::clone(&logslope_dense),
1648        offset_m,
1649        offset_s: offset_s.clone(),
1650        z,
1651        p_marginal,
1652    });
1653    Ok(ParameterBlockSpec {
1654        name: "logslope_surface".to_string(),
1655        design: DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from(
1656            (*logslope_dense).clone(),
1657        )),
1658        offset: offset_s,
1659        penalties: design.penalties_as_penalty_matrix(),
1660        nullspace_dims: design.nullspace_dims.clone(),
1661        initial_log_lambdas: rho,
1662        initial_beta: beta_hint,
1663        // Canonical-gauge architecture (issue #322): logslope is strictly
1664        // lower priority than marginal so the priority-ordered RRQR in
1665        // `canonicalize_for_identifiability` demotes a shared cross-block
1666        // direction here, not in marginal.  Mirrors the survival-mgs
1667        // value (marginal=150, logslope=120).  See the matching comment
1668        // on `build_marginal_blockspec_bms` for the failure mode this
1669        // resolves.
1670        gauge_priority: GAUGE_PRIORITY_LOGSLOPE,
1671        jacobian_callback: Some(callback),
1672        stacked_design: None,
1673        stacked_offset: None,
1674    })
1675}
1676
1677pub(crate) fn build_deviation_aux_blockspec(
1678    name: &str,
1679    prepared: &DeviationPrepared,
1680    rho: Array1<f64>,
1681    beta_hint: Option<Array1<f64>>,
1682) -> Result<ParameterBlockSpec, String> {
1683    let mut block = prepared.block.clone();
1684    block.initial_log_lambdas = Some(rho);
1685    let candidate_beta = beta_hint.or_else(|| Some(Array1::<f64>::zeros(block.design.ncols())));
1686    block.initial_beta = candidate_beta
1687        .map(|beta| {
1688            let zero = Array1::<f64>::zeros(beta.len());
1689            project_monotone_feasible_beta(&prepared.runtime, &zero, &beta, name)
1690        })
1691        .transpose()?;
1692    let mut spec = block.intospec(name)?;
1693    // Deviation auxiliary blocks (score_warp_dev, link_dev, and any
1694    // future flex block routed through this builder) model pure
1695    // shape modifications on top of parametric anchors. They must
1696    // never own a shared affine direction with the parametric
1697    // (time / marginal / logslope) blocks. The canonical-gauge
1698    // selector drops shared directions from blocks with lower
1699    // gauge_priority first; assigning a value below the parametric
1700    // default (GAUGE_PRIORITY_CANDIDATE_FLEX) realises that contract
1701    // automatically.
1702    spec.gauge_priority = match name {
1703        "link_dev" => GAUGE_PRIORITY_LINK_DEV,
1704        // score_warp_dev gets a slightly higher priority than link_dev
1705        // because in mixed-flex configurations (both blocks present)
1706        // link_dev is the residualised one (orthogonalised against the
1707        // parametric anchors PLUS the already-prepared score_warp
1708        // basis at construction time); link_dev should therefore yield
1709        // first when an alias still survives into the joint design.
1710        "score_warp_dev" => GAUGE_PRIORITY_SCORE_WARP_DEV,
1711        _ => GAUGE_PRIORITY_DEVIATION_DEFAULT,
1712    };
1713    Ok(spec)
1714}
1715
1716pub(crate) fn push_deviation_aux_blockspecs(
1717    blocks: &mut Vec<ParameterBlockSpec>,
1718    rho: &Array1<f64>,
1719    cursor: &mut usize,
1720    score_warp_prepared: Option<&DeviationPrepared>,
1721    link_dev_prepared: Option<&DeviationPrepared>,
1722    score_warp_beta_hint: Option<Array1<f64>>,
1723    link_dev_beta_hint: Option<Array1<f64>>,
1724) -> Result<(), String> {
1725    if let Some(prepared) = score_warp_prepared {
1726        let rho_h = rho
1727            .slice(s![*cursor..*cursor + prepared.block.penalties.len()])
1728            .to_owned();
1729        *cursor += prepared.block.penalties.len();
1730        blocks.push(build_deviation_aux_blockspec(
1731            "score_warp_dev",
1732            prepared,
1733            rho_h,
1734            score_warp_beta_hint,
1735        )?);
1736    }
1737    if let Some(prepared) = link_dev_prepared {
1738        let rho_w = rho
1739            .slice(s![*cursor..*cursor + prepared.block.penalties.len()])
1740            .to_owned();
1741        blocks.push(build_deviation_aux_blockspec(
1742            "link_dev",
1743            prepared,
1744            rho_w,
1745            link_dev_beta_hint,
1746        )?);
1747    }
1748    Ok(())
1749}
1750
1751fn inner_fit(
1752    family: &BernoulliMarginalSlopeFamily,
1753    blocks: &[ParameterBlockSpec],
1754    options: &BlockwiseFitOptions,
1755) -> Result<UnifiedFitResult, String> {
1756    let mut options = options.clone();
1757    // BMS carries fixed physical ridge penalties that regularize coefficient
1758    // geometry but are not REML coordinates. The exact hyper-Hessian route can
1759    // stall after that projection; the family has a dedicated exact-gradient
1760    // path with full-data polish, so make it the primary nested smoother.
1761    options.use_outer_hessian = false;
1762    options.outer_tol = options.outer_tol.max(2.0e-5);
1763    fit_custom_family(family, blocks, &options).map_err(|e| e.to_string())
1764}
1765
1766pub fn fit_bernoulli_marginal_slope_terms(
1767    data: ArrayView2<'_, f64>,
1768    spec: BernoulliMarginalSlopeTermSpec,
1769    options: &BlockwiseFitOptions,
1770    kappa_options: &SpatialLengthScaleOptimizationOptions,
1771    policy: &gam_runtime::resource::ResourcePolicy,
1772) -> Result<BernoulliMarginalSlopeFitResult, String> {
1773    let mut spec = spec;
1774    let data_view = data;
1775    validate_spec(data_view, &spec)?;
1776    // Freeze the measure-jet representer length-scale dial on the coupled
1777    // marginal + log-slope surfaces (#1116). A shared mjs basis feeds BOTH
1778    // blocks; a design-moving ℓ on those shared covariates lets the outer
1779    // search reach a sharp ℓ where a marginal smooth direction trades off
1780    // against the log-slope into a separation-scale runaway (|β|→1e3). The auto
1781    // (frozen) ℓ is well-conditioned here — the pre-#1116 behavior this
1782    // restores. ℓ-learning stays on for single-surface (e.g. Gaussian) fits,
1783    // where there is no marginal/log-slope coupling to destabilize.
1784    let mjs_frozen_marginal =
1785        gam_terms::smooth::freeze_measure_jet_length_scale_learning(&mut spec.marginalspec);
1786    let mjs_frozen_logslope =
1787        gam_terms::smooth::freeze_measure_jet_length_scale_learning(&mut spec.logslopespec);
1788    if mjs_frozen_marginal + mjs_frozen_logslope > 0 {
1789        log::info!(
1790            "[BMS spatial] froze measure-jet length-scale learning on {} marginal + {} log-slope \
1791             term(s): the coupled surface keeps ℓ at its conditioned auto value (#1116)",
1792            mjs_frozen_marginal,
1793            mjs_frozen_logslope
1794        );
1795    }
1796    let mut effective_kappa_options = kappa_options.clone();
1797    // Honor explicit `length_scale=X` in the user's formula: when every
1798    // spatial term in BOTH the marginal mean and log-slope blocks carries
1799    // a user-supplied scalar length scale and no per-axis anisotropy is
1800    // requested, there is nothing for the joint-spatial outer optimizer
1801    // to do. Routing through it anyway spends ~80 outer ARC iters stalled
1802    // at the user's chosen ρ (the n-block ARC's first proposed step lands
1803    // at the box corner and never recovers), then falls through to the
1804    // ρ-only "custom family" path which is what we wanted all along.
1805    // Short-circuit straight to the ρ-only path.
1806    let kappa_locked_marginal = gam_terms::smooth::all_spatial_terms_kappa_fixed(&spec.marginalspec);
1807    let kappa_locked_logslope = gam_terms::smooth::all_spatial_terms_kappa_fixed(&spec.logslopespec);
1808    if effective_kappa_options.enabled && kappa_locked_marginal && kappa_locked_logslope {
1809        log::info!(
1810            "[BMS spatial] disabling κ/ψ optimization: every spatial term has an \
1811             explicit length_scale and no anisotropy; user-supplied kernel scale is fixed"
1812        );
1813        effective_kappa_options.enabled = false;
1814    }
1815    let flex_spatial_pilot_path = (spec.score_warp.is_some() || spec.link_dev.is_some())
1816        && spec.y.len() >= BMS_FLEX_SPATIAL_OUTER_PILOT_ROW_THRESHOLD
1817        && effective_kappa_options.enabled;
1818    if flex_spatial_pilot_path {
1819        let marginal_terms = spatial_length_scale_term_indices(&spec.marginalspec);
1820        let logslope_terms = spatial_length_scale_term_indices(&spec.logslopespec);
1821        let marginal_updates = apply_spatial_anisotropy_pilot_initializer(
1822            data_view,
1823            &mut spec.marginalspec,
1824            &marginal_terms,
1825            effective_kappa_options.pilot_subsample_threshold,
1826            &effective_kappa_options,
1827        );
1828        let logslope_updates = apply_spatial_anisotropy_pilot_initializer(
1829            data_view,
1830            &mut spec.logslopespec,
1831            &logslope_terms,
1832            effective_kappa_options.pilot_subsample_threshold,
1833            &effective_kappa_options,
1834        );
1835        effective_kappa_options.enabled = false;
1836        log::info!(
1837            "[BMS spatial] n={} flex=true pilot_geometry_updates={} iterative_spatial_outer=false reason=large-flex-spatial-pilot",
1838            spec.y.len(),
1839            marginal_updates + logslope_updates,
1840        );
1841    }
1842    let (z_standardized, z_normalization) = standardize_latent_z_with_policy(
1843        &spec.z,
1844        &spec.weights,
1845        "bernoulli-marginal-slope",
1846        &spec.latent_z_policy,
1847    )?;
1848    spec.z = z_standardized;
1849    let sigma_learnable = matches!(
1850        &spec.frailty,
1851        FrailtySpec::GaussianShift { sigma_fixed: None }
1852    );
1853    let initial_sigma = match &spec.frailty {
1854        FrailtySpec::GaussianShift {
1855            sigma_fixed: Some(s),
1856        } => Some(*s),
1857        FrailtySpec::GaussianShift { sigma_fixed: None } => Some(0.5),
1858        FrailtySpec::None => None,
1859        FrailtySpec::HazardMultiplier { .. } => {
1860            return Err(
1861                "internal: validate_spec should have rejected unsupported marginal-slope frailty"
1862                    .to_string(),
1863            );
1864        }
1865    };
1866    let probit_scale = probit_frailty_scale(initial_sigma);
1867    let (_raw_joint_designs, mut joint_specs) = build_term_collection_designs_and_freeze_joint(
1868        data_view,
1869        &[spec.marginalspec.clone(), spec.logslopespec.clone()],
1870    )
1871    .map_err(|e| e.to_string())?;
1872    let marginalspec_boot = joint_specs.remove(0);
1873    let logslopespec_boot = joint_specs.remove(0);
1874    // Rebuild the probe designs from the frozen `*_boot` specs so the probe's
1875    // penalty topology matches the topology produced by every other build path
1876    // in this optimization. The spatial optimizer's own bootstrap
1877    // (`build_term_collection_designs_and_freeze_joint(data, &[marginalspec_boot,
1878    // logslopespec_boot])` inside `optimize_spatial_length_scale_exact_joint`)
1879    // and every subsequent kappa-driven rebuild feed the basis builder the
1880    // captured `FrozenTransform` identifiability. Applying that captured
1881    // transform to the same kernel can land the structural null-space block on
1882    // the other side of `build_nullspace_shrinkage_penalty`'s spectral
1883    // tolerance, so the raw and frozen builds disagree on whether the trend
1884    // ridge survives as an active penalty candidate. Without this rebuild,
1885    // `marginal_penalty_count` / `logslope_design.penalties.len()` are taken
1886    // from the raw build but every subsequent evaluator measures the frozen
1887    // build, and `evaluate_custom_family_joint_hyper` refuses with a
1888    // "joint hyper rho dimension mismatch". Mirrors the CTN-side fix in
1889    // `fit_transformation_normal`.
1890    let (mut joint_designs, _) = build_term_collection_designs_and_freeze_joint(
1891        data_view,
1892        &[marginalspec_boot.clone(), logslopespec_boot.clone()],
1893    )
1894    .map_err(|e| format!("failed to rebuild frozen probe BMS joint designs: {e}"))?;
1895    let marginal_design = joint_designs.remove(0);
1896    let logslope_design = joint_designs.remove(0);
1897    // #905: the conditional `E[z|C]`/`Var(z|C)` Rao gate conditions on the
1898    // marginal-index span a(C) (= the marginal design columns), which is
1899    // exactly where the `b(C)·m(C)` leakage lives. It is engaged only on the
1900    // raw-z path (no CTN Stage-1 influence absorber); when an absorber is
1901    // active the conditional leakage is already absorbed (#461) and the
1902    // widened-marginal predict seam must not be perturbed by replacing z.
1903    let absorber_active = spec
1904        .score_influence_jacobian
1905        .as_ref()
1906        .is_some_and(|j| j.ncols() > 0);
1907    let conditioning_dense = if absorber_active {
1908        None
1909    } else {
1910        Some(
1911            marginal_design
1912                .design
1913                .try_to_dense_arc("bernoulli marginal-slope conditional latent-z gate")?,
1914        )
1915    };
1916    let (latent_measure, latent_z_calibration) = build_latent_measure_with_geometry(
1917        &spec.z,
1918        &spec.weights,
1919        &spec.latent_z_policy,
1920        conditioning_dense.as_ref().map(|d| d.view()),
1921    )?;
1922    if latent_measure.is_empirical() && sigma_learnable {
1923        return Err("empirical latent-measure marginal-slope calibration requires fixed GaussianShift sigma; learnable sigma derivatives must be fit under the standard-normal latent measure"
1924                    .to_string());
1925    }
1926
1927    let y = Arc::new(spec.y.clone());
1928    let weights = Arc::new(spec.weights.clone());
1929    // Apply rank-INT calibration to training z before any downstream
1930    // consumer (pooled probit baseline, term-collection designs, the
1931    // family's PIRLS loops) sees it. The calibration is persisted on the
1932    // fit result so prediction applies the identical monotone map.
1933    let z = match &latent_z_calibration {
1934        LatentMeasureCalibration::None => Arc::new(spec.z.clone()),
1935        LatentMeasureCalibration::RankInverseNormal(cal) => {
1936            Arc::new(cal.apply_to_training(&spec.z)?)
1937        }
1938        LatentMeasureCalibration::ConditionalLocationScale(cal) => {
1939            // ζ = (z − m(C))/√v(C) on the marginal-index span. The conditioning
1940            // block was built above (raw-z path only), so it is present here.
1941            let a_block = conditioning_dense.as_ref().ok_or_else(|| {
1942                "conditional latent calibration requires the marginal conditioning block"
1943                    .to_string()
1944            })?;
1945            Arc::new(cal.apply(spec.z.view(), a_block.view())?)
1946        }
1947    };
1948    let z_train = z.as_ref();
1949    let pilot_baseline = pooled_probit_baseline(&spec.y, z_train, &spec.weights)?;
1950    let baseline = (
1951        bernoulli_marginal_slope_eta_from_probability(
1952            &spec.base_link,
1953            normal_cdf(pilot_baseline.0),
1954            "bernoulli marginal-slope baseline link inversion",
1955        )?,
1956        pilot_baseline.1 / probit_scale,
1957    );
1958
1959    // Score-warp basis construction is β-independent (identifiability is
1960    // provided by the smoothness-null-space drop on the basis transform,
1961    // not by a data-distribution moment anchor at the rigid-pilot η₀), so
1962    // the standard-normal and empirical latent-measure branches build the
1963    // same block. There is no row-weight pilot to thread into the basis;
1964    // the latent-measure split is enforced upstream via the empirical
1965    // intercept solve in `build_row_exact_context_with_stats`, not in the
1966    // deviation basis.
1967    // Score-warp basis is built first, then immediately reparameterised
1968    // against the parametric span (marginal + logslope columns at the n
1969    // training rows) so its column span is orthogonal to span(X_marginal,
1970    // X_logslope) by construction. This is the first half of the joint-
1971    // design identifiability invariant; the second half (link-deviation
1972    // orthogonalised against parametric + the now-reparameterised score-
1973    // warp) runs inside the link-deviation closure below. Together they
1974    // ensure `[X_marginal | X_logslope | Φ_score_warp · T_sw |
1975    // Φ_link_dev · T_lw]` has full numerical column rank, structurally
1976    // bounding `σ_min(joint H + S) ≥ λ_min(S) > 0` regardless of how β
1977    // drifts the linear predictor distribution during PIRLS.
1978    // Cross-block W-metric pilot. The joint penalised Hessian during PIRLS
1979    // uses the probit-style data Hessian row metric
1980    //
1981    //   W_pirls[i] = spec.weights[i] · φ(η_i)² / (μ_i·(1−μ_i))
1982    //
1983    // which is the canonical IRLS row weight. The cross-block
1984    // orthogonalisation below must use this metric (not uniform
1985    // spec.weights) so that `Aᵀ W C̃ = 0` holds in the same inner product
1986    // the joint Hessian sees — otherwise A and C̃ are merely Euclidean-
1987    // orthogonal, `Aᵀ W_pirls C̃ ≠ 0`, the joint Hessian carries a near-
1988    // null direction along the W-metric alias, and REML can drive the
1989    // flex block's λ small enough that the alias direction's joint
1990    // Hessian eigenvalue collapses. β then runs away along the alias
1991    // (manifest as `rho≈2.0`, constant `step_inf`, growing `beta_inf`
1992    // during PIRLS, and the inner solve hitting `inner_max_cycles`
1993    // without satisfying the KKT residual).
1994    //
1995    // Use the rigid pooled-probit pilot η for score-warp (its basis is
1996    // β-independent in z, so the rigid pilot suffices) and the one-GN-
1997    // stepped pilot η for link-deviation (its basis is evaluated at the
1998    // same eta_pilot used here, so the orthogonalisation metric matches
1999    // the basis evaluation point exactly). Both are β-independent so the
2000    // orthogonalisation remains a one-shot construction-time step.
2001    let rigid_pilot_eta = rigid_pooled_probit_pilot_eta(
2002        &spec.base_link,
2003        z_train,
2004        &spec.marginal_offset,
2005        &spec.logslope_offset,
2006        baseline.0,
2007        baseline.1,
2008        probit_scale,
2009    )?;
2010    let cross_block_pilot_w_score_warp =
2011        pilot_irls_hessian_row_metric_at_eta(&rigid_pilot_eta, &spec.weights);
2012
2013    // Absorbed Stage-1 influence columns (#461, design §3). When the workflow
2014    // chained a CTN Stage-1 into this marginal-slope fit, `spec.score_influence_
2015    // jacobian` carries the out-of-fold `J = ∂z/∂θ₁`; the realized leakage
2016    // directions `Z_infl = diag(s_f·β̂₀)·J` are residualised against the fitted
2017    // marginal+logslope target span and appended to the additive marginal-index
2018    // block as a fixed-ridge absorber, so the joint penalised solve makes the
2019    // (α,β) score orthogonal to the remaining nuisance span without letting the
2020    // absorber compete for identifiable β(x) signal. `None` ⇒ raw z, and the
2021    // free score_warp spline below is the x-free-column fallback. β̂₀(x_i) is
2022    // the rigid-pilot logslope `baseline.1 + logslope_offset[i]`; s_f =
2023    // probit_scale.
2024    let influence_columns = if let Some(jac) = spec
2025        .score_influence_jacobian
2026        .as_ref()
2027        .filter(|j| j.ncols() > 0)
2028    {
2029        let protected_design = DesignMatrix::hstack(vec![
2030            marginal_design.design.clone(),
2031            logslope_design.design.clone(),
2032        ])
2033        .map_err(|e| {
2034            format!(
2035                "bernoulli marginal-slope influence-block protected projection stack failed to concatenate marginal + logslope design: {e}"
2036            )
2037        })?;
2038        let protected_dense_for_proj = protected_design
2039            .try_to_dense_arc("bernoulli marginal-slope influence-block protected projection")?;
2040        let protected_dense = protected_dense_for_proj.as_ref();
2041        if jac.nrows() != protected_dense.nrows() {
2042            return Err(format!(
2043                "influence block: Jacobian has {} rows, protected design has {}",
2044                jac.nrows(),
2045                protected_dense.nrows()
2046            ));
2047        }
2048        // Z̃ = residualize(diag(s_f·β̂₀)·J) against the fitted target span in
2049        // the rigid-pilot W-metric.  For BMS the absorbed columns are installed
2050        // in the same additive predictor as the marginal surface; if we protect
2051        // only M, any component of Z_infl aligned with the logslope design G can
2052        // be assigned to the fixed-ridge absorber by the joint solve, erasing
2053        // genuine β(x) heterogeneity.  Projecting out [M | G] keeps the nuisance
2054        // absorber orthogonal to both parametric target surfaces while still
2055        // absorbing Stage-1 leakage directions outside that identifiable target
2056        // span. β̂₀(x_i) = baseline.1 + logslope_offset[i]; s_f = probit_scale;
2057        // W = the rigid-pilot PIRLS row metric.
2058        let rigid_logslope_at_rows = &spec.logslope_offset + baseline.1;
2059        let residualized = crate::marginal_slope_orthogonal::residualized_influence_block(
2060            jac,
2061            z_train,
2062            &rigid_logslope_at_rows,
2063            probit_scale,
2064            protected_dense.view(),
2065            &cross_block_pilot_w_score_warp,
2066        )?;
2067        Some(residualized)
2068    } else {
2069        None
2070    };
2071    let mut cross_block_warnings: Vec<CrossBlockIdentifiabilityWarning> = Vec::new();
2072    let score_warp_prepared = if let Some(cfg) = spec.score_warp.as_ref() {
2073        use super::deviation_runtime::ParametricAnchorBlock;
2074        let mut prepared = build_score_warp_deviation_block_from_seed(z_train, cfg)?;
2075        // `install_compiled_flex_block_into_runtime` now delegates
2076        // its math body to `identifiability::families::compiler::compile` (commit
2077        // 4e20b8dc8); the prior Phase-4a shadow compile here was a
2078        // duplicate of that internal call and has been removed.
2079        let outcome = install_compiled_flex_block_into_runtime(
2080            &mut prepared,
2081            z_train,
2082            cfg,
2083            &[
2084                (&marginal_design.design, ParametricAnchorBlock::Marginal),
2085                (&logslope_design.design, ParametricAnchorBlock::Logslope),
2086            ],
2087            &[],
2088            &cross_block_pilot_w_score_warp,
2089        )?;
2090        match outcome {
2091            FlexCompileOutcome::Reparameterised => Some(prepared),
2092            FlexCompileOutcome::FullyAliased { reason } => {
2093                // Record via the structured channel. Keep the original
2094                // (non-compiled) design so the unified audit sees score_warp_dev
2095                // and attributes the drop via dropped_columns (gauge_priority=80
2096                // is below marginal=150 / logslope=120, so RRQR correctly
2097                // demotes score_warp_dev when it aliases those blocks).
2098                cross_block_warnings.push(CrossBlockIdentifiabilityWarning {
2099                    candidate_label: "score_warp",
2100                    anchor_summary: "marginal+logslope".to_string(),
2101                    reason,
2102                });
2103                Some(prepared)
2104            }
2105        }
2106    } else {
2107        None
2108    };
2109    // Build the link-deviation block. The basis lives in η-space, and at
2110    // PIRLS time `runtime.design(η_current)` is re-evaluated at the
2111    // current β-dependent η, so the basis is genuinely β-dependent during
2112    // optimisation. The construction-time seed is used only for (a) knot
2113    // placement in η-space and (b) the cross-block identifiability check
2114    // that computes the basis-space transform `T` orthogonalising the
2115    // candidate against the parametric and score-warp anchors at training
2116    // rows.
2117    //
2118    // Using the rigid pooled probit pilot directly (`q0 = a₀·√(…) + s_f·
2119    // b₀·z`) is structurally degenerate: with zero per-row offsets it is
2120    // affine in z, so a degree-3 I-spline of `q0` spans the same column
2121    // space at training rows as a degree-3 I-spline of z, and the cross-
2122    // block check finds the candidate fully aliased by the score-warp
2123    // anchor even though at any non-rigid β the link-deviation carries
2124    // PC/age structure the score-warp cannot represent.
2125    //
2126    // Instead, seed both knot placement and the orthogonalisation pivot at
2127    // a non-rigid pilot η computed via one probit Gauss-Newton step from
2128    // the rigid pilot onto the full marginal design (see
2129    // `pilot_eta_for_link_dev_orthogonalisation`). The pilot is row-varying
2130    // in PCs/age and the resulting `T` drops only directions aliased
2131    // across all β. The score-warp basis at training rows is also threaded
2132    // in as a flex anchor when active so the kept directions are jointly
2133    // orthogonal to parametric ⊕ score-warp.
2134    let link_dev_prepared = if let Some(cfg) = spec.link_dev.as_ref() {
2135        let eta_pilot = pilot_eta_for_link_dev_orthogonalisation(
2136            &spec.base_link,
2137            &spec.y,
2138            z_train,
2139            &spec.weights,
2140            &marginal_design.design,
2141            &spec.marginal_offset,
2142            &spec.logslope_offset,
2143            baseline.0,
2144            baseline.1,
2145            probit_scale,
2146        )?;
2147        let link_dev_seed = padded_deviation_seed(&eta_pilot, 1.0, 0.5);
2148        let mut prepared = build_link_deviation_block_from_knots_design_seed_and_weights(
2149            &link_dev_seed,
2150            &eta_pilot,
2151            cfg,
2152        )?;
2153        // Cross-block identifiability for the link-deviation basis. The
2154        // anchor union covers BOTH possible aliasing channels:
2155        //
2156        //  - Parametric: location and logslope designs evaluated at the n
2157        //    training rows. Columns of `Φ_link_dev(q0)` that reproduce
2158        //    parametric features become null-direction targets in the
2159        //    joint penalised Hessian since `S_link_dev` has no mass on
2160        //    them.
2161        //
2162        //  - Score-warp (when active): the now-reparameterised score-warp
2163        //    basis, also evaluated at training rows. Both flex bases are
2164        //    cubic I-spline cubic combinations of an η-pilot scalar, and
2165        //    even with each block's own smoothness-null-space drop their
2166        //    column spans can still overlap inside the orthogonal
2167        //    complement of `{1, η_pilot}`.
2168        //
2169        // After the orthogonalisation, `[X_marginal | X_logslope |
2170        // Φ_score_warp · T_sw | Φ_link_dev · T_lw]` has full numerical
2171        // column rank at training rows, so `σ_min(joint H+S) ≥ λ_min(S)
2172        // > 0` for every β. This is the standard GAM `gam.side`
2173        // convention generalised to multi-anchor unions (mgcv applies it
2174        // sequentially across smooths sharing a covariate).
2175        // When `install_compiled_flex_block_into_runtime`
2176        // reparameterised the score-warp runtime against the parametric
2177        // anchor union (marginal + logslope), it installed an
2178        // `anchor_residual` and cached the training-row parametric
2179        // anchor matrix on the runtime. `runtime.design()` on a
2180        // residualised runtime returns the *raw* basis evaluation,
2181        // which `assert`s the caller hasn't conflated with the
2182        // reparameterised basis — we want the reparameterised one
2183        // here, so go through `design_at_training_with_residual` so
2184        // the cached anchor rows are folded in. For score-warp
2185        // configurations where reparameterisation was a no-op (no
2186        // residual installed) the same call falls back to the raw
2187        // `design()` path, so the residual-vs-no-residual branches
2188        // converge on the right matrix.
2189        let score_warp_anchor_design = score_warp_prepared
2190            .as_ref()
2191            .map(|sw| sw.runtime.design_at_training_with_residual(z_train))
2192            .transpose()?;
2193        use super::deviation_runtime::ParametricAnchorBlock;
2194        let parametric_anchors: [(&DesignMatrix, ParametricAnchorBlock); 2] = [
2195            (&marginal_design.design, ParametricAnchorBlock::Marginal),
2196            (&logslope_design.design, ParametricAnchorBlock::Logslope),
2197        ];
2198        let flex_anchor_slot: Option<&Array2<f64>> = score_warp_anchor_design.as_ref();
2199        let flex_anchors: Vec<&Array2<f64>> = flex_anchor_slot.into_iter().collect();
2200        // W-metric for link-deviation orthogonalisation: same IRLS-style
2201        // probit Hessian row weight as the score-warp path, but evaluated at
2202        // `eta_pilot` (the one-GN-stepped pilot at which the link-dev basis
2203        // itself is anchored).
2204        let cross_block_pilot_w_link_dev =
2205            pilot_irls_hessian_row_metric_at_eta(&eta_pilot, &spec.weights);
2206        let outcome = install_compiled_flex_block_into_runtime(
2207            &mut prepared,
2208            &eta_pilot,
2209            cfg,
2210            &parametric_anchors,
2211            &flex_anchors,
2212            &cross_block_pilot_w_link_dev,
2213        )?;
2214        match outcome {
2215            FlexCompileOutcome::Reparameterised => Some(prepared),
2216            FlexCompileOutcome::FullyAliased { reason } => {
2217                // Record via the structured channel. Keep the original
2218                // (non-compiled) design so the unified audit sees link_dev
2219                // and attributes the drop via dropped_columns (gauge_priority=60
2220                // is below all parametric blocks so RRQR correctly demotes
2221                // link_dev when it aliases marginal / logslope / score_warp).
2222                cross_block_warnings.push(CrossBlockIdentifiabilityWarning {
2223                    candidate_label: "link_deviation",
2224                    anchor_summary: "marginal+logslope+score_warp".to_string(),
2225                    reason,
2226                });
2227                Some(prepared)
2228            }
2229        }
2230    } else {
2231        None
2232    };
2233    let extra_rho0 = {
2234        let mut out = Vec::new();
2235        if let Some(ref prepared) = score_warp_prepared {
2236            out.extend(std::iter::repeat_n(0.0, prepared.block.penalties.len()));
2237        }
2238        if let Some(ref prepared) = link_dev_prepared {
2239            out.extend(std::iter::repeat_n(0.0, prepared.block.penalties.len()));
2240        }
2241        out
2242    };
2243    // Reduced-basis orthogonalisation of the logslope design through the BMS
2244    // family's OWN internal `logslope_design` geometry (robust cure for the
2245    // marginal↔logslope structural confound). Robustness is unconditional, so we
2246    // always reparameterize the logslope coordinate space to a full-rank reduced
2247    // basis `T` whose effective weighted columns are W-orthogonal to the marginal
2248    // span at the rigid pilot — removing the rank-soft confounded direction the
2249    // former pinned overlap ridge merely penalised. The transform is
2250    // β/ρ-independent (pilot geometry only), so it is a one-shot construction-
2251    // time map applied to every per-iteration logslope design inside
2252    // `build_blocks` / `make_family`, and inverted at fit-result assembly so the
2253    // reported logslope β is in the original basis. `None` ⇒ nothing to reduce
2254    // (no rank-soft confounded direction) ⇒ raw design used everywhere.
2255    let logslope_reduced_reparam: Option<ReducedLogslopeReparam> = build_reduced_logslope_reparam(
2256        &marginal_design,
2257        &logslope_design,
2258        z.as_ref(),
2259        &cross_block_pilot_w_score_warp,
2260        &spec.marginal_offset,
2261        &spec.logslope_offset,
2262        baseline.0,
2263        baseline.1,
2264        probit_scale,
2265    )?;
2266    // Apply the reduced reparam to a logslope `TermCollectionDesign`, or return
2267    // the raw design clone when the reparam is absent (flag off / nothing to
2268    // reduce). Used by both `build_blocks` and `make_family` so the family's
2269    // internal design, the block design, β width, jacobian, penalty, and the
2270    // `validate_exact_block_state_shapes` check all agree at the reduced width.
2271    let reduce_logslope_design =
2272        |logslope_design: &TermCollectionDesign| -> Result<TermCollectionDesign, String> {
2273            match logslope_reduced_reparam.as_ref() {
2274                Some(reparam) => reparameterize_logslope_design_reduced(logslope_design, reparam),
2275                None => Ok(logslope_design.clone()),
2276            }
2277        };
2278
2279    let marginal_penalty_count = marginal_design.penalties.len();
2280    let setup = joint_setup(
2281        data_view,
2282        &marginalspec_boot,
2283        &logslopespec_boot,
2284        marginal_penalty_count,
2285        logslope_design.penalties.len(),
2286        &extra_rho0,
2287        &effective_kappa_options,
2288    );
2289    let setup = if sigma_learnable {
2290        setup.with_auxiliary(
2291            Array1::from_vec(vec![initial_sigma.expect("learnable sigma seed").ln()]),
2292            Array1::from_vec(vec![0.01_f64.ln()]),
2293            Array1::from_vec(vec![5.0_f64.ln()]),
2294        )
2295    } else {
2296        setup
2297    };
2298    let final_sigma_cell = std::cell::Cell::new(initial_sigma);
2299    let exact_warm_start = RefCell::new(None::<CustomFamilyWarmStart>);
2300    let runaway_error = RefCell::new(None::<String>);
2301    // Outer ρ-cache β-seed staging slot. On a cache hit the spatial-joint
2302    // optimizer invokes `seed_inner_beta_fn` before the first eval at the
2303    // restored ρ: per-block column widths aren't known until the first
2304    // `build_blocks(rho, …)` runs, so we stash the flat β here and the eval
2305    // closures promote it into `exact_warm_start` (the slot the inner
2306    // PIRLS / Newton solve actually consumes) on their first invocation.
2307    let pending_beta_seed = RefCell::new(None::<Array1<f64>>);
2308    let hints = RefCell::new(ThetaHints::default());
2309    let score_warp_runtime = score_warp_prepared.as_ref().map(|p| p.runtime.clone());
2310    let link_dev_runtime = link_dev_prepared.as_ref().map(|p| p.runtime.clone());
2311
2312    let build_blocks = |rho: &Array1<f64>,
2313                        marginal_design: &TermCollectionDesign,
2314                        logslope_design: &TermCollectionDesign|
2315     -> Result<Vec<ParameterBlockSpec>, String> {
2316        let hints = hints.borrow();
2317        let mut cursor = 0usize;
2318        // Reduced-basis orthogonalisation: replace the per-iteration logslope
2319        // design with its full-rank reduced reparameterization `G·T` (flag ON);
2320        // a no-op clone when off. The reduced design carries the SAME number of
2321        // penalties (each S → Tᵀ S T), so the `rho_logslope` slice width below
2322        // is unchanged. Every consumer (marginal jacobian's c_i, logslope
2323        // blockspec design/β/penalty/jacobian) now agrees at the reduced width.
2324        let logslope_design_reduced = reduce_logslope_design(logslope_design)?;
2325        let logslope_design = &logslope_design_reduced;
2326        // Fixed #754/#461 ridges are appended inside
2327        // `marginal_penalties_with_influence_ridge` as physical penalties and
2328        // are excluded from `rho`; only genuine REML-learned smooth penalties
2329        // appear in the spatial joint setup.
2330        let rho_marginal = rho
2331            .slice(s![cursor..cursor + marginal_design.penalties.len()])
2332            .to_owned();
2333        cursor += marginal_design.penalties.len();
2334        let rho_logslope = rho
2335            .slice(s![cursor..cursor + logslope_design.penalties.len()])
2336            .to_owned();
2337        cursor += logslope_design.penalties.len();
2338        let p_m = marginal_design.design.ncols()
2339            + influence_columns.as_ref().map(|z| z.ncols()).unwrap_or(0);
2340        let mut blocks = vec![
2341            build_marginal_blockspec_bms(
2342                marginal_design,
2343                baseline.0,
2344                &spec.marginal_offset,
2345                rho_marginal,
2346                hints.marginal_beta.clone(),
2347                logslope_design,
2348                &spec.logslope_offset,
2349                baseline.1,
2350                p_m,
2351                influence_columns.as_ref(),
2352                influence_absorber_log_lambda(spec.z.len()),
2353            )?,
2354            build_logslope_blockspec_bms(
2355                logslope_design,
2356                baseline.1,
2357                &spec.logslope_offset,
2358                rho_logslope,
2359                hints.logslope_beta.clone(),
2360                marginal_design,
2361                &spec.marginal_offset,
2362                baseline.0,
2363                Arc::clone(&z),
2364                p_m,
2365                influence_columns.as_ref(),
2366            )?,
2367        ];
2368        push_deviation_aux_blockspecs(
2369            &mut blocks,
2370            rho,
2371            &mut cursor,
2372            score_warp_prepared.as_ref(),
2373            link_dev_prepared.as_ref(),
2374            hints.score_warp_beta.clone(),
2375            hints.link_dev_beta.clone(),
2376        )?;
2377        Ok(blocks)
2378    };
2379
2380    let intercept_warm_starts = new_intercept_warm_start_cache(y.len());
2381    let cell_moment_lru = new_cell_moment_lru_cache(policy);
2382    let cell_moment_cache_stats = new_cell_moment_cache_stats();
2383    let make_family = |marginal_design: &TermCollectionDesign,
2384                       logslope_design: &TermCollectionDesign,
2385                       sigma: Option<f64>|
2386     -> BernoulliMarginalSlopeFamily {
2387        // The kernel reads the marginal index from a matched (self.marginal_
2388        // design, β_m) pair. When the Stage-1 influence absorber is active the
2389        // marginal β is widened to [β_m; γ], so the family's marginal design
2390        // MUST be the widened [M | Z̃] for every per-row projection to slice
2391        // correctly (#461). With no absorber it is the raw design unchanged.
2392        let kernel_marginal_design = match influence_columns.as_ref() {
2393            Some(z_infl) => {
2394                let raw = marginal_design
2395                    .design
2396                    .try_to_dense_arc("make_family::widened-marginal")
2397                    .expect("dense marginal design for influence widening");
2398                let widened = widen_marginal_dense_with_influence(&raw, Some(z_infl))
2399                    .expect("widen marginal design with influence columns");
2400                DesignMatrix::Dense(gam_linalg::matrix::DenseDesignMatrix::from((*widened).clone()))
2401            }
2402            None => marginal_design.design.clone(),
2403        };
2404        // The family's row kernel reconstructs η_logslope = G·β_s and the
2405        // logslope Jacobian factor_i·G_i from this matched (logslope_design,
2406        // β_s) pair, so it MUST be the SAME reduced design `G·T` the block specs
2407        // fit against — otherwise β_s (reduced width) and the family design
2408        // (full width) desync. A no-op clone when the reparam is absent.
2409        let kernel_logslope_design = reduce_logslope_design(logslope_design)
2410            .expect("reduce logslope design for family construction")
2411            .design;
2412        BernoulliMarginalSlopeFamily {
2413            y: Arc::clone(&y),
2414            weights: Arc::clone(&weights),
2415            z: Arc::clone(&z),
2416            latent_measure: latent_measure.clone(),
2417            gaussian_frailty_sd: sigma,
2418            base_link: spec.base_link.clone(),
2419            marginal_design: kernel_marginal_design,
2420            logslope_design: kernel_logslope_design,
2421            score_warp: score_warp_runtime.clone(),
2422            link_dev: link_dev_runtime.clone(),
2423            policy: policy.clone(),
2424            cell_moment_lru: Arc::clone(&cell_moment_lru),
2425            cell_moment_cache_stats: Arc::clone(&cell_moment_cache_stats),
2426            intercept_warm_starts: Some(Arc::clone(&intercept_warm_starts)),
2427            auto_subsample_phase_counter: Arc::new(std::sync::atomic::AtomicUsize::new(0)),
2428            auto_subsample_last_rho: Arc::new(Mutex::new(None)),
2429        }
2430    };
2431
2432    let marginal_terms = spatial_length_scale_term_indices(&marginalspec_boot);
2433    let logslope_terms = spatial_length_scale_term_indices(&logslopespec_boot);
2434    let marginal_has_spatial = !marginal_terms.is_empty();
2435    let logslope_has_spatial = !logslope_terms.is_empty();
2436    let analytic_joint_derivatives_available =
2437        marginal_has_spatial || logslope_has_spatial || setup.log_kappa_dim() == 0;
2438    if setup.log_kappa_dim() > 0 && !analytic_joint_derivatives_available {
2439        return Err("exact bernoulli marginal-slope spatial optimization requires analytic joint psi derivatives"
2440                    .to_string());
2441    }
2442    let initial_rho = setup.theta0().slice(s![..setup.rho_dim()]).to_owned();
2443    let initial_blocks = build_blocks(&initial_rho, &marginal_design, &logslope_design)?;
2444    let initial_family = make_family(&marginal_design, &logslope_design, initial_sigma);
2445    let (joint_gradient, joint_hessian) =
2446        custom_family_outer_derivatives(&initial_family, &initial_blocks, options);
2447    let analytic_joint_gradient_available = analytic_joint_derivatives_available
2448        && matches!(joint_gradient, gam_problem::Derivative::Analytic);
2449    // Keep the analytic outer Hessian advertised at large scale. The
2450    // row-tensor terms below are represented through block-local
2451    // `HyperOperator`s and cached exact-Hessian workspaces, so ARC/trust-region
2452    // can consume exact HVPs without falling back to BFGS merely because the
2453    // realized problem is large.
2454    let analytic_joint_hessian_available =
2455        analytic_joint_derivatives_available && joint_hessian.is_analytic();
2456    let kappa_options_ref: &SpatialLengthScaleOptimizationOptions = &effective_kappa_options;
2457    let sigma_from_theta = |theta: &Array1<f64>| -> Option<f64> {
2458        if sigma_learnable {
2459            Some(theta[setup.rho_dim() + setup.log_kappa_dim()].exp())
2460        } else {
2461            initial_sigma
2462        }
2463    };
2464    let derivative_block_cache = RefCell::new(
2465        None::<(
2466            Array1<f64>,
2467            Arc<Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>>,
2468        )>,
2469    );
2470    let theta_matches = |left: &Array1<f64>, right: &Array1<f64>| -> bool {
2471        left.len() == right.len()
2472            && left
2473                .iter()
2474                .zip(right.iter())
2475                .all(|(lhs, rhs)| (*lhs - *rhs).abs() <= 1e-12 * (1.0 + lhs.abs().max(rhs.abs())))
2476    };
2477    let get_derivative_blocks = |theta: &Array1<f64>,
2478                                 specs: &[TermCollectionSpec],
2479                                 designs: &[TermCollectionDesign]|
2480     -> Result<
2481        Arc<Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>>,
2482        String,
2483    > {
2484        if let Some((cached_theta, cached_blocks)) = derivative_block_cache.borrow().as_ref()
2485            && theta_matches(cached_theta, theta)
2486        {
2487            return Ok(Arc::clone(cached_blocks));
2488        }
2489
2490        let built = |specs: &[TermCollectionSpec],
2491                     designs: &[TermCollectionDesign]|
2492         -> Result<
2493            Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>,
2494            String,
2495        > {
2496            let marginal_psi_derivs = if marginal_has_spatial {
2497                build_block_spatial_psi_derivatives(data_view, &specs[0], &designs[0])?.ok_or_else(
2498                    || {
2499                        "bernoulli marginal-slope: marginal block has spatial terms \
2500                         but spatial psi derivatives are unavailable"
2501                            .to_string()
2502                    },
2503                )?
2504            } else {
2505                Vec::new()
2506            };
2507            let logslope_psi_derivs = if logslope_has_spatial {
2508                build_block_spatial_psi_derivatives(data_view, &specs[1], &designs[1])?.ok_or_else(
2509                    || {
2510                        "bernoulli marginal-slope: logslope block has spatial terms \
2511                         but spatial psi derivatives are unavailable"
2512                            .to_string()
2513                    },
2514                )?
2515            } else {
2516                Vec::new()
2517            };
2518            let mut derivative_blocks = vec![marginal_psi_derivs, logslope_psi_derivs];
2519            if score_warp_runtime.is_some() {
2520                derivative_blocks.push(Vec::new());
2521            }
2522            if link_dev_runtime.is_some() {
2523                derivative_blocks.push(Vec::new());
2524            }
2525            if sigma_learnable {
2526                derivative_blocks
2527                    .last_mut()
2528                    .expect("bernoulli derivative block list is non-empty")
2529                    .push(crate::custom_family::CustomFamilyBlockPsiDerivative::new(
2530                        None,
2531                        Array2::zeros((0, 0)),
2532                        Array2::zeros((0, 0)),
2533                        None,
2534                        None,
2535                        None,
2536                        None,
2537                    ));
2538            }
2539            Ok(derivative_blocks)
2540        }(specs, designs)?;
2541        let built = Arc::new(built);
2542        derivative_block_cache.replace(Some((theta.clone(), Arc::clone(&built))));
2543        Ok(built)
2544    };
2545
2546    // Bernoulli marginal-slope is a multi-block family with β-dependent
2547    // joint Hessian: EFS/HybridEFS fixed-point structural invariant fails,
2548    // so we disable fixed-point at plan time rather than burning cycles on
2549    // a stalled first attempt that silently falls back.
2550    let outer_policy = {
2551        let psi_dim = setup.theta0().len() - setup.rho_dim();
2552        initial_family.outer_derivative_policy(&initial_blocks, psi_dim, options)
2553    };
2554    let exact_spatial_outer_tol = kappa_options_ref.rel_tol.max(EXACT_SPATIAL_OUTER_TOL_FLOOR);
2555    let solved = optimize_spatial_length_scale_exact_joint(
2556        data_view,
2557        &[marginalspec_boot.clone(), logslopespec_boot.clone()],
2558        &[marginal_terms.clone(), logslope_terms.clone()],
2559        kappa_options_ref,
2560        &setup,
2561        gam_solve::seeding::SeedRiskProfile::GeneralizedLinear,
2562        analytic_joint_gradient_available,
2563        analytic_joint_hessian_available,
2564        true,
2565        None,
2566        outer_policy,
2567        |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
2568            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
2569                return Err(err);
2570            }
2571            assert_eq!(
2572                specs.len(),
2573                designs.len(),
2574                "spatial joint optimizer must supply one spec per design",
2575            );
2576            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
2577            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
2578            let sigma = sigma_from_theta(theta);
2579            final_sigma_cell.set(sigma);
2580            let family = make_family(&designs[0], &designs[1], sigma);
2581            let fit = inner_fit(&family, &blocks, options)?;
2582            if let Some(block) = fit.block_states.first()
2583                && let Some(err) = bernoulli_marginal_slope_runaway_error_from_beta(
2584                    block.beta.view(),
2585                    &designs[0],
2586                    &specs[0],
2587                    fit.outer_converged,
2588                    "final fit",
2589                )
2590            {
2591                runaway_error.replace(Some(err.clone()));
2592                return Err(err);
2593            }
2594            let mut hints_mut = hints.borrow_mut();
2595            let mut bidx = 0usize;
2596            if let Some(block) = fit.block_states.get(bidx) {
2597                hints_mut.marginal_beta = Some(block.beta.clone());
2598            }
2599            bidx += 1;
2600            if let Some(block) = fit.block_states.get(bidx) {
2601                hints_mut.logslope_beta = Some(block.beta.clone());
2602            }
2603            bidx += 1;
2604            if score_warp_prepared.is_some() {
2605                if let Some(block) = fit.block_states.get(bidx) {
2606                    hints_mut.score_warp_beta = Some(block.beta.clone());
2607                }
2608                bidx += 1;
2609            }
2610            if link_dev_prepared.is_some()
2611                && let Some(block) = fit.block_states.get(bidx)
2612            {
2613                hints_mut.link_dev_beta = Some(block.beta.clone());
2614            }
2615            Ok(fit)
2616        },
2617        |theta,
2618         specs: &[TermCollectionSpec],
2619         designs: &[TermCollectionDesign],
2620         eval_mode,
2621         row_set: &crate::row_kernel::RowSet| {
2622            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
2623                return Err(err);
2624            }
2625            use gam_problem::EvalMode;
2626            // One-shot row-measure waypoint. This closure runs on EVERY outer
2627            // objective evaluation (value/gradient/Hessian probes, line-search
2628            // cost-only probes, EFS evals), so an unconditional per-eval line
2629            // floods the biobank fit log with thousands of near-identical
2630            // entries. The bridge already emits a timed `[STAGE] outer eval`
2631            // marker per eval; this one records the row-measure exactly once.
2632            static BMS_OUTER_EVAL_ROWSET_LOGGED: std::sync::Once = std::sync::Once::new();
2633            BMS_OUTER_EVAL_ROWSET_LOGGED.call_once(|| {
2634                let row_set_rows = match row_set {
2635                    crate::row_kernel::RowSet::All => spec.y.len(),
2636                    crate::row_kernel::RowSet::Subsample { rows, .. } => rows.len(),
2637                };
2638                log::debug!(
2639                    "[BMS exact outer eval] mode={eval_mode:?} row_set_rows={row_set_rows}"
2640                );
2641            });
2642            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
2643            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
2644            // Promote a staged β seed (deposited by the outer ρ-cache hit
2645            // before any eval ran) into the family warm-start slot now that
2646            // we know the per-block widths from the freshly built blocks.
2647            if let Some(beta_seed) = pending_beta_seed.borrow_mut().take() {
2648                let widths: Vec<usize> = blocks.iter().map(|b| b.design.ncols()).collect();
2649                match CustomFamilyWarmStart::from_cached_beta(&widths, &beta_seed) {
2650                    Ok(ws) => {
2651                        exact_warm_start.replace(Some(ws));
2652                    }
2653                    Err(e) => {
2654                        log::warn!(
2655                            "[BMS] outer ρ-cache β-warm-start rejected: {e}; falling back to cold β"
2656                        );
2657                    }
2658                }
2659            }
2660            let sigma = sigma_from_theta(theta);
2661            final_sigma_cell.set(sigma);
2662            let family = make_family(&designs[0], &designs[1], sigma);
2663            let derivative_blocks = get_derivative_blocks(theta, specs, designs)?;
2664            // Downgrade to ValueAndGradient when the caller asks for a
2665            // Hessian we can't provide; preserve ValueOnly probes for
2666            // line-search cost-only evaluation.
2667            let effective_mode = match eval_mode {
2668                EvalMode::ValueGradientHessian if !analytic_joint_hessian_available => {
2669                    EvalMode::ValueAndGradient
2670                }
2671                other => other,
2672            };
2673            let mut eval_options =
2674                joint_hyper_options_for_outer_tolerance(options, exact_spatial_outer_tol);
2675            if let crate::row_kernel::RowSet::Subsample { rows, n_full } = row_set {
2676                let subsample = crate::outer_subsample::OuterScoreSubsample::from_weighted_rows(
2677                    rows.as_ref().clone(),
2678                    *n_full,
2679                    0,
2680                );
2681                eval_options.outer_score_subsample = Some(Arc::new(subsample));
2682                eval_options.auto_outer_subsample = false;
2683            }
2684            let eval = evaluate_custom_family_joint_hyper_shared(
2685                &family,
2686                &blocks,
2687                &eval_options,
2688                &rho,
2689                derivative_blocks,
2690                exact_warm_start.borrow().as_ref(),
2691                effective_mode,
2692            )?;
2693            if let Some(err) = bernoulli_marginal_slope_runaway_error(
2694                &eval.warm_start,
2695                &designs[0],
2696                &specs[0],
2697                eval.inner_converged,
2698                "exact outer evaluation",
2699            ) {
2700                runaway_error.replace(Some(err.clone()));
2701                return Err(err);
2702            }
2703            exact_warm_start.replace(Some(eval.warm_start.clone()));
2704            if !eval.inner_converged {
2705                return Err(
2706                    "exact bernoulli marginal-slope inner solve did not converge".to_string(),
2707                );
2708            }
2709            if matches!(eval_mode, EvalMode::ValueGradientHessian)
2710                && analytic_joint_hessian_available
2711                && !eval.outer_hessian.is_analytic()
2712            {
2713                return Err("exact bernoulli marginal-slope joint [rho, psi] objective did not return an outer Hessian"
2714                            .to_string());
2715            }
2716            Ok((eval.objective, eval.gradient, eval.outer_hessian))
2717        },
2718        |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
2719            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
2720                return Err(err);
2721            }
2722            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
2723            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
2724            if let Some(beta_seed) = pending_beta_seed.borrow_mut().take() {
2725                let widths: Vec<usize> = blocks.iter().map(|b| b.design.ncols()).collect();
2726                match CustomFamilyWarmStart::from_cached_beta(&widths, &beta_seed) {
2727                    Ok(ws) => {
2728                        exact_warm_start.replace(Some(ws));
2729                    }
2730                    Err(e) => {
2731                        log::warn!(
2732                            "[BMS] outer ρ-cache β-warm-start rejected (efs): {e}; falling back to cold β"
2733                        );
2734                    }
2735                }
2736            }
2737            let sigma = sigma_from_theta(theta);
2738            final_sigma_cell.set(sigma);
2739            let family = make_family(&designs[0], &designs[1], sigma);
2740            let derivative_blocks = get_derivative_blocks(theta, specs, designs)?;
2741            let eval = evaluate_custom_family_joint_hyper_efs_shared(
2742                &family,
2743                &blocks,
2744                &joint_hyper_options_for_outer_tolerance(options, exact_spatial_outer_tol),
2745                &rho,
2746                derivative_blocks,
2747                exact_warm_start.borrow().as_ref(),
2748            )?;
2749            if let Some(err) = bernoulli_marginal_slope_runaway_error(
2750                &eval.warm_start,
2751                &designs[0],
2752                &specs[0],
2753                eval.inner_converged,
2754                "EFS outer evaluation",
2755            ) {
2756                runaway_error.replace(Some(err.clone()));
2757                return Err(err);
2758            }
2759            exact_warm_start.replace(Some(eval.warm_start.clone()));
2760            if !eval.inner_converged {
2761                return Err(
2762                    "exact bernoulli marginal-slope EFS inner solve did not converge".to_string(),
2763                );
2764            }
2765            Ok(eval.efs_eval)
2766        },
2767        crate::marginal_slope_shared::make_beta_seed_validator(&pending_beta_seed),
2768    )?;
2769
2770    let mut resolved_specs = solved.resolved_specs;
2771    let mut designs = solved.designs;
2772    // Reduced-basis round-trip (robust cure). When the logslope design was
2773    // orthogonalised to a reduced basis `G·T`, the fitted logslope coefficient
2774    // `β'` lives in the reduced coordinates (width `r`). The returned
2775    // `logslope_design` / `logslopespec_resolved` are the ORIGINAL full-width
2776    // basis (prediction rebuilds full `G` from the resolved spec), so map the
2777    // reported logslope coefficients back to the original basis `β_logslope =
2778    // T·β'` (predictor-identical: `G·(T·β') = (G·T)·β'`). The marginal block,
2779    // aux blocks, and the internal reduced-width flat β/geometry are untouched;
2780    // only the per-block reported logslope coefficients (blocks[1] and
2781    // block_states[1]) — which prediction/reporting consume against the full
2782    // design — are lifted to full width.
2783    let mut solved_fit = solved.fit;
2784    if let Some(reparam) = logslope_reduced_reparam.as_ref() {
2785        let r = reparam.reduced_cols();
2786        if let Some(block) = solved_fit.blocks.get_mut(1)
2787            && block.beta.len() == r
2788        {
2789            block.beta = reparam.recover_original_logslope_beta(&block.beta)?;
2790        }
2791        if let Some(state) = solved_fit.block_states.get_mut(1)
2792            && state.beta.len() == r
2793        {
2794            state.beta = reparam.recover_original_logslope_beta(&state.beta)?;
2795        }
2796    }
2797    // #905 GENERATED-REGRESSOR (Murphy–Topel) SEAM. When the conditional
2798    // location-scale gate fired, the slope fit above treated the calibrated
2799    // score `ζ = (z − m̂(C))/√v̂(C)` as KNOWN, so `solved_fit.beta_covariance()`
2800    // is the naive second-stage covariance `V_β^naive = H_β⁻¹` that ignores the
2801    // first-stage estimation error in `θ₁ = (mean_coeffs, var_coeffs)`. The
2802    // honest two-stage covariance is
2803    //   `V_β = V_β^naive + (H_β⁻¹ G) V₁ (H_β⁻¹ G)ᵀ`,  `G = ∂(score_β)/∂θ₁`.
2804    // The closed-form first-stage covariance `V₁` and the per-row chain-rule
2805    // sensitivity `∂ζ_i/∂θ₁` are computed and stored on the calibration at fit
2806    // time (see `LatentZConditionalCalibration::{theta1_covariance,
2807    // zeta_theta1_jacobian_row, generated_regressor_term}`), so the correction
2808    // is consumable wherever the slope information `G` (the per-row
2809    // `∂score_β/∂ζ_i` of the marginal/logslope blocks) is available.
2810    //
2811    // ASSEMBLY READY, ONE ENGINE QUANTITY OUTSTANDING. The full correction is
2812    // assembled by `LatentZConditionalCalibration::generated_regressor_correction`
2813    // (mod.rs): given the per-row reduced-frame slope-score sensitivity to the
2814    // calibrated score `s_i = ∂score_β,i/∂ζ_i` (an `n × p_β` matrix), it
2815    //   1. builds `J_zeta` row-by-row via `zeta_theta1_jacobian_row` (exact-zero
2816    //      on floored rows, so `G`'s support is the gate-fired rows),
2817    //   2. accumulates `G = Σ_i s_i ⊗ (∂ζ_i/∂θ₁)` (`p_β × dim θ₁`),
2818    //   3. forms `Vb·G = solved_fit.beta_covariance()·G` (the naive reduced-frame
2819    //      covariance IS `H_β⁻¹`, so `H_β⁻¹ G = Vb·G`), and
2820    //   4. returns `(Vb·G)·V₁·(Vb·G)ᵀ` (PSD ⇒ corrected slope SE strictly ≥
2821    //      naive whenever the gate fires).
2822    // So `V₁`, `∂ζ/∂θ₁`, the `Vb` frame, and the whole congruence are all
2823    // available HERE — the only quantity the seam still lacks is `s_i`.
2824    //
2825    // `s_i = ∂²ℓ_i/∂β∂ζ_i = J_iᵀ·(∂²ℓ_i/∂η_i∂ζ_i)` is the mixed `(β, ζ)` second
2826    // derivative of the row kernel contracted through the slope Jacobian `J_i`.
2827    // The conditional location-scale gate ALWAYS selects the rigid standard-normal
2828    // measure (`build_latent_measure_with_geometry` returns
2829    // `LatentMeasureKind::StandardNormal` for `ConditionalLocationScale`), so the
2830    // per-row kernel is the closed-form `rigid_standard_normal` tower
2831    // `η = q·c(g) + g·(s·ζ)`. The mixed 2-vector `∂²ℓ_i/∂(q,g)∂ζ_i` is read off the
2832    // SAME `Tower4` the value/grad/Hessian path uses (#932 row-jet machinery) by
2833    // seeding `ζ` as a third jet axis
2834    // (`rigid_standard_normal_mixed_z_sensitivity`); contracting it through the
2835    // marginal+logslope design rows (the `J_iᵀ` the row kernel exposes via
2836    // `jacobian_transpose_action`) yields `s_i` in the SAME reduced frame as
2837    // `covariance_conditional` (`rigid_standard_normal_score_zeta_sensitivity`).
2838    let (latent_z_rank_int_calibration, latent_z_conditional_calibration) =
2839        match latent_z_calibration {
2840            LatentMeasureCalibration::None => (None, None),
2841            LatentMeasureCalibration::RankInverseNormal(cal) => (Some(cal), None),
2842            LatentMeasureCalibration::ConditionalLocationScale(cal) => (None, Some(cal)),
2843        };
2844    // #905/#1028: apply the Murphy–Topel generated-regressor correction now that
2845    // `s_i` is available. `covariance_conditional` (Vb) and `covariance_corrected`
2846    // (Vp) are in the reduced logslope frame (`p_m + r`), exactly the frame
2847    // `s_i`'s reduced-logslope contraction lives in, so add the PSD term
2848    // `(Vb·G)·V₁·(Vb·G)ᵀ` to each. Applied only for the canonical (non-flex)
2849    // standard-normal kernel: the rigid tower carries no score_warp/link_dev
2850    // z-dependence, so when aux deviation blocks widen β beyond `p_m + r` the
2851    // correction's deviation columns are not yet derived and the term is skipped
2852    // (the conditional gate's intended kernel has no such blocks).
2853    if let Some(cal) = latent_z_conditional_calibration.as_ref()
2854        && let Some(vb) = solved_fit.covariance_conditional.clone()
2855    {
2856        let p_beta = vb.nrows();
2857        let marginal_dense = marginal_design
2858            .design
2859            .try_to_dense_arc("bms generated-regressor marginal design")?;
2860        let logslope_reduced = reduce_logslope_design(&logslope_design)?;
2861        let logslope_reduced_dense = logslope_reduced
2862            .design
2863            .try_to_dense_arc("bms generated-regressor reduced logslope design")?;
2864        let p_m = marginal_dense.ncols();
2865        let r = logslope_reduced_dense.ncols();
2866        if p_beta != vb.ncols() {
2867            return Err(format!(
2868                "bms generated-regressor: covariance_conditional must be square, got {}×{}",
2869                vb.nrows(),
2870                vb.ncols()
2871            ));
2872        }
2873        // Skip when aux deviation (score_warp / link_dev) blocks are present:
2874        // β is wider than the marginal+reduced-logslope frame the rigid kernel's
2875        // z-channel covers. Equality ⇒ the canonical non-flex gate kernel.
2876        if p_beta == p_m + r {
2877            let marginal_eta = &solved_fit.block_states[0].eta;
2878            let slope_eta = &solved_fit.block_states[1].eta;
2879            let probit_scale = probit_frailty_scale(final_sigma_cell.get());
2880            let s = rigid_standard_normal_score_zeta_sensitivity(
2881                &spec.base_link,
2882                marginal_eta,
2883                slope_eta,
2884                z.as_ref(),
2885                y.as_ref(),
2886                weights.as_ref(),
2887                probit_scale,
2888                marginal_dense.view(),
2889                logslope_reduced_dense.view(),
2890                p_beta,
2891            )?;
2892            // `generated_regressor_correction` re-derives `∂ζ_i/∂θ₁` via
2893            // `zeta_theta1_jacobian_row(z_i, a_row)`, which expects the RAW
2894            // normalized latent score `z_i` (it recomputes `ζ_i = (z_i − m)/√v`
2895            // internally), and conditions on the marginal-index span
2896            // `a(C_i)` = the RAW marginal design rows (the basis the gate was fit
2897            // on). Feed `spec.z` (the standardized raw score, NOT the calibrated
2898            // ζ the kernel consumed) and the raw marginal dense design.
2899            let correction = cal.generated_regressor_correction(
2900                s.view(),
2901                spec.z.view(),
2902                marginal_dense.view(),
2903                vb.view(),
2904            )?;
2905            if let Some(cov) = solved_fit.covariance_conditional.as_mut() {
2906                *cov = &*cov + &correction;
2907            }
2908            if let Some(cov) = solved_fit.covariance_corrected.as_mut() {
2909                *cov = &*cov + &correction;
2910            }
2911            log::info!(
2912                "[BMS latent-z] Murphy–Topel generated-regressor SE correction applied: \
2913                 p_beta={p_beta} theta1_dim={} max_diag_inflation={:.3e}",
2914                cal.theta1_dim(),
2915                (0..p_beta)
2916                    .map(|i| correction[[i, i]])
2917                    .fold(0.0_f64, f64::max),
2918            );
2919        } else {
2920            log::info!(
2921                "[BMS latent-z] Murphy–Topel generated-regressor SE correction skipped: \
2922                 aux deviation blocks present (p_beta={p_beta} > marginal({p_m})+logslope({r})); \
2923                 rigid-kernel z-channel does not yet cover score_warp/link_dev deviations"
2924            );
2925        }
2926    }
2927    // #461: PREDICT SEAM — when the Stage-1 influence absorber is active
2928    // (spec.score_influence_jacobian.is_some()), `fit.block_states[0].beta` is
2929    // the WIDENED marginal coefficient `[β_m; γ]` (length p_m + p₁), but
2930    // `marginal_design` below is the RAW term-collection design (p_m columns):
2931    // the absorbed influence columns Z̃_infl are a TRAINING-only leakage
2932    // absorber and do NOT exist at predict rows (no Stage-1 fold there). The
2933    // orthogonalized β̂_m is a property of the training fit, so prediction must
2934    // use ONLY the first p_m entries of block_states[0].beta against this raw
2935    // marginal_design and DROP the trailing γ. The model-payload / predict
2936    // builder (src/main.rs run_fit_bernoulli_marginal_slope → inference) owns
2937    // that truncation; it must record p_m (= marginal_design.design.ncols())
2938    // and slice the persisted marginal β to it. Survival mirrors this seam.
2939    Ok(BernoulliMarginalSlopeFitResult {
2940        fit: solved_fit,
2941        marginalspec_resolved: resolved_specs.remove(0),
2942        logslopespec_resolved: resolved_specs.remove(0),
2943        marginal_design: designs.remove(0),
2944        logslope_design: designs.remove(0),
2945        baseline_marginal: baseline.0,
2946        baseline_logslope: baseline.1,
2947        z_normalization,
2948        latent_measure,
2949        score_warp_runtime,
2950        link_dev_runtime,
2951        gaussian_frailty_sd: final_sigma_cell.get(),
2952        cross_block_warnings,
2953        latent_z_rank_int_calibration,
2954        latent_z_conditional_calibration,
2955    })
2956}