Skip to main content

gam_solve/
psi_gram_tensor.rs

1//! Certified Chebyshev-in-ψ Gram tensor: n-independent design-moving trials
2//! (#1033 item b).
3//!
4//! ## Why
5//!
6//! When a design-moving hyperparameter ψ (= log κ for the radial families) is
7//! searched by the outer loop, every trial today rebuilds the n×k design and
8//! re-forms XᵀWX — an O(n·k) + O(n·k²) pass per trial. But along the trial
9//! window every design entry `X(ψ)[i, j]` is an ANALYTIC function of ψ on a
10//! compact interval (Matérn channels depend on (r, ℓ) only through κr and
11//! κ-power prefactors; Duchon power blocks are ψ-free; partial-fraction
12//! coefficients are analytic scalars), so both the design and its Gaussian
13//! sufficient statistics admit geometrically-convergent Chebyshev expansions
14//!
15//! ```text
16//!   X(ψ) = Σ_{d=0}^{D} X_d · T_d(ψ̃),     ψ̃ = affine map of ψ to [−1, 1],
17//! ```
18//!
19//! with n×k coefficient slabs `X_d` computed ONCE from D+1 exact design
20//! evaluations at Chebyshev nodes (a first-kind DCT). The design coefficients
21//! certify analyticity, while the shipped runtime series is fit directly to the
22//! exact node sufficient statistics `G(ψ_i)=X(ψ_i)ᵀ W X(ψ_i)` and
23//! `c(ψ_i)=X(ψ_i)ᵀ W z`. Interpolating the sufficient statistics directly avoids
24//! the extra product-truncation residual from forming `Σ_d,e X_dᵀWX_e T_dT_e`,
25//! which a weakly penalized radial solve can amplify into visible β̂ drift.
26//! Every subsequent trial is n-free:
27//!
28//! ```text
29//!   XᵀWX(ψ) = Σ_d T_d(ψ̃) G_d          O(D k²)
30//!   XᵀWz(ψ) = Σ_d T_d(ψ̃) c_d          O(D k)
31//!   ∂/∂ψ (XᵀWX) = Σ_d T_d′(ψ̃) G_d     O(D k²)
32//! ```
33//!
34//! The ψ-gradient comes from the SAME representation as the value — one
35//! source of truth, structurally immune to the objective↔gradient desync
36//! class. `T_d′(ψ̃) = d·U_{d−1}(ψ̃) · dψ̃/dψ` is closed-form.
37//!
38//! ## Certification, not approximation-by-fiat
39//!
40//! Same discipline as [`gam_terms::basis::radial_profile`]: [`PsiGramTensor::build`]
41//! returns `None` (callers keep the exact per-trial path) unless BOTH
42//! 1. the Chebyshev coefficient tail of the EXPANDED DESIGN decays below
43//!    [`PSI_GRAM_CERT_RTOL`] of the design scale (geometric-decay certificate
44//!    for analytic interpolands, with node-count escalation), and
45//! 2. deterministic off-node spot checks of the ASSEMBLED Gram against an
46//!    exactly rebuilt Gram agree to [`PSI_GRAM_SPOT_RTOL`].
47//!
48//! Trials outside `[psi_lo, psi_hi]` are the caller's signal to fall back to
49//! the exact path ([`PsiGramTensor::contains`]).
50
51use ndarray::{Array1, Array2, ArrayView1};
52
53/// Relative ceiling on the per-column Chebyshev coefficient tail (#1216).
54///
55/// This is a cheap NECESSARY-CONDITION pre-filter, not the accuracy gate: the
56/// authoritative accuracy gate is the off-node `spot_check` on the ASSEMBLED
57/// Gram ([`PSI_GRAM_SPOT_RTOL`]). On the WIDE STANDARDIZED geometry default 1-D
58/// fits use (#1215) the realized radial design needs the deeper ladder below to
59/// drive the tail beneath the beta-invariance bar. Keep this as a necessary
60/// pre-filter, not the final beta oracle: shallow 65-node tensors were fine for
61/// cost-only gates, but the weakly penalized radial solve amplified their
62/// residual into visible beta-hat drift across the reduced-basis rotation. A
63/// genuinely non-analytic design (a true kink) still refuses here or at the
64/// assembled-Gram spot check.
65pub const PSI_GRAM_CERT_RTOL: f64 = 1.0e-9;
66
67/// Relative agreement required at the off-node Gram spot checks.
68pub const PSI_GRAM_SPOT_RTOL: f64 = 1.0e-10;
69
70/// Node-count escalation ladder for the expansion build (degree = nodes − 1).
71///
72/// The top rung sizes to WIDE trial windows: Chebyshev coefficients of the
73/// Matérn-type channels decay like Bessel `I_d(σ)` with `σ ≈ s_max·halfwidth`
74/// (s = κr), which only drops below the 1e-12 tail tolerance for `d ≳ 2σ` —
75/// e.g. σ ≈ 9 (s_max ≈ 8, ±1.1 window) needs degree ≳ 40, so 33 nodes refuse
76/// and 65 certify. Node counts stay trivially cheap (one design eval each).
77///
78/// On the WIDE STANDARDIZED geometry default 1-D fits use (#1215) the tail
79/// decays cleanly but GEOMETRICALLY-slowly: measured per-column worst tail rel
80/// is ~3.2e-8 at m=33 and ~2.3e-11 at m=65 (a clean ~1300×/doubling decay, NOT a
81/// floor). The old 65-node acceptance was fine for the cost lane but not for the
82/// beta-hat soundness gate: the inner penalized solve `β̂ = (G+λS)⁻¹r`
83/// amplifies Gram residuals by the radial-kernel conditioning, especially after
84/// the production skip was relaxed to cross a reduced-basis rotation. Start at
85/// 513 nodes so the production gate no longer accepts the shallower tensors that
86/// pass the Gram spot check but still move the weakly-penalized beta solve.
87pub const PSI_GRAM_NODE_LADDER: [usize; 1] = [513];
88
89/// Number of deterministic off-node spot-check ψ values.
90pub const PSI_GRAM_SPOT_POINTS: usize = 3;
91
92/// Rank-revealing relative eigenvalue cutoff for the reduced-basis (range)
93/// projector witness [`PsiGramTensor::reduced_basis_equal`] (#1264). An
94/// eigendirection of the conditioned Gram `XᵀWX(ψ)` is counted in the range
95/// (reduced) basis when its eigenvalue exceeds `PSI_GRAM_SKIP_RANK_RTOL · λ_max`.
96/// Sized to match the inner solve's effective rank-revealing scale on the
97/// standardized radial-kernel Gram, whose conditioning sweeps several orders of
98/// magnitude across the ψ-window; a directly-below-cutoff direction is exactly
99/// the one whose inclusion flips with ψ and silently rotates the frozen reduced
100/// basis, which this witness must catch.
101pub const PSI_GRAM_SKIP_RANK_RTOL: f64 = 1.0e-10;
102
103/// Max-norm tolerance on the range-PROJECTOR agreement between the pinning ψ and
104/// the candidate ψ in [`PsiGramTensor::reduced_basis_equal`] (#1264). The
105/// orthogonal projector onto the reduced subspace is gauge-invariant and O(1) in
106/// scale, so a tight absolute tolerance certifies the two reduced bases span the
107/// SAME subspace. A subspace that has measurably rotated (the basis the frozen
108/// fast-path surface would mis-pair with a re-keyed Gram) exceeds this by orders
109/// of magnitude, so it refuses the skip well before the ~1e-6 β̂ bar is at risk.
110pub const PSI_GRAM_SKIP_PROJ_ATOL: f64 = 1.0e-7;
111
112/// Certified Chebyshev-in-ψ expansion of a design-moving Gram (#1033b).
113///
114/// Holds the one-time Chebyshev sufficient-statistic series; every per-trial
115/// accessor is O(Dk²) or cheaper and never touches n rows again.
116pub struct PsiGramTensor {
117    psi_lo: f64,
118    psi_hi: f64,
119    /// Certified gradient window over which the ANALYTIC ψ-derivative
120    /// `dgram_dpsi` reproduces the exact design derivative. The gradient lane
121    /// rides the value-lane off-node certificate [`PSI_GRAM_SPOT_RTOL`]
122    /// (#1033b gradient lane). For the #1033
123    /// sufficient-statistic outer loop this must cover the full optimizer
124    /// window; otherwise callers do not arm the n-free kappa search.
125    ///
126    /// The value reconstruction `gram_at` is certified over the FULL window
127    /// (`T_d ≤ 1` everywhere), but the derivative reconstruction amplifies the
128    /// coefficient-tail error by `T_d′ ∼ d²`. The n-free kappa search is armed
129    /// only when endpoint-aware checks certify this whole interval.
130    grad_psi_lo: f64,
131    grad_psi_hi: f64,
132    /// Number of Chebyshev coefficients (degree + 1).
133    n_coeff: usize,
134    k: usize,
135    /// Chebyshev coefficients of `X(ψ)ᵀ W X(ψ)`, obtained by a first-kind DCT of
136    /// the exact node sufficient statistics. This keeps the per-trial path to a
137    /// single O(Dk²) series and avoids product-truncation drift in β̂.
138    gram: Vec<Array2<f64>>,
139    /// Chebyshev coefficients of `X(ψ)ᵀ W z`.
140    rhs: Vec<Array1<f64>>,
141    /// `zᵀWz` — ψ-free, captured at build so the Gaussian sufficient-statistic
142    /// triple can be assembled per trial without any row access.
143    zt_w_z: f64,
144}
145
146/// One ladder rung's outcome: a hard evaluation failure aborts the whole
147/// build (no larger rung can fix a non-finite design) and carries the reason,
148/// an uncertified tail escalates to the next rung, and a candidate proceeds to
149/// the spot check.
150enum BuildOutcome {
151    EvalFailed(String),
152    TailNotCertified,
153    Candidate(PsiGramTensor),
154}
155
156/// Chebyshev values `T_0..T_{n−1}` at `x ∈ [−1, 1]`.
157fn cheb_t(x: f64, n: usize) -> Vec<f64> {
158    let mut t = vec![0.0; n];
159    if n > 0 {
160        t[0] = 1.0;
161    }
162    if n > 1 {
163        t[1] = x;
164    }
165    for d in 2..n {
166        t[d] = 2.0 * x * t[d - 1] - t[d - 2];
167    }
168    t
169}
170
171/// Chebyshev derivative values `T_0′..T_{n−1}′` at `x ∈ [−1, 1]` in the
172/// MAPPED coordinate (multiply by `dx/dψ` for the ψ-derivative):
173/// `T_d′ = d · U_{d−1}` with the Chebyshev-U recurrence.
174fn cheb_t_prime(x: f64, n: usize) -> Vec<f64> {
175    let mut u = vec![0.0; n.max(1)];
176    // U_0 = 1, U_1 = 2x, U_d = 2x U_{d−1} − U_{d−2}.
177    if !u.is_empty() {
178        u[0] = 1.0;
179    }
180    if n > 1 {
181        u[1] = 2.0 * x;
182    }
183    for d in 2..n {
184        u[d] = 2.0 * x * u[d - 1] - u[d - 2];
185    }
186    let mut tp = vec![0.0; n];
187    for d in 1..n {
188        tp[d] = d as f64 * u[d - 1];
189    }
190    tp
191}
192
193/// Chebyshev SECOND-derivative values `T_0″..T_{n−1}″` at `x ∈ [−1, 1]` in the
194/// MAPPED coordinate (multiply by `(dx/dψ)²` for the ψ-second-derivative).
195///
196/// Differentiating the value recurrence `T_d = 2x T_{d−1} − T_{d−2}` twice in
197/// `x` gives a singularity-free three-term recurrence in lock-step with `cheb_t`
198/// / `cheb_t_prime`:
199///   `T_d′  = 2 T_{d−1} + 2x T_{d−1}′ − T_{d−2}′`,
200///   `T_d″  = 4 T_{d−1}′ + 2x T_{d−1}″ − T_{d−2}″`,
201/// with `T_0 = T_0′ = T_0″ = 0`-seeds as below. Unlike the closed form
202/// `T_n″ = n((n+1)T_n − U_n)/(x²−1)` this never divides by `x²−1`, so it stays
203/// exact at the window edges `x = ±1`.
204fn cheb_t_double_prime(x: f64, n: usize) -> Vec<f64> {
205    let mut t = vec![0.0; n];
206    let mut tp = vec![0.0; n];
207    let mut tpp = vec![0.0; n];
208    if n > 0 {
209        t[0] = 1.0; // T_0 = 1, T_0′ = T_0″ = 0
210    }
211    if n > 1 {
212        t[1] = x; // T_1 = x, T_1′ = 1, T_1″ = 0
213        tp[1] = 1.0;
214    }
215    for d in 2..n {
216        t[d] = 2.0 * x * t[d - 1] - t[d - 2];
217        tp[d] = 2.0 * t[d - 1] + 2.0 * x * tp[d - 1] - tp[d - 2];
218        tpp[d] = 4.0 * tp[d - 1] + 2.0 * x * tpp[d - 1] - tpp[d - 2];
219    }
220    tpp
221}
222
223fn kahan_scaled_add_array2(
224    out: &mut Array2<f64>,
225    comp: &mut Array2<f64>,
226    scale: f64,
227    x: &Array2<f64>,
228) {
229    for ((slot, c), &value) in out.iter_mut().zip(comp.iter_mut()).zip(x.iter()) {
230        let y = scale * value - *c;
231        let t = *slot + y;
232        *c = (t - *slot) - y;
233        *slot = t;
234    }
235}
236
237fn kahan_scaled_add_array1(
238    out: &mut Array1<f64>,
239    comp: &mut Array1<f64>,
240    scale: f64,
241    x: &Array1<f64>,
242) {
243    for ((slot, c), &value) in out.iter_mut().zip(comp.iter_mut()).zip(x.iter()) {
244        let y = scale * value - *c;
245        let t = *slot + y;
246        *c = (t - *slot) - y;
247        *slot = t;
248    }
249}
250
251/// Spectral norm of a SYMMETRIC matrix `m` (here the difference of two
252/// orthogonal range projectors), i.e. `max|eigenvalue|`. For two equal-rank
253/// orthogonal projectors `P_ref`, `P_new` this equals `sin θ_max`, the sine of
254/// the largest principal angle between their ranges — the canonical, gauge- and
255/// basis-invariant distance between the two subspaces (#1033). Returns `None`
256/// if the matrix is non-finite or the symmetric eigendecomposition fails (the
257/// caller then refuses the skip, the sound fallback).
258fn subspace_spectral_distance(m: &Array2<f64>) -> Option<f64> {
259    use gam_linalg::faer_ndarray::FaerEigh;
260    if m.iter().any(|v| !v.is_finite()) {
261        return None;
262    }
263    // Symmetrize defensively against rounding (P_ref − P_new is symmetric in
264    // exact arithmetic) so the symmetric eigensolver sees a genuinely Hermitian
265    // operand and returns real eigenvalues.
266    let msym = 0.5 * (m + &m.t());
267    let (evals, _evecs) = msym.eigh(faer::Side::Lower).ok()?;
268    Some(evals.iter().fold(0.0_f64, |acc, &v| acc.max(v.abs())))
269}
270
271impl PsiGramTensor {
272    /// Build and certify the tensor over `psi ∈ [psi_lo, psi_hi]`.
273    ///
274    /// `eval_design(psi)` must return the EXACT n×k design at `psi` (the same
275    /// builder the per-trial path uses — exactness of the expansion is judged
276    /// against it). `weights` are the fixed observation weights, `z` the fixed
277    /// weighted-response target (e.g. `y − offset`). Returns `None` when the
278    /// window is degenerate, any evaluation fails/has non-finite entries, or
279    /// no ladder rung certifies — callers then keep the exact per-trial path.
280    pub fn build(
281        mut eval_design: impl FnMut(f64) -> Result<Array2<f64>, String>,
282        weights: ArrayView1<'_, f64>,
283        z: ArrayView1<'_, f64>,
284        psi_lo: f64,
285        psi_hi: f64,
286    ) -> Result<Self, String> {
287        if !(psi_lo.is_finite() && psi_hi.is_finite()) || psi_hi <= psi_lo {
288            return Err(format!(
289                "ψ window must be finite with psi_hi > psi_lo (got [{psi_lo}, {psi_hi}])"
290            ));
291        }
292        // Track the largest rung that produced a candidate but failed to
293        // certify (tail or off-node spot check). If the whole ladder is
294        // exhausted without an accepted candidate this drives a reason that
295        // distinguishes "not analytic enough in ψ" from "evaluation failed".
296        let mut last_uncertified: Option<usize> = None;
297        for &m in PSI_GRAM_NODE_LADDER.iter() {
298            match Self::build_at(&mut eval_design, weights, z, psi_lo, psi_hi, m) {
299                // An exact evaluation failed or was non-finite somewhere in
300                // the window — no larger rung can fix that, so abort with the
301                // underlying reason rather than swallowing it as a bare refusal.
302                BuildOutcome::EvalFailed(why) => {
303                    return Err(format!(
304                        "exact design evaluation failed at ladder rung m={m}: {why}"
305                    ));
306                }
307                // Tail not yet below the certificate at this rung: escalate.
308                // (Conflating this with EvalFailed would kill the ladder at
309                // its first — intentionally coarse — rung.)
310                BuildOutcome::TailNotCertified => {
311                    last_uncertified = Some(m);
312                    continue;
313                }
314                BuildOutcome::Candidate(mut candidate) => {
315                    if candidate.spot_check(&mut eval_design, weights) {
316                        candidate.grad_psi_lo = psi_lo;
317                        candidate.grad_psi_hi = psi_hi;
318                        return Ok(candidate);
319                    }
320                    // The assembled Gram disagreed with an exact off-node
321                    // rebuild at this rung; a denser rung may still certify, so
322                    // escalate rather than abort.
323                    last_uncertified = Some(m);
324                }
325            }
326        }
327        let top_rung = PSI_GRAM_NODE_LADDER.last().copied().unwrap_or(0);
328        Err(match last_uncertified {
329            Some(m) => format!(
330                "Chebyshev series did not certify within the node ladder (reached rung \
331                 m={m}, top rung {top_rung}): the design is not analytic enough in ψ over \
332                 [{psi_lo}, {psi_hi}] (a kink or non-finite curvature), so the n-free \
333                 tensor is refused and the exact per-trial path must be used"
334            ),
335            None => "empty Chebyshev node ladder".to_string(),
336        })
337    }
338
339    fn build_at(
340        eval_design: &mut impl FnMut(f64) -> Result<Array2<f64>, String>,
341        weights: ArrayView1<'_, f64>,
342        z: ArrayView1<'_, f64>,
343        psi_lo: f64,
344        psi_hi: f64,
345        m: usize,
346    ) -> BuildOutcome {
347        // #1033 (sufficient-statistic build): the one-time pass must ITSELF be a
348        // sufficient-statistic reduction — it may touch the n data rows once, but
349        // it must never hold or arithmetically process O(n) objects m times. The
350        // earlier build expanded m design-space Chebyshev coefficient SLABS
351        // (`X_d = (γ_d/m) Σ_i X(ψ_i) T_d(x_i)`, each n×k) purely to run a
352        // pre-filter tail certificate, holding all m exact designs AND all m slabs
353        // resident — O(m·n·k) memory (≈157 GB at n=320k, m=513, k=12) and an
354        // O(m²·n·k) coefficient sum that dominated the whole fit's wall-clock and
355        // made the n=320k acceptance sweep un-runnable. None of that O(n) work is
356        // retained: the tensor keeps only the k×k Gram series. So STREAM each
357        // exact node design straight into its weighted k×k sufficient statistic
358        // (Gram `X(ψ_i)ᵀW X(ψ_i)` and RHS `X(ψ_i)ᵀW z`) and DISCARD it before the
359        // next node. Peak memory is O(m·k² + n·k) (one design at a time) and the
360        // only row work is the single O(m·n·k²) node-statistic pass.
361        let mut nodes_x = vec![0.0_f64; m];
362        let mut node_grams: Vec<Array2<f64>> = Vec::with_capacity(m);
363        let mut node_rhs: Vec<Array1<f64>> = Vec::with_capacity(m);
364
365        // Weighted response (n-vector) and zᵀWz, formed once over the data rows.
366        if weights.len() != z.len() || z.is_empty() {
367            return BuildOutcome::EvalFailed(format!(
368                "incompatible build inputs: weights.len()={}, z.len()={}",
369                weights.len(),
370                z.len()
371            ));
372        }
373        let mut wz = Array1::<f64>::zeros(z.len());
374        let mut zt_w_z = 0.0_f64;
375        let mut zt_w_z_comp = 0.0_f64;
376        for ((slot, &w), &zv) in wz.iter_mut().zip(weights.iter()).zip(z.iter()) {
377            *slot = w * zv;
378            let add = w * zv * zv;
379            let y = add - zt_w_z_comp;
380            let t = zt_w_z + y;
381            zt_w_z_comp = (t - zt_w_z) - y;
382            zt_w_z = t;
383        }
384
385        let mut dims: Option<(usize, usize)> = None;
386        for (i, x_slot) in nodes_x.iter_mut().enumerate() {
387            let x = (std::f64::consts::PI * (2 * i + 1) as f64 / (2 * m) as f64).cos();
388            *x_slot = x;
389            let psi = 0.5 * (psi_lo + psi_hi) + 0.5 * (psi_hi - psi_lo) * x;
390            let design = match eval_design(psi) {
391                Ok(design) => design,
392                Err(why) => {
393                    return BuildOutcome::EvalFailed(format!(
394                        "design evaluation refused at node ψ={psi:.6}: {why}"
395                    ));
396                }
397            };
398            if design.iter().any(|v| !v.is_finite()) {
399                return BuildOutcome::EvalFailed(format!(
400                    "design at node ψ={psi:.6} contains a non-finite entry"
401                ));
402            }
403            let (dn, dk) = design.dim();
404            match dims {
405                None => {
406                    if weights.len() != dn || z.len() != dn || dn == 0 || dk == 0 {
407                        return BuildOutcome::EvalFailed(format!(
408                            "incompatible build inputs: design {dn}×{dk}, weights.len()={}, z.len()={}",
409                            weights.len(),
410                            z.len()
411                        ));
412                    }
413                    dims = Some((dn, dk));
414                }
415                Some((n0, k0)) => {
416                    if (dn, dk) != (n0, k0) {
417                        return BuildOutcome::EvalFailed(format!(
418                            "design dimensions vary across ψ nodes (first node is {n0}×{k0}, \
419                             node ψ={psi:.6} is {dn}×{dk})"
420                        ));
421                    }
422                }
423            }
424            // Weighted Gram / RHS at this node, then the n×k design is dropped.
425            // RHS uses the prebuilt `wz = W z` (same factoring as the exact
426            // streamed path) so the retained series is bit-faithful to it.
427            let mut wd = design.clone();
428            for (mut row, &w) in wd.outer_iter_mut().zip(weights.iter()) {
429                row.mapv_inplace(|v| v * w);
430            }
431            node_grams.push(design.t().dot(&wd));
432            node_rhs.push(design.t().dot(&wz));
433        }
434        let (_n, k) = dims.expect("node ladder rung m≥1 yields at least one design");
435        // First-kind discrete orthogonality of the Chebyshev nodes.
436        let t_at_nodes: Vec<Vec<f64>> = nodes_x.iter().map(|&x| cheb_t(x, m)).collect();
437        let mut gram: Vec<Array2<f64>> = (0..m).map(|_| Array2::<f64>::zeros((k, k))).collect();
438        let mut gram_comp: Vec<Array2<f64>> =
439            (0..m).map(|_| Array2::<f64>::zeros((k, k))).collect();
440        let mut rhs: Vec<Array1<f64>> = (0..m).map(|_| Array1::<f64>::zeros(k)).collect();
441        let mut rhs_comp: Vec<Array1<f64>> = (0..m).map(|_| Array1::<f64>::zeros(k)).collect();
442        for d in 0..m {
443            let gamma = if d == 0 { 1.0 } else { 2.0 };
444            for i in 0..m {
445                let wgt = gamma / m as f64 * t_at_nodes[i][d];
446                kahan_scaled_add_array2(&mut gram[d], &mut gram_comp[d], wgt, &node_grams[i]);
447                kahan_scaled_add_array1(&mut rhs[d], &mut rhs_comp[d], wgt, &node_rhs[i]);
448            }
449        }
450        drop(node_grams);
451        drop(node_rhs);
452
453        // Tail-decay certificate, now in k-SPACE on the RETAINED Gram/RHS series
454        // rather than the discarded design slabs.
455        //
456        // The series the per-trial path actually evaluates is the assembled Gram
457        // `G(ψ) = Σ_d gram[d] T_d(x(ψ))` and RHS `c(ψ) = Σ_d rhs[d] T_d(x(ψ))`;
458        // their Chebyshev coefficients are exactly what govern the truncated
459        // reconstruction error, so the cheap NECESSARY-CONDITION pre-filter
460        // belongs on THEM, not on the design X(ψ) (whose coefficients only bound
461        // G's tail indirectly, and at O(m·n·k) cost). The trailing quarter of the
462        // Gram (and RHS) coefficient slabs must fall below [`PSI_GRAM_CERT_RTOL`]
463        // × series scale.
464        //
465        // #1216: on the WIDE STANDARDIZED geometry default 1-D fits use (#1215)
466        // the tail decays cleanly but GEOMETRICALLY-SLOWLY, so the m=513 top rung
467        // is sized to drive the residual far below the bar. The certificate stays
468        // a necessary pre-filter; accuracy is authoritatively enforced by the
469        // off-node `spot_check` (`PSI_GRAM_SPOT_RTOL`, assembled Gram vs an exact
470        // rebuild). A genuinely non-analytic design (a true kink) floors ORDERS
471        // above this — its Gram series tail does NOT decay — and is refused here,
472        // with the spot-check as the hard backstop.
473        let gram_scale = gram.iter().fold(0.0_f64, |acc, slab| {
474            acc.max(slab.iter().fold(0.0_f64, |a, &v| a.max(v.abs())))
475        });
476        let rhs_scale = rhs.iter().fold(0.0_f64, |acc, slab| {
477            acc.max(slab.iter().fold(0.0_f64, |a, &v| a.max(v.abs())))
478        });
479        let tail_start = m - (m / 4).max(1);
480        let gram_bound = PSI_GRAM_CERT_RTOL * gram_scale.max(1e-300);
481        let rhs_bound = PSI_GRAM_CERT_RTOL * rhs_scale.max(1e-300);
482        for d in tail_start..m {
483            if gram[d].iter().any(|&v| v.abs() > gram_bound)
484                || rhs[d].iter().any(|&v| v.abs() > rhs_bound)
485            {
486                return BuildOutcome::TailNotCertified;
487            }
488        }
489        BuildOutcome::Candidate(Self {
490            psi_lo,
491            psi_hi,
492            // Provisional: `build` promotes these to the certified value window
493            // after the value spot-check passes.
494            grad_psi_lo: psi_lo,
495            grad_psi_hi: psi_hi,
496            n_coeff: m,
497            k,
498            gram,
499            rhs,
500            zt_w_z,
501        })
502    }
503
504    /// Off-node certification: the ASSEMBLED Gram must reproduce the exactly
505    /// rebuilt Gram at deterministic interior ψ values.
506    fn spot_check(
507        &self,
508        eval_design: &mut impl FnMut(f64) -> Result<Array2<f64>, String>,
509        weights: ArrayView1<'_, f64>,
510    ) -> bool {
511        for s in 0..PSI_GRAM_SPOT_POINTS {
512            // Golden-ratio low-discrepancy interior points — never the nodes.
513            let frac = ((s as f64 + 1.0) * 0.618_033_988_749_894_9).fract();
514            let psi = self.psi_lo + frac * (self.psi_hi - self.psi_lo);
515            let Ok(design) = eval_design(psi) else {
516                return false;
517            };
518            let mut wd = design.clone();
519            for (mut row, &w) in wd.outer_iter_mut().zip(weights.iter()) {
520                row.mapv_inplace(|v| v * w);
521            }
522            let exact = design.t().dot(&wd);
523            let assembled = self.gram_at(psi);
524            let scale = exact
525                .iter()
526                .fold(0.0_f64, |acc, &v| acc.max(v.abs()))
527                .max(1e-300);
528            for (a, b) in assembled.iter().zip(exact.iter()) {
529                if (a - b).abs() > PSI_GRAM_SPOT_RTOL * scale {
530                    return false;
531                }
532            }
533        }
534        true
535    }
536
537    /// Range (reduced-basis) projector of the conditioned Gram `XᵀWX(ψ)` and the
538    /// numerical rank, computed n-free from the k-space tensor. The reduced basis
539    /// the inner penalized solve forms is the column span of the eigenvectors of
540    /// the (symmetric PSD) Gram whose eigenvalue exceeds a rank-revealing cutoff
541    /// relative to the largest eigenvalue. The orthogonal projector `P = U_r U_rᵀ`
542    /// onto that span is a frame-INVARIANT witness of the reduced basis: two ψ's
543    /// share a reduced basis iff their range projectors coincide (the projector
544    /// is invariant to the orthonormal-basis gauge freedom within the range, so
545    /// it isolates exactly the subspace identity the skip needs, not an arbitrary
546    /// eigenvector rotation). Returns `None` if the Gram is non-finite or its
547    /// symmetric eigendecomposition fails.
548    fn range_projector(&self, psi: f64, rank_rtol: f64) -> Option<(Array2<f64>, usize)> {
549        use gam_linalg::faer_ndarray::FaerEigh;
550        let g = self.gram_at(psi);
551        if g.iter().any(|v| !v.is_finite()) {
552            return None;
553        }
554        // Symmetrize defensively (gram_at is symmetric up to rounding).
555        let gsym = 0.5 * (&g + &g.t());
556        let (evals, evecs) = gsym.eigh(faer::Side::Lower).ok()?;
557        // `eigh` returns ascending eigenvalues; the Gram is PSD so the largest is
558        // the trailing one. The rank cutoff is relative to that maximum.
559        let lambda_max = evals.iter().cloned().fold(0.0_f64, f64::max);
560        if !(lambda_max > 0.0) {
561            return None;
562        }
563        let cutoff = rank_rtol * lambda_max;
564        let mut proj = Array2::<f64>::zeros((self.k, self.k));
565        let mut rank = 0usize;
566        for (col, &lam) in evals.iter().enumerate() {
567            if lam > cutoff {
568                let u = evecs.column(col);
569                // P += u uᵀ.
570                for a in 0..self.k {
571                    for b in 0..self.k {
572                        proj[[a, b]] += u[a] * u[b];
573                    }
574                }
575                rank += 1;
576            }
577        }
578        Some((proj, rank))
579    }
580
581    /// True when the realized reduced basis the design-revision fast path freezes
582    /// at the pinning `psi_ref` is still valid at `psi_new` — the genuine
583    /// reduced-basis-equality witness the skip requires (#1264, #1216 item 3).
584    ///
585    /// The fast path keeps the reference surface (its conditioned frame and its
586    /// RRQR-reduced / null-space basis) frozen at `psi_ref` while re-keying only
587    /// the Gram `XᵀWX(ψ)` and penalty `S(ψ)` to `psi_new`. That is exact iff the
588    /// reduced basis — the range / null split of the conditioned data Gram — is
589    /// unchanged. A conditioning-ratio or RRQR rank/permutation gate only bounds
590    /// NECESSARY conditions; the reduced SUBSPACE can still rotate while rank and
591    /// pivot order look tame, which is exactly the ~7.8e-2 β̂ regression a cluster run
592    /// found. This witness compares the orthogonal RANGE PROJECTORS of the
593    /// conditioned Gram at `psi_ref` and `psi_new` (both assembled n-free from the
594    /// tensor): the skip is sound only when the numerical ranks match AND the
595    /// projectors agree to `proj_atol` in max-norm — i.e. the two reduced bases
596    /// span the SAME subspace. The projector identity is gauge-invariant, so it
597    /// certifies subspace equality directly rather than a particular basis choice.
598    ///
599    /// `psi_ref == psi_new` (a repeat trial at the same ψ) is trivially sound.
600    /// Off-window ψ's, a non-finite / rank-degenerate Gram, or any eigendecomp
601    /// failure return `false` (refuse the skip → caller takes the slow path).
602    ///
603    /// ROTATION WALL (#1033). On production spatial geometry the conditioned
604    /// data-Gram range subspace can ROTATE with ψ at fixed rank — the wall on
605    /// which the earlier RRQR-pivot / entrywise-projector gates kept refusing the
606    /// skip. The fix is the SUBSPACE-DISTANCE certificate below: the skip is sound
607    /// exactly when the two equal-rank ranges coincide as SUBSPACES, measured by
608    /// the spectral norm of the projector difference (the principal angle), which
609    /// is invariant to any orthonormal-basis rotation WITHIN the range. So a pure
610    /// gauge rotation that left the entrywise max-abs above tolerance — and
611    /// therefore used to be refused — now certifies, letting the n-free skip fire
612    /// across the rotation. A genuine subspace MOVE (different rank, or a real
613    /// principal-angle separation) still refuses; refusing is the SOUND fallback
614    /// (the caller takes the exact slow path). Do not weaken
615    /// `PSI_GRAM_SKIP_PROJ_ATOL` / `PSI_GRAM_SKIP_RANK_RTOL`: the spectral gate is
616    /// already the tightest correct subspace metric, and loosening it past a true
617    /// principal-angle separation reintroduces the ~7.8e-2 β̂ regression this
618    /// witness exists to prevent.
619    pub fn reduced_basis_equal(&self, psi_ref: f64, psi_new: f64) -> bool {
620        if !(self.contains(psi_ref) && self.contains(psi_new)) {
621            return false;
622        }
623        if psi_ref == psi_new {
624            return true;
625        }
626        let Some((p_ref, r_ref)) = self.range_projector(psi_ref, PSI_GRAM_SKIP_RANK_RTOL) else {
627            return false;
628        };
629        let Some((p_new, r_new)) = self.range_projector(psi_new, PSI_GRAM_SKIP_RANK_RTOL) else {
630            return false;
631        };
632        if r_ref != r_new {
633            return false;
634        }
635        // Subspace-distance certificate (#1033). The two reduced bases span the
636        // SAME subspace iff their orthogonal range projectors coincide. The
637        // correct, gauge-invariant measure of "how far apart" two equal-rank
638        // subspaces are is the SPECTRAL NORM ‖P_ref − P_new‖₂ = sin θ_max, the
639        // sine of the largest principal angle between the ranges — NOT the
640        // entrywise max-abs of the projector difference. The old entrywise test
641        // is a strictly weaker proxy: across a basis ROTATION (the radial-Gram
642        // rotation wall this skip kept tripping on) the projector entries can
643        // each drift while the spanned subspace is numerically identical, so the
644        // entrywise max could exceed tolerance and FALSELY refuse a sound skip.
645        // P_ref − P_new is symmetric, so its spectral norm is max|eigenvalue|;
646        // compute it via the symmetric eigensolver and gate on the principal
647        // angle directly. This certifies subspace identity across the rotation,
648        // letting the n-free skip fire whenever the range genuinely coincides,
649        // while still refusing (the SOUND fallback) the instant the subspaces
650        // separate by more than PSI_GRAM_SKIP_PROJ_ATOL in true subspace
651        // distance. An eigendecomp failure on the (small k×k) difference refuses.
652        let diff = &p_ref - &p_new;
653        subspace_spectral_distance(&diff)
654            .map(|d| d <= PSI_GRAM_SKIP_PROJ_ATOL)
655            .unwrap_or(false)
656    }
657
658    /// The gauge-invariant subspace distance `‖P(ψ_ref) − P(ψ_new)‖₂ = sin θ_max`
659    /// between the two conditioned-Gram range subspaces — the exact quantity
660    /// [`Self::reduced_basis_equal`] thresholds against `PSI_GRAM_SKIP_PROJ_ATOL`.
661    /// Exposed for #1033 frontier instrumentation so a refused n-free skip can be
662    /// attributed to a genuine in-window basis ROTATION (this distance exceeds the
663    /// tolerance at equal rank) versus a rank change. Returns `None` for an
664    /// off-window ψ, an equal-ψ pair, a rank mismatch, or an eigendecomp failure.
665    /// Purely k-space (O(k³)) — independent of n.
666    pub fn reduced_basis_subspace_distance(&self, psi_ref: f64, psi_new: f64) -> Option<f64> {
667        if !(self.contains(psi_ref) && self.contains(psi_new)) {
668            return None;
669        }
670        if psi_ref == psi_new {
671            return Some(0.0);
672        }
673        let (p_ref, r_ref) = self.range_projector(psi_ref, PSI_GRAM_SKIP_RANK_RTOL)?;
674        let (p_new, r_new) = self.range_projector(psi_new, PSI_GRAM_SKIP_RANK_RTOL)?;
675        if r_ref != r_new {
676            return None;
677        }
678        let diff = &p_ref - &p_new;
679        subspace_spectral_distance(&diff)
680    }
681
682    /// Numerical rank of the conditioned Gram `XᵀWX(ψ)` at `psi`, under the same
683    /// relative cutoff (`PSI_GRAM_SKIP_RANK_RTOL`·λ_max) the design-revision skip's
684    /// `reduced_basis_equal` witness uses. Returns `None` for an off-window /
685    /// non-finite / all-zero Gram. Purely k-space (O(k³)) — independent of n.
686    pub fn gram_numerical_rank(&self, psi: f64) -> Option<usize> {
687        if !self.contains(psi) {
688            return None;
689        }
690        self.range_projector(psi, PSI_GRAM_SKIP_RANK_RTOL)
691            .map(|(_, rank)| rank)
692    }
693
694    /// Lower edge of the contiguous ψ-band, ANCHORED at `psi_anchor`, over which
695    /// the conditioned Gram `XᵀWX(ψ)` holds the SAME numerical rank it has at the
696    /// anchor — i.e. the ψ-floor below which the design-revision skip's
697    /// `reduced_basis_equal` witness must (soundly) refuse, because the range
698    /// subspace collapses as the longest-length-scale radial mode drops under the
699    /// rank cutoff. Lifting the κ-optimizer's lower bound to this floor keeps every
700    /// in-window trial on the n-free fast path and is inherently n-INDEPENDENT: the
701    /// rank is a property of the k×k tensor, not of the sample size (#1033).
702    ///
703    /// Anchoring at `psi_anchor` (the optimizer's ψ seed) is essential: the
704    /// conditioned Gram is rank-deficient at BOTH window ends on production radial
705    /// geometry — at small ψ the longest-scale mode collapses into the polynomial
706    /// nullspace, and at very large ψ every radial column goes collinear with it.
707    /// The maximal-rank region is therefore a middle BAND, and the κ-optimum lives
708    /// inside it. We walk DOWN from the anchor on a fixed k-space grid and return
709    /// the lowest ψ still at the anchor's rank (stopping at the first node that
710    /// differs). Purely O(nodes·k³) — no row access.
711    ///
712    /// Returns `None` when the band already reaches `psi_lo` (no lift needed), when
713    /// the anchor is off-window / rank-indeterminate, or when the window is empty.
714    pub fn rank_stable_psi_floor(&self, psi_anchor: f64) -> Option<f64> {
715        // Fixed k-space grid over the window. 96 nodes resolves the rank cliff
716        // (~1 ψ-decade wide on production Duchon geometry) far finer than the
717        // optimizer's ~ln2 step; the whole scan is O(nodes·k³), independent of n.
718        const NODES: usize = 96;
719        if !(self.psi_hi > self.psi_lo) {
720            return None;
721        }
722        let span = self.psi_hi - self.psi_lo;
723        let psi_at = |i: usize| self.psi_lo + span * (i as f64) / ((NODES - 1) as f64);
724        let ranks: Vec<Option<usize>> =
725            (0..NODES).map(|i| self.gram_numerical_rank(psi_at(i))).collect();
726        // Target the MAXIMAL numerical rank attained anywhere in the window — the
727        // full-rank "good" geometry the skip certifies. Anchoring on the seed's own
728        // rank would be fragile if the seed happened to land in a deficient spot;
729        // the window-max is the rank the κ-optimum's neighbourhood must hold.
730        let max_rank = ranks.iter().filter_map(|r| *r).max()?;
731        // Map the anchor to the nearest grid node, then snap UP to the nearest node
732        // at maximal rank (so the band edge is measured from inside the good band
733        // even if the seed sits just below the cliff). If no max-rank node exists
734        // at/above the anchor, there is nothing to protect below it.
735        let anchor = psi_anchor.clamp(self.psi_lo, self.psi_hi);
736        let anchor_idx = (((anchor - self.psi_lo) / span) * ((NODES - 1) as f64))
737            .round()
738            .clamp(0.0, (NODES - 1) as f64) as usize;
739        let band_idx = (anchor_idx..NODES).find(|&i| ranks[i] == Some(max_rank))?;
740        // Walk DOWN from the band node; the floor is the lowest node from which
741        // every node up to it holds `max_rank`. Stop at the first node below.
742        let mut floor_idx = band_idx;
743        for i in (0..band_idx).rev() {
744            if ranks[i] == Some(max_rank) {
745                floor_idx = i;
746            } else {
747                break;
748            }
749        }
750        if floor_idx == 0 {
751            // The maximal-rank band already reaches `psi_lo` — no lift needed.
752            None
753        } else {
754            Some(psi_at(floor_idx))
755        }
756    }
757
758    /// Upper edge of the contiguous maximal-rank ψ-band, the symmetric twin of
759    /// [`Self::rank_stable_psi_floor`] (#1033). The conditioned Gram `XᵀWX(ψ)` is
760    /// rank-deficient at BOTH window ends — at small ψ the longest-length-scale
761    /// radial mode collapses into the polynomial nullspace, and at very large ψ
762    /// every radial column goes collinear with the low-frequency mode, so the
763    /// maximal-rank region is a middle BAND. The optimizer's line search can
764    /// OVERSHOOT above that band (e.g. ψ≈1.0 on production Duchon geometry), where
765    /// the design-realization skip's `reduced_basis_equal` witness must soundly
766    /// refuse (the range subspace dropped a dimension) → an O(n) `reset_surface`,
767    /// AND the pinning ψ recorded at that reset is itself rank-deficient, so the
768    /// NEXT in-band trial mismatches its reference and resets a SECOND time. Both
769    /// resets vanish once the optimizer's UPPER bound is clamped down to this
770    /// n-free k-space ceiling, keeping every trial inside the maximal-rank band.
771    ///
772    /// Walks UP from the anchor on the same fixed k-space grid as the floor and
773    /// returns the highest ψ still at the window's maximal numerical rank
774    /// (stopping at the first node above that differs). Purely O(nodes·k³) — no
775    /// row access, inherently n-INDEPENDENT (rank is a property of the k×k tensor).
776    ///
777    /// Returns `None` when the band already reaches `psi_hi` (no clamp needed),
778    /// when the anchor is off-window / rank-indeterminate, or when the window is
779    /// empty.
780    pub fn rank_stable_psi_ceiling(&self, psi_anchor: f64) -> Option<f64> {
781        // Same grid + max-rank target + anchor→band snap as `rank_stable_psi_floor`
782        // so the floor and ceiling bracket the SAME contiguous maximal-rank band.
783        const NODES: usize = 96;
784        if !(self.psi_hi > self.psi_lo) {
785            return None;
786        }
787        let span = self.psi_hi - self.psi_lo;
788        let psi_at = |i: usize| self.psi_lo + span * (i as f64) / ((NODES - 1) as f64);
789        let ranks: Vec<Option<usize>> =
790            (0..NODES).map(|i| self.gram_numerical_rank(psi_at(i))).collect();
791        let max_rank = ranks.iter().filter_map(|r| *r).max()?;
792        let anchor = psi_anchor.clamp(self.psi_lo, self.psi_hi);
793        let anchor_idx = (((anchor - self.psi_lo) / span) * ((NODES - 1) as f64))
794            .round()
795            .clamp(0.0, (NODES - 1) as f64) as usize;
796        // Snap to the nearest max-rank node at/below the anchor (the mirror of the
797        // floor's snap-UP), so the band edge is measured from inside the good band.
798        let band_idx = (0..=anchor_idx).rev().find(|&i| ranks[i] == Some(max_rank))?;
799        // Walk UP from the band node; the ceiling is the highest node from which
800        // every node down to it holds `max_rank`. Stop at the first node above.
801        let mut ceil_idx = band_idx;
802        for i in (band_idx + 1)..NODES {
803            if ranks[i] == Some(max_rank) {
804                ceil_idx = i;
805            } else {
806                break;
807            }
808        }
809        if ceil_idx == NODES - 1 {
810            // The maximal-rank band already reaches `psi_hi` — no clamp needed.
811            None
812        } else {
813            Some(psi_at(ceil_idx))
814        }
815    }
816
817    /// True when `psi` lies inside the certified window.
818    pub fn contains(&self, psi: f64) -> bool {
819        psi.is_finite() && psi >= self.psi_lo && psi <= self.psi_hi
820    }
821
822    /// The certified value window `[psi_lo, psi_hi]` (#1033 instrumentation).
823    pub fn psi_window(&self) -> (f64, f64) {
824        (self.psi_lo, self.psi_hi)
825    }
826
827    /// True when `psi` lies inside the certified gradient window where the
828    /// analytic ψ-derivative is bit-tight against the exact design derivative
829    /// (#1033b). The n-free kappa outer loop is armed only when this covers the
830    /// full optimizer bounds.
831    pub fn contains_for_gradient(&self, psi: f64) -> bool {
832        psi.is_finite()
833            && self.grad_psi_lo.is_finite()
834            && self.grad_psi_hi.is_finite()
835            && psi >= self.grad_psi_lo
836            && psi <= self.grad_psi_hi
837    }
838
839    fn mapped(&self, psi: f64) -> f64 {
840        (2.0 * psi - (self.psi_lo + self.psi_hi)) / (self.psi_hi - self.psi_lo)
841    }
842
843    /// `XᵀWX(ψ)` assembled n-free in O(Dk²) from the direct Gram series.
844    pub fn gram_at(&self, psi: f64) -> Array2<f64> {
845        let x = self.mapped(psi);
846        let t = cheb_t(x, self.gram.len());
847        let mut out = Array2::<f64>::zeros((self.k, self.k));
848        let mut comp = Array2::<f64>::zeros((self.k, self.k));
849        for (d, td) in t.iter().enumerate() {
850            kahan_scaled_add_array2(&mut out, &mut comp, *td, &self.gram[d]);
851        }
852        out
853    }
854
855    /// `XᵀWz(ψ)` assembled n-free in O(Dk).
856    pub fn rhs_at(&self, psi: f64) -> Array1<f64> {
857        let x = self.mapped(psi);
858        let t = cheb_t(x, self.n_coeff);
859        let mut out = Array1::<f64>::zeros(self.k);
860        let mut comp = Array1::<f64>::zeros(self.k);
861        for (d, td) in t.iter().enumerate() {
862            kahan_scaled_add_array1(&mut out, &mut comp, *td, &self.rhs[d]);
863        }
864        out
865    }
866
867    /// Exact `∂(XᵀWX)/∂ψ` from the SAME representation as the value — the
868    /// structural cure for the objective↔gradient desync class on this
869    /// channel. n-free, O(Dk²) from the direct Gram series.
870    pub fn dgram_dpsi(&self, psi: f64) -> Array2<f64> {
871        let x = self.mapped(psi);
872        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
873        let tp = cheb_t_prime(x, self.gram.len());
874        let mut out = Array2::<f64>::zeros((self.k, self.k));
875        let mut comp = Array2::<f64>::zeros((self.k, self.k));
876        for (d, tpd) in tp.iter().enumerate() {
877            kahan_scaled_add_array2(&mut out, &mut comp, *tpd * dx_dpsi, &self.gram[d]);
878        }
879        out
880    }
881
882    /// Exact `∂(XᵀWz)/∂ψ`, n-free.
883    pub fn drhs_dpsi(&self, psi: f64) -> Array1<f64> {
884        let x = self.mapped(psi);
885        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
886        let tp = cheb_t_prime(x, self.n_coeff);
887        let mut out = Array1::<f64>::zeros(self.k);
888        let mut comp = Array1::<f64>::zeros(self.k);
889        for (d, tpd) in tp.iter().enumerate() {
890            kahan_scaled_add_array1(&mut out, &mut comp, *tpd * dx_dpsi, &self.rhs[d]);
891        }
892        out
893    }
894
895    /// Exact `∂²(XᵀWX)/∂ψ²` from the SAME representation as the value/gradient —
896    /// the n-free curvature that lets the outer Newton/ARC step read the τ-τ
897    /// Hessian's design-moving block without re-streaming an O(n) slab Gram
898    /// (#1033, Gaussian-identity single-ψ Hessian channel). O(Dk²) from the
899    /// direct Gram series.
900    ///
901    /// `XᵀWX(ψ) = Σ_d T_d(x) G_d` with `x = mapped(ψ)`, so by the chain rule
902    /// `d²/dψ² = T_d″(x) · (dx/dψ)²`.
903    pub fn d2gram_dpsi2(&self, psi: f64) -> Array2<f64> {
904        let x = self.mapped(psi);
905        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
906        let dx_dpsi_sq = dx_dpsi * dx_dpsi;
907        let tpp = cheb_t_double_prime(x, self.gram.len());
908        let mut out = Array2::<f64>::zeros((self.k, self.k));
909        let mut comp = Array2::<f64>::zeros((self.k, self.k));
910        for (d, tppd) in tpp.iter().enumerate() {
911            kahan_scaled_add_array2(&mut out, &mut comp, *tppd * dx_dpsi_sq, &self.gram[d]);
912        }
913        out
914    }
915
916    /// Exact `∂²(XᵀWz)/∂ψ²`, n-free. `T_d″·(dx/dψ)²` against the rhs slabs.
917    pub fn d2rhs_dpsi2(&self, psi: f64) -> Array1<f64> {
918        let x = self.mapped(psi);
919        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
920        let dx_dpsi_sq = dx_dpsi * dx_dpsi;
921        let tpp = cheb_t_double_prime(x, self.n_coeff);
922        let mut out = Array1::<f64>::zeros(self.k);
923        let mut comp = Array1::<f64>::zeros(self.k);
924        for (d, tppd) in tpp.iter().enumerate() {
925            kahan_scaled_add_array1(&mut out, &mut comp, *tppd * dx_dpsi_sq, &self.rhs[d]);
926        }
927        out
928    }
929
930    /// Assemble the Gaussian-identity sufficient-statistic cache at `psi`
931    /// without touching a single data row — the bridge from this tensor into
932    /// the inner PLS solver's fast path (#1033b → `GaussianFixedCache`).
933    ///
934    /// `(XᵀWX, XᵀWz, zᵀWz)` is everything the Gaussian penalized solve needs
935    /// at any λ, so a ψ-trial that holds a certified tensor can hand the
936    /// inner solver this cache instead of realizing the n×k design. The
937    /// caller is responsible for `contains(psi)` (off-window trials fall back
938    /// to the exact realizer path). Dense-path bridge only: the sparse
939    /// scatter cache stays `None`.
940    pub fn gaussian_fixed_cache_at(&self, psi: f64) -> crate::pirls::GaussianFixedCache {
941        crate::pirls::GaussianFixedCache {
942            xtwx_orig: self.gram_at(psi),
943            xtwy_orig: self.rhs_at(psi),
944            centered_weighted_y_sq: self.zt_w_z,
945            row_prediction_is_stale: true,
946            xtwx_sparse_orig: None,
947        }
948    }
949}
950
951#[cfg(test)]
952mod tests {
953    use super::*;
954
955    /// Analytic Matérn-shaped synthetic design: entries g(e^{u_ij + ψ}) with
956    /// g(s) = (1 + s)·exp(−s) (the ν = 3/2 Matérn shape) plus a ψ-free power
957    /// column — the exact structural mix of the production radial designs.
958    fn synth_design(psi: f64, n: usize, k: usize) -> Result<Array2<f64>, String> {
959        let mut x = Array2::<f64>::zeros((n, k));
960        for i in 0..n {
961            for j in 0..k {
962                let r = 0.05 + (i as f64 + 1.0) * (j as f64 + 1.0) / (n as f64 * k as f64) * 3.0;
963                if j == k - 1 {
964                    // ψ-free polynomial block column.
965                    x[[i, j]] = r * r * r;
966                } else {
967                    let s = r * psi.exp();
968                    x[[i, j]] = (1.0 + s) * (-s).exp();
969                }
970            }
971        }
972        Ok(x)
973    }
974
975    /// A genuinely FULL-RANK, well-conditioned, ψ-dependent synthetic design for
976    /// the gauge-invariance witness test. Unlike `synth_design` (whose Matérn-like
977    /// `(1+s)e^{-s}` columns over a narrow `r`-range collapse to a numerical rank
978    /// of 3–4 of `k=6` and whose near-null subspace *rotates* across the window —
979    /// so `reduced_basis_equal` correctly refuses), this builds `k` near-orthogonal
980    /// Fourier/Chebyshev-flavoured base columns and applies a mild, sign-varying
981    /// per-column amplitude `e^{c_j·ψ}`. The base columns are linearly independent
982    /// with a Gram condition number `≈3`, so the weighted Gram is full column rank
983    /// (numerical rank `= k`) at *every* ψ in the window — its range is the whole
984    /// k-space and the orthogonal range projector is the identity for all ψ. The
985    /// amplitude modulation still genuinely *rotates the eigenvectors* with ψ, so
986    /// the witness must certify (identical range subspace) despite a per-ψ
987    /// eigenvector gauge that differs — exactly the gauge invariance under test.
988    /// The amplitudes are entire in ψ, so the Chebyshev tensor still certifies.
989    fn synth_full_rank_design(psi: f64, n: usize, k: usize) -> Result<Array2<f64>, String> {
990        use std::f64::consts::PI;
991        assert!(k >= 2 && k % 2 == 0, "helper assumes an even k ≥ 2");
992        // ψ-analytic Givens angle: rotates each adjacent column plane by θ(ψ). A
993        // rotation is orthogonal, so it preserves the COLUMN SPACE and the Gram
994        // SPECTRUM (rank = k, condition number constant in ψ) while genuinely
995        // turning the eigenvECTORS — the precise setting in which the range
996        // projector is ψ-invariant (identity at full rank) but the per-ψ gauge
997        // differs. cos/sin are entire, so the Chebyshev tensor still certifies.
998        let theta = 0.6 * psi;
999        let (c, s) = (theta.cos(), theta.sin());
1000        let mut x = Array2::<f64>::zeros((n, k));
1001        for i in 0..n {
1002            let t = (i as f64 + 0.5) / n as f64;
1003            // Distinctly-scaled near-orthogonal base columns → distinct, separated
1004            // eigenvalues so each eigenvector is well-defined (no degenerate plane
1005            // that would make the rotation gauge-ambiguous).
1006            let mut b = vec![0.0_f64; k];
1007            for (j, slot) in b.iter_mut().enumerate() {
1008                let base = if j % 2 == 0 {
1009                    ((j as f64) * PI * t).cos()
1010                } else {
1011                    (((j + 1) as f64) * PI * t).sin()
1012                };
1013                *slot = (1.0 + 0.5 * j as f64) * base;
1014            }
1015            // Apply the Givens rotation to every adjacent (2m, 2m+1) plane,
1016            // including the dominant top plane, so the LEADING eigenvector rotates
1017            // too (a rotation confined to the small-eigenvalue planes would leave
1018            // the leading eigenvector fixed and make the gauge check vacuous).
1019            let mut row = b.clone();
1020            let mut p = 0;
1021            while p + 1 < k {
1022                let (bp, bq) = (b[p], b[p + 1]);
1023                row[p] = c * bp - s * bq;
1024                row[p + 1] = s * bp + c * bq;
1025                p += 2;
1026            }
1027            for (j, &v) in row.iter().enumerate() {
1028                x[[i, j]] = v;
1029            }
1030        }
1031        Ok(x)
1032    }
1033
1034    fn exact_gram(psi: f64, n: usize, k: usize, w: &Array1<f64>) -> Array2<f64> {
1035        let design = synth_design(psi, n, k).unwrap();
1036        let mut wd = design.clone();
1037        for (mut row, &wi) in wd.outer_iter_mut().zip(w.iter()) {
1038            row.mapv_inplace(|v| v * wi);
1039        }
1040        design.t().dot(&wd)
1041    }
1042
1043    /// #1033b primitive gate: the certified tensor must reproduce the exact
1044    /// Gram/rhs at arbitrary in-window ψ to certification accuracy, and its
1045    /// analytic ψ-derivative must match central finite differences of the
1046    /// exact Gram — value and gradient from one representation.
1047    #[test]
1048    fn psi_gram_tensor_matches_exact_gram_and_fd_gradient() {
1049        let (n, k) = (160usize, 7usize);
1050        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1051        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.37).sin()));
1052        let (psi_lo, psi_hi) = (-1.2_f64, 1.0_f64);
1053
1054        let tensor = PsiGramTensor::build(
1055            |psi| synth_design(psi, n, k),
1056            w.view(),
1057            z.view(),
1058            psi_lo,
1059            psi_hi,
1060        )
1061        .expect("analytic synthetic design must certify");
1062
1063        for &psi in &[-1.1, -0.63, 0.0, 0.41, 0.97] {
1064            assert!(tensor.contains(psi));
1065            let exact = exact_gram(psi, n, k, &w);
1066            let fast = tensor.gram_at(psi);
1067            let scale = exact.iter().fold(0.0_f64, |a, &v| a.max(v.abs()));
1068            for (a, b) in fast.iter().zip(exact.iter()) {
1069                assert!(
1070                    (a - b).abs() <= 1e-9 * scale,
1071                    "gram mismatch at psi={psi}: fast={a}, exact={b}"
1072                );
1073            }
1074            // rhs check against the exact weighted cross-product.
1075            let design = synth_design(psi, n, k).unwrap();
1076            let mut wz = Array1::<f64>::zeros(n);
1077            for ((slot, &wi), &zi) in wz.iter_mut().zip(w.iter()).zip(z.iter()) {
1078                *slot = wi * zi;
1079            }
1080            let exact_rhs = design.t().dot(&wz);
1081            let fast_rhs = tensor.rhs_at(psi);
1082            let rscale = exact_rhs.iter().fold(0.0_f64, |a, &v| a.max(v.abs()));
1083            for (a, b) in fast_rhs.iter().zip(exact_rhs.iter()) {
1084                assert!(
1085                    (a - b).abs() <= 1e-9 * rscale,
1086                    "rhs mismatch at psi={psi}: fast={a}, exact={b}"
1087                );
1088            }
1089            // Analytic ψ-gradient vs central FD of the EXACT gram.
1090            let h = 1e-5;
1091            let g_plus = exact_gram(psi + h, n, k, &w);
1092            let g_minus = exact_gram(psi - h, n, k, &w);
1093            let dg = tensor.dgram_dpsi(psi);
1094            let dscale = dg.iter().fold(0.0_f64, |a, &v| a.max(v.abs())).max(1e-12);
1095            for ((a, p), m_) in dg.iter().zip(g_plus.iter()).zip(g_minus.iter()) {
1096                let fd = (p - m_) / (2.0 * h);
1097                assert!(
1098                    (a - fd).abs() <= 1e-5 * dscale,
1099                    "dgram/dpsi mismatch at psi={psi}: analytic={a}, fd={fd}"
1100                );
1101            }
1102        }
1103        // Outside the window the caller must fall back to the exact path.
1104        assert!(!tensor.contains(psi_hi + 0.5));
1105        assert!(!tensor.contains(psi_lo - 0.5));
1106
1107        // Bridge gate (#1033b → GaussianFixedCache): the n-free cache must
1108        // reproduce the exactly streamed sufficient statistics, and the
1109        // ridge-penalized solves through both must agree — the inner PLS
1110        // consumes nothing else, so this certifies the trial-loop handoff.
1111        for &psi in &[-0.9, 0.2, 0.8] {
1112            let cache = tensor.gaussian_fixed_cache_at(psi);
1113            let design = synth_design(psi, n, k).unwrap();
1114            let mut wd = design.clone();
1115            for (mut row, &wi) in wd.outer_iter_mut().zip(w.iter()) {
1116                row.mapv_inplace(|v| v * wi);
1117            }
1118            let exact_gram = design.t().dot(&wd);
1119            let exact_rhs = wd.t().dot(&z);
1120            let exact_ztwz: f64 = w.iter().zip(z.iter()).map(|(&wi, &zi)| wi * zi * zi).sum();
1121            assert!(
1122                (cache.centered_weighted_y_sq - exact_ztwz).abs()
1123                    <= 1e-12 * exact_ztwz.abs().max(1e-300),
1124                "zᵀWz drift: cache={}, exact={exact_ztwz}",
1125                cache.centered_weighted_y_sq
1126            );
1127            // Ridge-penalized solve agreement: (G + I)β = r on both sides.
1128            let solve = |g: &Array2<f64>, r: &Array1<f64>| -> Array1<f64> {
1129                let mut a = g.clone();
1130                for i in 0..k {
1131                    a[[i, i]] += 1.0;
1132                }
1133                // Small dense Gauss elimination (k = 7 in this test).
1134                let mut aug = Array2::<f64>::zeros((k, k + 1));
1135                aug.slice_mut(ndarray::s![.., ..k]).assign(&a);
1136                aug.slice_mut(ndarray::s![.., k]).assign(r);
1137                for col in 0..k {
1138                    let piv = (col..k)
1139                        .max_by(|&p, &q| aug[[p, col]].abs().total_cmp(&aug[[q, col]].abs()))
1140                        .unwrap();
1141                    if piv != col {
1142                        for j in 0..=k {
1143                            let tmp = aug[[col, j]];
1144                            aug[[col, j]] = aug[[piv, j]];
1145                            aug[[piv, j]] = tmp;
1146                        }
1147                    }
1148                    let p = aug[[col, col]];
1149                    for row in 0..k {
1150                        if row == col {
1151                            continue;
1152                        }
1153                        let f = aug[[row, col]] / p;
1154                        for j in col..=k {
1155                            aug[[row, j]] -= f * aug[[col, j]];
1156                        }
1157                    }
1158                }
1159                Array1::from_iter((0..k).map(|i| aug[[i, k]] / aug[[i, i]]))
1160            };
1161            let beta_fast = solve(&cache.xtwx_orig, &cache.xtwy_orig);
1162            let beta_exact = solve(&exact_gram, &exact_rhs);
1163            let bscale = beta_exact
1164                .iter()
1165                .fold(0.0_f64, |a, &v| a.max(v.abs()))
1166                .max(1e-300);
1167            for (a, b) in beta_fast.iter().zip(beta_exact.iter()) {
1168                assert!(
1169                    (a - b).abs() <= 1e-8 * bscale,
1170                    "penalized solve drift at psi={psi}: fast={a}, exact={b}"
1171                );
1172            }
1173        }
1174    }
1175
1176    /// #1033 rank-stable κ-floor: the conditioned radial Gram goes numerically
1177    /// rank-deficient at the LARGE-length-scale (small-ψ) window edge — the
1178    /// `synth_design` Matérn columns over a narrow `r`-range collapse toward the
1179    /// polynomial nullspace there. `rank_stable_psi_floor` must (a) detect that the
1180    /// maximal-rank band does NOT reach `psi_lo` and return a floor strictly inside
1181    /// the window, (b) report that floor as the lower edge of the maximal-rank band
1182    /// containing the seed, and (c) be a pure k-space property — IDENTICAL whether
1183    /// the tensor was built from n=200 or n=4000 rows (the n-independence the κ
1184    /// outer loop relies on). A design that is full-rank across the whole window
1185    /// must return `None` (no lift needed).
1186    #[test]
1187    fn rank_stable_psi_floor_is_inside_window_and_n_independent() {
1188        let k = 7usize;
1189        // Window spanning the small-ψ rank cliff. Kept moderate so the Chebyshev
1190        // ladder certifies at a low rung (the build is the only n-pass; a wide
1191        // window forces high rungs = many design realizations = slow test).
1192        let (psi_lo, psi_hi) = (-1.6_f64, 1.0_f64);
1193        let build_at = |n: usize| {
1194            let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1195            let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.37).sin()));
1196            PsiGramTensor::build(|psi| synth_design(psi, n, k), w.view(), z.view(), psi_lo, psi_hi)
1197                .expect("analytic synthetic design must certify")
1198        };
1199
1200        let t_small = build_at(120);
1201        // Seed at the well-conditioned (small-length-scale) window end — the
1202        // κ-optimum's neighbourhood, guaranteed to be at the window-maximal rank
1203        // (the synthetic radial design's rank rises toward large ψ). The floor is
1204        // the lower edge of the maximal-rank band reaching this seed.
1205        let seed = psi_hi;
1206        let floor_small = t_small.rank_stable_psi_floor(seed);
1207
1208        // (a) the band does not reach psi_lo → a floor is returned, strictly inside.
1209        let floor = floor_small.expect("collapsing-rank design must lift the floor off psi_lo");
1210        assert!(
1211            floor > psi_lo && floor <= seed,
1212            "floor {floor} must lie in (psi_lo {psi_lo}, seed {seed}]"
1213        );
1214
1215        // (b) below the floor the Gram is rank-deficient relative to the seed; at/
1216        // above the floor it holds the window-maximal rank. Verify the rank at the
1217        // floor equals the rank at the seed, and the rank just below the floor is
1218        // strictly lower (the floor is a genuine rank edge, not an interior node).
1219        let rank_at = |psi: f64| t_small.gram_numerical_rank(psi).unwrap();
1220        let max_rank = rank_at(seed);
1221        assert_eq!(
1222            rank_at(floor),
1223            max_rank,
1224            "the floor must sit at the window-maximal rank"
1225        );
1226        let probe_below = floor - 0.25;
1227        if t_small.contains(probe_below) {
1228            assert!(
1229                rank_at(probe_below) < max_rank,
1230                "rank just below the floor ({}) must drop under the band rank {max_rank}",
1231                rank_at(probe_below)
1232            );
1233        }
1234
1235        // (c) n-independence: the floor from a 5× larger build must match to grid
1236        // resolution. The tensor is certified to the same Chebyshev tolerance, so
1237        // the k-space rank structure — hence the floor — is the same object.
1238        let t_big = build_at(1000);
1239        let floor_big = t_big
1240            .rank_stable_psi_floor(seed)
1241            .expect("the rank cliff is an n-free property; the big build must also lift");
1242        let grid_step = (psi_hi - psi_lo) / 95.0; // NODES - 1 = 95
1243        assert!(
1244            (floor_small.unwrap() - floor_big).abs() <= 1.5 * grid_step,
1245            "rank-stable floor must be n-independent: n=200 → {}, n=4000 → {floor_big} \
1246             (grid step {grid_step})",
1247            floor_small.unwrap()
1248        );
1249
1250        // A genuinely full-rank, well-conditioned design across the window needs no
1251        // lift → None. `synth_full_rank_design` requires an even k.
1252        let n = 200usize;
1253        let kk = 6usize;
1254        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1255        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.29).cos()));
1256        let full = PsiGramTensor::build(
1257            |psi| synth_full_rank_design(psi, n, kk),
1258            w.view(),
1259            z.view(),
1260            psi_lo,
1261            psi_hi,
1262        )
1263        .expect("full-rank design must certify");
1264        assert!(
1265            full.rank_stable_psi_floor(seed).is_none(),
1266            "a window-wide full-rank design must not lift the floor"
1267        );
1268    }
1269
1270    /// #1033 (rank-stable κ-CEILING): the symmetric twin of the floor test. The
1271    /// `synth_design` radial columns `(1+s)e^{-s}` with `s = r·e^ψ` collapse at the
1272    /// HIGH ψ edge — every column decays toward zero as `s→∞`, so the conditioned
1273    /// Gram drops rank near `psi_hi`. `rank_stable_psi_ceiling` must (a) detect that
1274    /// the maximal-rank band does NOT reach `psi_hi` and return a ceiling strictly
1275    /// inside the window, (b) report that ceiling as the upper edge of the band
1276    /// containing the seed (rank at the ceiling = window-maximal, rank just above it
1277    /// strictly lower), and (c) be a pure k-space property — IDENTICAL whether built
1278    /// from few or many rows (the n-independence the κ outer loop relies on). A
1279    /// design full-rank up to `psi_hi` must return `None` (no clamp needed). This is
1280    /// the regression guard for the n=16000 fast-ladder resets: the κ line search
1281    /// overshot above the band to a rank-deficient ψ and tripped two O(n) resets.
1282    #[test]
1283    fn rank_stable_psi_ceiling_is_inside_window_and_n_independent() {
1284        let k = 7usize;
1285        // `synth_design`'s radial Gram rank RISES with ψ (the columns separate as
1286        // s = r·e^ψ grows), so it collapses at the LOW edge — the floor's setting.
1287        // To exercise the CEILING we feed the ψ-REFLECTED design `synth_design(-ψ)`,
1288        // whose rank instead collapses at the HIGH edge (rank 7→3 as ψ→psi_hi),
1289        // exactly the high-edge degeneracy the κ-ceiling guards against. Seed at a
1290        // window-maximal-rank node (located by a coarse scan, not assumed at an
1291        // edge); the ceiling is the upper edge of the maximal-rank band.
1292        let (psi_lo, psi_hi) = (-2.6_f64, 1.0_f64);
1293        let build_at = |n: usize| {
1294            let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1295            let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.41).cos()));
1296            PsiGramTensor::build(|psi| synth_design(-psi, n, k), w.view(), z.view(), psi_lo, psi_hi)
1297                .expect("analytic synthetic design must certify")
1298        };
1299
1300        let t_small = build_at(120);
1301        let rank_at = |psi: f64| t_small.gram_numerical_rank(psi).unwrap();
1302        let scan: Vec<(f64, usize)> = (0..96)
1303            .map(|i| {
1304                let p = psi_lo + (psi_hi - psi_lo) * (i as f64) / 95.0;
1305                (p, rank_at(p))
1306            })
1307            .collect();
1308        let window_max_rank = scan.iter().map(|&(_, r)| r).max().unwrap();
1309        let seed = scan
1310            .iter()
1311            .find(|&&(_, r)| r == window_max_rank)
1312            .map(|&(p, _)| p)
1313            .expect("some node must hold the window-maximal rank");
1314        let ceil_small = t_small.rank_stable_psi_ceiling(seed);
1315
1316        // (a) the band does not reach psi_hi → a ceiling is returned, strictly inside.
1317        let ceiling =
1318            ceil_small.expect("high-edge-collapsing design must clamp the ceiling off psi_hi");
1319        assert!(
1320            ceiling < psi_hi && ceiling >= seed,
1321            "ceiling {ceiling} must lie in [seed {seed}, psi_hi {psi_hi})"
1322        );
1323
1324        // (b) at/below the ceiling the Gram holds the window-maximal rank; above it
1325        // the rank drops. The ceiling is a genuine rank edge, not an interior node.
1326        let max_rank = window_max_rank;
1327        assert_eq!(
1328            rank_at(seed),
1329            max_rank,
1330            "the seed must sit at the window-maximal rank"
1331        );
1332        assert_eq!(
1333            rank_at(ceiling),
1334            max_rank,
1335            "the ceiling must sit at the window-maximal rank"
1336        );
1337        let probe_above = ceiling + 0.25;
1338        if t_small.contains(probe_above) {
1339            assert!(
1340                rank_at(probe_above) < max_rank,
1341                "rank just above the ceiling ({}) must drop under the band rank {max_rank}",
1342                rank_at(probe_above)
1343            );
1344        }
1345
1346        // (c) n-independence: the ceiling from a larger build matches to grid
1347        // resolution — the rank cliff is an n-free k-space property.
1348        let t_big = build_at(1000);
1349        let ceil_big = t_big
1350            .rank_stable_psi_ceiling(seed)
1351            .expect("the rank cliff is an n-free property; the big build must also clamp");
1352        let grid_step = (psi_hi - psi_lo) / 95.0; // NODES - 1 = 95
1353        assert!(
1354            (ceil_small.unwrap() - ceil_big).abs() <= 1.5 * grid_step,
1355            "rank-stable ceiling must be n-independent: n=120 → {}, n=1000 → {ceil_big} \
1356             (grid step {grid_step})",
1357            ceil_small.unwrap()
1358        );
1359
1360        // A genuinely full-rank design across the window needs no clamp → None.
1361        let n = 200usize;
1362        let kk = 6usize;
1363        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1364        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.23).sin()));
1365        let full = PsiGramTensor::build(
1366            |psi| synth_full_rank_design(psi, n, kk),
1367            w.view(),
1368            z.view(),
1369            psi_lo,
1370            psi_hi,
1371        )
1372        .expect("full-rank design must certify");
1373        assert!(
1374            full.rank_stable_psi_ceiling(seed).is_none(),
1375            "a window-wide full-rank design must not clamp the ceiling"
1376        );
1377    }
1378
1379    /// #1033 n-independence invariant (structural, build-free, bit-tight):
1380    /// after the one-time `build` n-pass, EVERY per-trial accessor the certified
1381    /// κ/ψ outer-loop hot path consumes — the value `(gram_at, rhs_at)`, the
1382    /// gradient `(dgram_dpsi, drhs_dpsi)`, the Hessian-channel curvature
1383    /// `(d2gram_dpsi2, d2rhs_dpsi2)`, and the inner-solver bridge
1384    /// `gaussian_fixed_cache_at` — must touch ZERO data rows. We prove this by
1385    /// instrumenting the `eval_design` closure with an invocation counter (the
1386    /// closure is the ONLY route to the n×k design): the counter advances during
1387    /// `build` (the certified node ladder + off-node spot checks) and must then
1388    /// stay FROZEN across an entire ψ-trial sweep. This is the
1389    /// "no surface rebuild / no n×k re-realization on a cache-hit trial"
1390    /// invariant the outer-loop seam (`SpatialJointContext::eval_full`,
1391    /// `skip_design_realization`) relies on — asserted here at the tensor source
1392    /// of truth, independent of whether the full κ-fit converges or of any
1393    /// wall-clock measurement.
1394    #[test]
1395    fn psi_gram_tensor_trial_accessors_touch_no_data_rows() {
1396        use std::cell::Cell;
1397
1398        let (n, k) = (256usize, 6usize);
1399        let w = Array1::from_iter((0..n).map(|i| 0.8 + 0.4 * ((i % 4) as f64) / 3.0));
1400        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.21).sin() + 0.2));
1401        let (psi_lo, psi_hi) = (-1.1_f64, 0.95_f64);
1402
1403        // The closure is the SOLE path to the n×k design; count every call.
1404        let calls = Cell::new(0usize);
1405        let tensor = PsiGramTensor::build(
1406            |psi| {
1407                calls.set(calls.get() + 1);
1408                synth_design(psi, n, k)
1409            },
1410            w.view(),
1411            z.view(),
1412            psi_lo,
1413            psi_hi,
1414        )
1415        .expect("analytic synthetic design must certify");
1416
1417        // The one-time build necessarily streamed the design at the Chebyshev
1418        // nodes plus off-node spot checks. Freeze the count.
1419        let build_calls = calls.get();
1420        assert!(
1421            build_calls > 0,
1422            "build must have streamed the design at least once (sanity)"
1423        );
1424
1425        // A dense ψ-trial sweep strictly inside the certified window. Every
1426        // accessor below is what a per-trial outer-loop eval consumes.
1427        let m = 64usize;
1428        let lo = psi_lo + 0.05;
1429        let hi = psi_hi - 0.05;
1430        for i in 0..m {
1431            let psi = lo + (hi - lo) * (i as f64) / (m as f64 - 1.0);
1432            assert!(tensor.contains(psi));
1433            // Value lane.
1434            let _g = tensor.gram_at(psi);
1435            let _r = tensor.rhs_at(psi);
1436            // Gradient lane.
1437            let _dg = tensor.dgram_dpsi(psi);
1438            let _dr = tensor.drhs_dpsi(psi);
1439            // Hessian-channel curvature.
1440            let _d2g = tensor.d2gram_dpsi2(psi);
1441            let _d2r = tensor.d2rhs_dpsi2(psi);
1442            // Inner-solver bridge (the GaussianFixedCache the PLS fast path reads).
1443            let _cache = tensor.gaussian_fixed_cache_at(psi);
1444        }
1445
1446        assert_eq!(
1447            calls.get(),
1448            build_calls,
1449            "n-independence VIOLATED: a per-trial accessor re-streamed the n×k \
1450             design ({} extra eval_design calls across {m} ψ-trials). The certified \
1451             κ/ψ outer loop must serve value + gradient + Hessian curvature + the \
1452             inner-solver cache from k-space sufficient statistics only, with NO \
1453             per-trial n-row work.",
1454            calls.get() - build_calls
1455        );
1456    }
1457
1458    /// #1033 Hessian-channel primitive gate: the n-free second ψ-derivatives
1459    /// `d2gram_dpsi2` / `d2rhs_dpsi2` must match central FD of the analytic FIRST
1460    /// derivatives (`dgram_dpsi` / `drhs_dpsi`) — the curvature the outer Newton
1461    /// /ARC step reads when the Gaussian Hessian channel is served from the
1462    /// tensor instead of a re-streamed O(n) slab. Differencing the analytic first
1463    /// derivative (not the exact gram) keeps this a pure check of the
1464    /// second-derivative recurrence, isolated from the build's value-lane tol.
1465    #[test]
1466    fn psi_gram_tensor_second_derivative_matches_fd_of_gradient() {
1467        let (n, k) = (160usize, 7usize);
1468        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1469        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.37).sin()));
1470        let (psi_lo, psi_hi) = (-1.2_f64, 1.0_f64);
1471
1472        let tensor = PsiGramTensor::build(
1473            |psi| synth_design(psi, n, k),
1474            w.view(),
1475            z.view(),
1476            psi_lo,
1477            psi_hi,
1478        )
1479        .expect("analytic synthetic design must certify");
1480
1481        let h = 1e-5;
1482        for &psi in &[-1.0, -0.5, 0.0, 0.4, 0.9] {
1483            // ∂²G/∂ψ² vs central FD of the analytic ∂G/∂ψ.
1484            let dg_plus = tensor.dgram_dpsi(psi + h);
1485            let dg_minus = tensor.dgram_dpsi(psi - h);
1486            let d2g = tensor.d2gram_dpsi2(psi);
1487            let gscale = d2g.iter().fold(0.0_f64, |a, &v| a.max(v.abs())).max(1e-9);
1488            for ((a, p), m_) in d2g.iter().zip(dg_plus.iter()).zip(dg_minus.iter()) {
1489                let fd = (p - m_) / (2.0 * h);
1490                assert!(
1491                    (a - fd).abs() <= 1e-4 * gscale,
1492                    "d2gram/dpsi2 mismatch at psi={psi}: analytic={a}, fd={fd}"
1493                );
1494            }
1495            // ∂²(XᵀWz)/∂ψ² vs central FD of the analytic ∂(XᵀWz)/∂ψ.
1496            let dr_plus = tensor.drhs_dpsi(psi + h);
1497            let dr_minus = tensor.drhs_dpsi(psi - h);
1498            let d2r = tensor.d2rhs_dpsi2(psi);
1499            let rscale = d2r.iter().fold(0.0_f64, |a, &v| a.max(v.abs())).max(1e-9);
1500            for ((a, p), m_) in d2r.iter().zip(dr_plus.iter()).zip(dr_minus.iter()) {
1501                let fd = (p - m_) / (2.0 * h);
1502                assert!(
1503                    (a - fd).abs() <= 1e-4 * rscale,
1504                    "d2rhs/dpsi2 mismatch at psi={psi}: analytic={a}, fd={fd}"
1505                );
1506            }
1507        }
1508    }
1509
1510    /// The penalized Gaussian profile deviance at a fixed ridge λ, assembled
1511    /// PURELY from the sufficient-statistic triple `(G, r, c) = (XᵀWX, XᵀWz, zᵀWz)`:
1512    ///
1513    /// ```text
1514    ///   β(λ) = (G + λS)⁻¹ r,   D(ψ;λ) = c − 2 βᵀr + βᵀ(G + λS)β = c − βᵀr
1515    /// ```
1516    ///
1517    /// (the second equality uses the normal equations `(G + λS)β = r`). This is
1518    /// EXACTLY the object the inner Gaussian PLS minimizes over β, and it is a
1519    /// pure function of `(G, r, c)` — n-free. Returns `(D, β)` so the caller can
1520    /// also probe the coefficient lane. `s_ridge` is the ridge penalty matrix.
1521    fn profile_deviance(
1522        g: &Array2<f64>,
1523        r: &Array1<f64>,
1524        c: f64,
1525        s_ridge: &Array2<f64>,
1526        lambda: f64,
1527        k: usize,
1528    ) -> (f64, Array1<f64>) {
1529        // Dense (G + λS) β = r via partial-pivot Gauss elimination (small k).
1530        let mut a = g.clone();
1531        a.scaled_add(lambda, s_ridge);
1532        let mut aug = Array2::<f64>::zeros((k, k + 1));
1533        aug.slice_mut(ndarray::s![.., ..k]).assign(&a);
1534        aug.slice_mut(ndarray::s![.., k]).assign(r);
1535        for col in 0..k {
1536            let piv = (col..k)
1537                .max_by(|&p, &q| aug[[p, col]].abs().total_cmp(&aug[[q, col]].abs()))
1538                .unwrap();
1539            if piv != col {
1540                for j in 0..=k {
1541                    let tmp = aug[[col, j]];
1542                    aug[[col, j]] = aug[[piv, j]];
1543                    aug[[piv, j]] = tmp;
1544                }
1545            }
1546            let p = aug[[col, col]];
1547            for row in 0..k {
1548                if row == col {
1549                    continue;
1550                }
1551                let f = aug[[row, col]] / p;
1552                for j in col..=k {
1553                    aug[[row, j]] -= f * aug[[col, j]];
1554                }
1555            }
1556        }
1557        let beta = Array1::from_iter((0..k).map(|i| aug[[i, k]] / aug[[i, i]]));
1558        let deviance = c - beta.dot(r);
1559        (deviance, beta)
1560    }
1561
1562    /// #1033 bit-tight Hessian + κ-optimum gate. The fast path's promise is not
1563    /// merely that the Gram VALUE matches at sampled ψ — it is that the WHOLE
1564    /// outer κ search (objective, its ψ-curvature, and therefore the located
1565    /// optimum) is reproduced by the n-free sufficient-statistic representation
1566    /// to machine precision. This harness certifies exactly that:
1567    ///
1568    ///   1. **Objective**: the penalized profile deviance `D(ψ)` assembled from
1569    ///      the tensor's `(gram_at, rhs_at, zᵀWz)` matches the exactly streamed
1570    ///      `XᵀWX/XᵀWz/zᵀWz` deviance bit-tight at every ψ on a fine grid.
1571    ///   2. **Curvature (Hessian)**: the second ψ-derivative `D''(ψ)` of the
1572    ///      fast-path objective matches the second ψ-derivative of the EXACT
1573    ///      objective (central FD of the streamed deviance) — the curvature the
1574    ///      outer Newton step reads must be the true curvature, not an
1575    ///      approximation that drifts off the value (the objective↔gradient
1576    ///      desync class, now extended to the second order).
1577    ///   3. **κ-optimum**: the argmin of `D(ψ)` over the grid is IDENTICAL
1578    ///      between the two assemblies — the fast path lands on the same κ as the
1579    ///      exact streamed search, to the grid resolution AND bit-tight in the
1580    ///      objective value at that node.
1581    #[test]
1582    fn psi_gram_tensor_bit_tight_hessian_and_kappa_optimum() {
1583        let (n, k) = (200usize, 6usize);
1584        // Heterogeneous weights + a response with genuine ψ-dependent curvature
1585        // so the deviance has a non-degenerate interior minimum in ψ.
1586        let w = Array1::from_iter((0..n).map(|i| 0.7 + 0.6 * (((i * 7) % 5) as f64) / 4.0));
1587        let z = Array1::from_iter((0..n).map(|i| {
1588            let t = (i as f64) / (n as f64 - 1.0);
1589            (3.0 * t).sin() + 0.3 * (7.0 * t).cos()
1590        }));
1591        let (psi_lo, psi_hi) = (-1.0_f64, 0.9_f64);
1592        // Fixed ridge λ over the search — the κ optimizer profiles ψ at fixed
1593        // smoothing here; identity-S ridge keeps the profile well-posed.
1594        let s_ridge = Array2::<f64>::eye(k);
1595        let lambda = 0.5_f64;
1596
1597        let tensor = PsiGramTensor::build(
1598            |psi| synth_design(psi, n, k),
1599            w.view(),
1600            z.view(),
1601            psi_lo,
1602            psi_hi,
1603        )
1604        .expect("analytic synthetic design must certify");
1605
1606        let exact_ztwz: f64 = w.iter().zip(z.iter()).map(|(&wi, &zi)| wi * zi * zi).sum();
1607
1608        // Exact streamed deviance at arbitrary ψ — the ground truth the n-free
1609        // path must reproduce.
1610        let exact_deviance = |psi: f64| -> f64 {
1611            let design = synth_design(psi, n, k).unwrap();
1612            let mut wd = design.clone();
1613            for (mut row, &wi) in wd.outer_iter_mut().zip(w.iter()) {
1614                row.mapv_inplace(|v| v * wi);
1615            }
1616            let g = design.t().dot(&wd);
1617            let r = wd.t().dot(&z);
1618            profile_deviance(&g, &r, exact_ztwz, &s_ridge, lambda, k).0
1619        };
1620
1621        // Fast n-free deviance from the certified tensor.
1622        let fast_deviance = |psi: f64| -> f64 {
1623            let g = tensor.gram_at(psi);
1624            let r = tensor.rhs_at(psi);
1625            profile_deviance(&g, &r, exact_ztwz, &s_ridge, lambda, k).0
1626        };
1627
1628        // Dense grid strictly inside the certified window (away from the edges,
1629        // where the build's value lane is still certified but we want a clean
1630        // central-FD second derivative to exist on both sides).
1631        let m = 81usize;
1632        let lo = psi_lo + 0.06;
1633        let hi = psi_hi - 0.06;
1634        let grid: Vec<f64> = (0..m)
1635            .map(|i| lo + (hi - lo) * (i as f64) / (m as f64 - 1.0))
1636            .collect();
1637
1638        // (1) Objective bit-tight across the whole grid; track argmin on both.
1639        let mut worst_value_rel = 0.0_f64;
1640        let (mut fast_argmin, mut fast_min) = (f64::NAN, f64::INFINITY);
1641        let (mut exact_argmin, mut exact_min) = (f64::NAN, f64::INFINITY);
1642        for &psi in &grid {
1643            let de = exact_deviance(psi);
1644            let df = fast_deviance(psi);
1645            let rel = (de - df).abs() / de.abs().max(1e-300);
1646            worst_value_rel = worst_value_rel.max(rel);
1647            if df < fast_min {
1648                fast_min = df;
1649                fast_argmin = psi;
1650            }
1651            if de < exact_min {
1652                exact_min = de;
1653                exact_argmin = psi;
1654            }
1655        }
1656        assert!(
1657            worst_value_rel <= 1e-9,
1658            "penalized profile deviance: fast n-free assembly diverged from exact \
1659             streamed by rel {worst_value_rel:.3e} (> 1e-9) somewhere on the ψ grid"
1660        );
1661
1662        // (3) κ-optimum: identical grid node AND bit-tight value there. The
1663        // argmin must be a true interior minimum (not a window edge) for this to
1664        // certify the OUTER search rather than a boundary artifact.
1665        assert_eq!(
1666            fast_argmin.to_bits(),
1667            exact_argmin.to_bits(),
1668            "κ-optimum mismatch: fast argmin ψ={fast_argmin}, exact argmin ψ={exact_argmin} \
1669             — the n-free objective located a different optimum"
1670        );
1671        assert!(
1672            fast_argmin > lo + 1e-9 && fast_argmin < hi - 1e-9,
1673            "κ-optimum landed on the grid edge ψ={fast_argmin}; the fixture must have \
1674             an INTERIOR minimum for this to test the outer search, not a boundary"
1675        );
1676        let opt_rel = (exact_min - fast_min).abs() / exact_min.abs().max(1e-300);
1677        assert!(
1678            opt_rel <= 1e-9,
1679            "κ-optimum objective value drift at ψ={fast_argmin}: fast={fast_min}, \
1680             exact={exact_min}, rel={opt_rel:.3e}"
1681        );
1682
1683        // (2) Gradient + curvature from the tensor's ANALYTIC ψ-derivatives.
1684        //
1685        // Differencing two objectives that agree only to ~1e-9 in VALUE cannot
1686        // certify their curvature: the central second difference divides by h²,
1687        // so the ~1e-9 value gap (which is NOT common-mode — they are different
1688        // assemblies) is amplified by 1/h² and swamps any real curvature signal.
1689        // The principled bit-tight curvature check uses the tensor's OWN analytic
1690        // ψ-derivatives `dgram_dpsi`/`drhs_dpsi`: the envelope gradient of the
1691        // profile deviance `D(ψ) = c − rᵀA⁻¹r`, `A = G + λS`, is
1692        //
1693        //   D'(ψ) = −2 βᵀ(∂r/∂ψ) + βᵀ(∂G/∂ψ)β,   β = A⁻¹r,
1694        //
1695        // assembled n-free from `(dgram_dpsi, drhs_dpsi)`. We certify this
1696        // analytic gradient against a central FD of the EXACT streamed objective
1697        // (first order ⇒ only 1/h amplification, so the ~1e-9 value agreement is
1698        // not destroyed), and certify the curvature by central-differencing the
1699        // ANALYTIC gradient (again 1/h, not 1/h²). This is the same one-
1700        // representation value↔gradient↔curvature consistency the production fast
1701        // path relies on for the outer Newton step.
1702        let solve_a = |g: &Array2<f64>, r: &Array1<f64>| -> Array1<f64> {
1703            profile_deviance(g, r, exact_ztwz, &s_ridge, lambda, k).1
1704        };
1705        // Analytic n-free ψ-gradient of the penalized profile deviance, valid on
1706        // the certified gradient sub-window where `dgram_dpsi` is bit-tight.
1707        let analytic_grad = |psi: f64| -> f64 {
1708            let g = tensor.gram_at(psi);
1709            let r = tensor.rhs_at(psi);
1710            let beta = solve_a(&g, &r);
1711            let dg = tensor.dgram_dpsi(psi);
1712            let dr = tensor.drhs_dpsi(psi);
1713            -2.0 * beta.dot(&dr) + beta.dot(&dg.dot(&beta))
1714        };
1715
1716        // Two finite-difference steps, each near the optimum of its own
1717        // truncation/rounding trade-off:
1718        //   * `h_grad = 1e-6` for the FIRST derivative (central FD ⇒ O(h²)
1719        //     truncation, O(ε/h) rounding ⇒ optimum near 1e-5..1e-6);
1720        //   * `h_curv = 2e-4` for the curvature. A SECOND difference divides by
1721        //     h², so its rounding floor is O(ε·|D|/h²): at h=1e-6 that is
1722        //     ~1e-16/1e-12 = 1e-4 of |D|, comparable to the curvature itself —
1723        //     useless. h≈2e-4 puts the rounding floor at ~1e-16/4e-8 ≈ 2.5e-9·|D|
1724        //     and the O(h²·D⁗) truncation around the same scale, so the second
1725        //     difference is meaningful. The analytic-gradient curvature is
1726        //     differenced at the SAME h_curv so the two carry the same
1727        //     truncation order and the comparison is apples-to-apples.
1728        let h_grad = 1e-6_f64;
1729        let h_curv = 2e-4_f64;
1730        let mut worst_grad_rel = 0.0_f64;
1731        let mut worst_hess_rel = 0.0_f64;
1732        let mut tested = 0usize;
1733        for &psi in &grid {
1734            // The exact-objective curvature stencil reaches ±2·h_curv; require the
1735            // whole stencil to stay inside the certified gradient sub-window so the
1736            // analytic-gradient differences are all bit-tight.
1737            if !tensor.contains_for_gradient(psi - 2.0 * h_curv)
1738                || !tensor.contains_for_gradient(psi + 2.0 * h_curv)
1739            {
1740                continue;
1741            }
1742            tested += 1;
1743            // Analytic gradient vs central FD of the EXACT streamed objective.
1744            let exact_g1 =
1745                (exact_deviance(psi + h_grad) - exact_deviance(psi - h_grad)) / (2.0 * h_grad);
1746            let ag = analytic_grad(psi);
1747            let gscale = exact_g1.abs().max(1e-6);
1748            worst_grad_rel = worst_grad_rel.max((exact_g1 - ag).abs() / gscale);
1749            // Curvature: central FD of the ANALYTIC gradient (n-free) vs central
1750            // second difference of the EXACT objective, both at h_curv.
1751            let analytic_h2 =
1752                (analytic_grad(psi + h_curv) - analytic_grad(psi - h_curv)) / (2.0 * h_curv);
1753            let exact_h2 = (exact_deviance(psi + h_curv) - 2.0 * exact_deviance(psi)
1754                + exact_deviance(psi - h_curv))
1755                / (h_curv * h_curv);
1756            let hscale = exact_h2.abs().max(1e-3);
1757            worst_hess_rel = worst_hess_rel.max((analytic_h2 - exact_h2).abs() / hscale);
1758        }
1759        assert!(
1760            tested > 0,
1761            "no ψ on the grid lay inside the certified gradient sub-window"
1762        );
1763        assert!(
1764            worst_grad_rel <= 1e-5,
1765            "ψ-gradient mismatch: the tensor's analytic n-free objective gradient diverged \
1766             from the exact streamed objective by rel {worst_grad_rel:.3e} (> 1e-5)"
1767        );
1768        // The curvature compares an analytic-gradient central difference against
1769        // an exact-objective second difference; the residual O(h²) truncation +
1770        // O(ε/h²) rounding floor at h_curv=2e-4 sets a realistic bit-tight bar of
1771        // ~1e-3 relative (any larger gap is a genuine curvature divergence, not FD
1772        // noise — the value/gradient lanes already certify the objective itself to
1773        // ~1e-9/1e-5).
1774        assert!(
1775            worst_hess_rel <= 1e-3,
1776            "ψ-curvature (Hessian) mismatch: fast n-free objective curvature diverged \
1777             from the exact streamed objective by rel {worst_hess_rel:.3e} (> 1e-3) — \
1778             the outer Newton step would read a different curvature than the truth"
1779        );
1780
1781        eprintln!(
1782            "[psi-gram-bittight] n={n} k={k} grid={m} grad-tested={tested}  \
1783             worst |ΔD|/D={worst_value_rel:.2e}  worst |ΔD'|/D'={worst_grad_rel:.2e}  \
1784             worst |ΔD''|/D''={worst_hess_rel:.2e}  κ-opt ψ={fast_argmin:.6} (interior, bit-identical)"
1785        );
1786    }
1787
1788    /// Certification negative: a NON-analytic (kinked) design must refuse to
1789    /// certify rather than silently approximate.
1790    #[test]
1791    fn psi_gram_tensor_refuses_non_analytic_design() {
1792        let (n, k) = (40usize, 3usize);
1793        let w = Array1::from_elem(n, 1.0);
1794        let z = Array1::from_elem(n, 0.5);
1795        let tensor = PsiGramTensor::build(
1796            |psi| {
1797                let mut x = Array2::<f64>::zeros((n, k));
1798                for i in 0..n {
1799                    for j in 0..k {
1800                        // |ψ| kink at 0 inside the window: not analytic.
1801                        x[[i, j]] = psi.abs() + (i + j) as f64 / (n + k) as f64;
1802                    }
1803                }
1804                Ok(x)
1805            },
1806            w.view(),
1807            z.view(),
1808            -1.0,
1809            1.0,
1810        );
1811        assert!(
1812            tensor.is_err(),
1813            "kinked design must fail the tail-decay/spot-check certificates"
1814        );
1815    }
1816
1817    /// #1264 reduced-basis-equality witness — REFLEXIVITY + GAUGE INVARIANCE.
1818    ///
1819    /// `reduced_basis_equal(ψ, ψ)` is trivially sound (the surface is its own
1820    /// reference), and the witness must accept two ψ's whose RANGE subspace is
1821    /// identical even when the per-ψ eigenvECTORS differ (the projector is
1822    /// gauge-invariant). The synthetic full-rank Matérn-shaped design's range is
1823    /// the whole k-space for every ψ, so every in-window pair shares a reduced
1824    /// basis and must certify.
1825    #[test]
1826    fn reduced_basis_witness_reflexive_and_gauge_invariant() {
1827        let (n, k) = (160usize, 6usize);
1828        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.3 * ((i % 5) as f64)));
1829        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.29).sin()));
1830        let (psi_lo, psi_hi) = (-1.0_f64, 0.8_f64);
1831        // Use the genuinely full-rank, well-conditioned design: its weighted Gram
1832        // has numerical rank `= k` at every ψ (range = whole k-space, identity
1833        // range projector), so the gauge-invariance premise actually holds. The
1834        // narrow-`r` `synth_design` does NOT satisfy this — its Gram is rank 3–4 of
1835        // 6 with a near-null subspace that ROTATES across the window, on which the
1836        // witness *correctly* refuses (refusing a rotating reduced basis is the
1837        // sound fallback the production skip gate exists for). See
1838        // `synth_full_rank_design`.
1839        let tensor = PsiGramTensor::build(
1840            |psi| synth_full_rank_design(psi, n, k),
1841            w.view(),
1842            z.view(),
1843            psi_lo,
1844            psi_hi,
1845        )
1846        .expect("analytic full-rank synthetic design must certify");
1847
1848        // PREMISE CHECK: the design is full column rank (numerical rank = k) and
1849        // the range projector is the identity at every grid ψ, so the test really
1850        // is exercising gauge invariance over a ψ-invariant subspace — not riding a
1851        // rank-deficient fixture the witness would (correctly) refuse.
1852        let grid: Vec<f64> = (0..=12).map(|i| psi_lo + 0.05 + 0.06 * i as f64).collect();
1853        let identity = Array2::<f64>::eye(k);
1854        for &psi in &grid {
1855            let (proj, rank) = tensor
1856                .range_projector(psi, PSI_GRAM_SKIP_RANK_RTOL)
1857                .expect("full-rank Gram must yield a range projector");
1858            assert_eq!(
1859                rank, k,
1860                "full-rank design must have numerical rank k={k} at psi={psi} \
1861                 (got {rank}) — otherwise the gauge-invariance premise is vacuous"
1862            );
1863            let proj_dev = (&proj - &identity)
1864                .iter()
1865                .fold(0.0_f64, |acc, &v| acc.max(v.abs()));
1866            assert!(
1867                proj_dev <= 1e-8,
1868                "range projector must be the identity at psi={psi} \
1869                 (max|P−I|={proj_dev:.2e})"
1870            );
1871        }
1872
1873        // GAUGE-INVARIANCE CHECK: the per-ψ eigenvectors genuinely rotate across
1874        // the window (so the witness is exercised against a moving gauge, not a
1875        // static one), yet the spanned subspace is identical. Confirm the rotation
1876        // is real by checking the leading eigenvector turns measurably end-to-end.
1877        let leading_evec = |psi: f64| -> Array1<f64> {
1878            use gam_linalg::faer_ndarray::FaerEigh;
1879            let g = tensor.gram_at(psi);
1880            let gsym = 0.5 * (&g + &g.t());
1881            let (evals, evecs) = gsym.eigh(faer::Side::Lower).unwrap();
1882            // `eigh` returns ascending eigenvalues; the leading one is the last.
1883            let top = evals.len() - 1;
1884            evecs.column(top).to_owned()
1885        };
1886        let v_lo = leading_evec(grid[0]);
1887        let v_hi = leading_evec(*grid.last().unwrap());
1888        let cos_angle = v_lo.dot(&v_hi).abs()
1889            / (v_lo.dot(&v_lo).sqrt() * v_hi.dot(&v_hi).sqrt()).max(1e-300);
1890        assert!(
1891            cos_angle <= 0.999,
1892            "the design's eigenvectors must rotate with ψ for the gauge-invariance \
1893             test to be non-trivial (|cos∠(v_lo,v_hi)|={cos_angle:.6} — too close to 1)"
1894        );
1895
1896        // Reflexive: same ψ is always sound.
1897        for &psi in &[-0.9, -0.2, 0.0, 0.5, 0.79] {
1898            assert!(
1899                tensor.reduced_basis_equal(psi, psi),
1900                "witness must be reflexive at psi={psi}"
1901            );
1902        }
1903        // The full-rank synthetic design spans all of k-space at every ψ, so the
1904        // range projector is the identity for all ψ → every pair certifies despite
1905        // the eigenvector rotation just verified (gauge invariance).
1906        for &a in &grid {
1907            for &b in &grid {
1908                assert!(
1909                    tensor.reduced_basis_equal(a, b),
1910                    "full-rank design: range is ψ-invariant (identity projector), \
1911                     so the skip witness must certify (ψ_ref={a}, ψ_new={b})"
1912                );
1913            }
1914        }
1915        // Off-window ψ refuses.
1916        assert!(!tensor.reduced_basis_equal(psi_lo - 0.5, 0.0));
1917        assert!(!tensor.reduced_basis_equal(0.0, psi_hi + 0.5));
1918    }
1919
1920    /// #1264 reduced-basis-equality witness — REFUSES across a genuine subspace
1921    /// change (the exact failure mode of the old RRQR-only gate).
1922    ///
1923    /// Construct a design whose first two columns are fixed (ψ-invariant) profiles
1924    /// and whose third column's AMPLITUDE `ε(ψ) = e^{αψ}` analytically sweeps the
1925    /// third eigendirection's eigenvalue `∝ ε²` across the rank-revealing cutoff.
1926    /// Below the cutoff the reduced (range) basis is the 2-D span of the first two
1927    /// profiles; above it the range is 3-D. Two ψ's on the SAME side of the
1928    /// threshold share a reduced basis (witness accepts); two ψ's STRADDLING it do
1929    /// not (witness refuses) — exactly the stale-basis pairing the design-revision
1930    /// fast path must not perform. The amplitude is smooth/analytic so the tensor
1931    /// still certifies (this is a reduced-basis change, not a non-analytic kink).
1932    #[test]
1933    fn reduced_basis_witness_refuses_across_subspace_change() {
1934        let (n, k) = (200usize, 3usize);
1935        // Three fixed, well-separated column profiles (full column rank when all
1936        // present). The third is scaled by ε(ψ).
1937        let base = |i: usize, j: usize| -> f64 {
1938            let t = (i as f64 + 0.5) / n as f64;
1939            match j {
1940                0 => 1.0,
1941                1 => (2.0 * std::f64::consts::PI * t).sin(),
1942                _ => (4.0 * std::f64::consts::PI * t).cos(),
1943            }
1944        };
1945        // ε(ψ) crosses √cutoff (relative to λ_max ~ O(n)) within the window: at
1946        // λ_max ≈ n the cutoff is rank_rtol·n ≈ 1e-10·200 = 2e-8, so the third
1947        // eigenvalue ε²·‖c3‖² ≈ ε²·(n/2) crosses it at ε ≈ sqrt(4e-8/n) ≈ 1.4e-5,
1948        // i.e. ψ* ≈ ln(1.4e-5)/α. With α = 10 and window [−1.6,−0.8], ψ* ≈ −1.12
1949        // sits inside the window, giving a clean below/above split.
1950        let alpha = 10.0_f64;
1951        let design = move |psi: f64| -> Result<Array2<f64>, String> {
1952            let eps = (alpha * psi).exp();
1953            let mut x = Array2::<f64>::zeros((n, k));
1954            for i in 0..n {
1955                x[[i, 0]] = base(i, 0);
1956                x[[i, 1]] = base(i, 1);
1957                x[[i, 2]] = eps * base(i, 2);
1958            }
1959            Ok(x)
1960        };
1961        let w = Array1::from_elem(n, 1.0);
1962        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.13).sin()));
1963        let (psi_lo, psi_hi) = (-1.6_f64, -0.8_f64);
1964        let tensor = PsiGramTensor::build(design, w.view(), z.view(), psi_lo, psi_hi)
1965            .expect("smooth ε(ψ) design must still certify (analytic, no kink)");
1966
1967        // Find the actual threshold by scanning the rank.
1968        let rank_at = |psi: f64| -> usize {
1969            tensor
1970                .range_projector(psi, PSI_GRAM_SKIP_RANK_RTOL)
1971                .map(|(_, r)| r)
1972                .unwrap_or(0)
1973        };
1974        let lo_rank = rank_at(psi_lo + 0.02);
1975        let hi_rank = rank_at(psi_hi - 0.02);
1976        assert_eq!(
1977            lo_rank, 2,
1978            "low-ψ end must be rank-2 (third column below cutoff)"
1979        );
1980        assert_eq!(
1981            hi_rank, 3,
1982            "high-ψ end must be rank-3 (third column above cutoff)"
1983        );
1984
1985        // Same-side pairs (both rank-2) certify; straddling pairs refuse.
1986        let psi_low_a = psi_lo + 0.05;
1987        let psi_low_b = psi_lo + 0.10;
1988        assert_eq!(rank_at(psi_low_a), 2);
1989        assert_eq!(rank_at(psi_low_b), 2);
1990        assert!(
1991            tensor.reduced_basis_equal(psi_low_a, psi_low_b),
1992            "two low-ψ trials share the rank-2 reduced basis → skip is sound"
1993        );
1994        let psi_high_a = psi_hi - 0.05;
1995        let psi_high_b = psi_hi - 0.10;
1996        assert_eq!(rank_at(psi_high_a), 3);
1997        assert_eq!(rank_at(psi_high_b), 3);
1998        // High-side: the range is the full 3-D space at both, so the projector is
1999        // the identity at both → still a shared reduced basis.
2000        assert!(
2001            tensor.reduced_basis_equal(psi_high_a, psi_high_b),
2002            "two high-ψ trials share the rank-3 reduced basis → skip is sound"
2003        );
2004        // Straddling the rank change: the reduced basis MOVED (2-D → 3-D). The
2005        // witness MUST refuse — this is precisely the stale-basis pairing the old
2006        // RRQR-only gate let through.
2007        assert!(
2008            !tensor.reduced_basis_equal(psi_low_a, psi_high_a),
2009            "witness must REFUSE a skip that straddles the reduced-basis (rank) \
2010             change — freezing the low-ψ rank-2 basis and re-keying the high-ψ \
2011             rank-3 Gram is the exact ~7.8e-2 β̂ regression #1264 guards"
2012        );
2013        assert!(
2014            !tensor.reduced_basis_equal(psi_high_a, psi_low_a),
2015            "witness must refuse symmetrically (high pin, low trial)"
2016        );
2017    }
2018
2019    /// #1033 ROTATION WALL — the subspace-distance certificate must CERTIFY a
2020    /// skip across a pure basis ROTATION at fixed rank, where the old entrywise
2021    /// max-abs projector gate would have refused.
2022    ///
2023    /// Build a rank-2 (in a k=3 space) design whose 2-D range ROTATES smoothly
2024    /// with ψ but whose RANK stays 2: two ψ-dependent in-plane directions span the
2025    /// same fixed 2-plane (cols 0,1 of a fixed orthonormal pair) rotated by an
2026    /// analytic angle φ(ψ). The SUBSPACE (the 2-plane) is ψ-invariant — only the
2027    /// basis within it rotates — so the range projector is mathematically
2028    /// IDENTICAL at every ψ, but its eigenVECTORS rotate. A correct
2029    /// subspace-identity witness must certify every in-window pair; the spectral
2030    /// (principal-angle) distance is ~0 throughout while a naive entrywise
2031    /// comparison of rotated eigenbases would not be guaranteed to.
2032    #[test]
2033    fn reduced_basis_witness_certifies_across_pure_rotation_1033() {
2034        let (n, k) = (240usize, 3usize);
2035        // Two fixed orthogonal ambient profiles spanning a fixed 2-plane; the
2036        // third ambient direction is left empty so the range is exactly that
2037        // 2-plane (rank 2) for every ψ.
2038        let p0 = |i: usize| -> f64 {
2039            let t = (i as f64 + 0.5) / n as f64;
2040            (2.0 * std::f64::consts::PI * t).sin()
2041        };
2042        let p1 = |i: usize| -> f64 {
2043            let t = (i as f64 + 0.5) / n as f64;
2044            (2.0 * std::f64::consts::PI * t).cos()
2045        };
2046        // Within the fixed 2-plane, rotate the two design columns by φ(ψ): the
2047        // SPAN is unchanged (still the {p0,p1} plane) but the basis rotates, so
2048        // the per-ψ eigenvectors of the Gram rotate while the range projector is
2049        // ψ-invariant.
2050        let design = move |psi: f64| -> Result<Array2<f64>, String> {
2051            let phi = 0.7 * psi; // analytic angle sweep
2052            let (c, s) = (phi.cos(), phi.sin());
2053            let mut x = Array2::<f64>::zeros((n, k));
2054            for i in 0..n {
2055                let (a, b) = (p0(i), p1(i));
2056                x[[i, 0]] = c * a - s * b;
2057                x[[i, 1]] = s * a + c * b;
2058                // column 2 stays zero → range is the fixed 2-plane, rank 2.
2059            }
2060            Ok(x)
2061        };
2062        let w = Array1::from_elem(n, 1.0);
2063        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.17).cos()));
2064        let (psi_lo, psi_hi) = (-1.0_f64, 1.0_f64);
2065        let tensor = PsiGramTensor::build(design, w.view(), z.view(), psi_lo, psi_hi)
2066            .expect("smooth rotation design must certify (analytic, no kink)");
2067
2068        // Rank is a constant 2 across the window (the third direction is empty).
2069        let rank_at = |psi: f64| -> usize {
2070            tensor
2071                .range_projector(psi, PSI_GRAM_SKIP_RANK_RTOL)
2072                .map(|(_, r)| r)
2073                .unwrap_or(0)
2074        };
2075        for &psi in &[-0.95, -0.4, 0.0, 0.4, 0.95] {
2076            assert_eq!(rank_at(psi), 2, "rotation keeps rank 2 at psi={psi}");
2077        }
2078
2079        // Every in-window pair spans the SAME 2-plane (only the basis rotates),
2080        // so the subspace-distance witness MUST certify the skip — this is the
2081        // rotation that the entrywise gate kept refusing (the #1033 wall).
2082        let grid: Vec<f64> = (0..=10).map(|i| psi_lo + 0.05 + 0.09 * i as f64).collect();
2083        for &a in &grid {
2084            for &b in &grid {
2085                assert!(
2086                    tensor.reduced_basis_equal(a, b),
2087                    "pure in-plane rotation preserves the range subspace → the \
2088                     subspace-distance skip witness must certify (#1033) \
2089                     (ψ_ref={a}, ψ_new={b})"
2090                );
2091            }
2092        }
2093    }
2094}