Skip to main content

gam_solve/
psi_gram_tensor.rs

1//! Certified Chebyshev-in-ψ Gram tensor: n-independent design-moving trials
2//! (#1033 item b).
3//!
4//! ## Why
5//!
6//! When a design-moving hyperparameter ψ (= log κ for the radial families) is
7//! searched by the outer loop, every trial today rebuilds the n×k design and
8//! re-forms XᵀWX — an O(n·k) + O(n·k²) pass per trial. But along the trial
9//! window every design entry `X(ψ)[i, j]` is an ANALYTIC function of ψ on a
10//! compact interval (Matérn channels depend on (r, ℓ) only through κr and
11//! κ-power prefactors; Duchon power blocks are ψ-free; partial-fraction
12//! coefficients are analytic scalars), so both the design and its Gaussian
13//! sufficient statistics admit geometrically-convergent Chebyshev expansions
14//!
15//! ```text
16//!   X(ψ) = Σ_{d=0}^{D} X_d · T_d(ψ̃),     ψ̃ = affine map of ψ to [−1, 1],
17//! ```
18//!
19//! with n×k coefficient slabs `X_d` computed ONCE from D+1 exact design
20//! evaluations at Chebyshev nodes (a first-kind DCT). The design coefficients
21//! certify analyticity, while the shipped runtime series is fit directly to the
22//! exact node sufficient statistics `G(ψ_i)=X(ψ_i)ᵀ W X(ψ_i)` and
23//! `c(ψ_i)=X(ψ_i)ᵀ W z`. Interpolating the sufficient statistics directly avoids
24//! the extra product-truncation residual from forming `Σ_d,e X_dᵀWX_e T_dT_e`,
25//! which a weakly penalized radial solve can amplify into visible β̂ drift.
26//! Every subsequent trial is n-free:
27//!
28//! ```text
29//!   XᵀWX(ψ) = Σ_d T_d(ψ̃) G_d          O(D k²)
30//!   XᵀWz(ψ) = Σ_d T_d(ψ̃) c_d          O(D k)
31//!   ∂/∂ψ (XᵀWX) = Σ_d T_d′(ψ̃) G_d     O(D k²)
32//! ```
33//!
34//! The ψ-gradient comes from the SAME representation as the value — one
35//! source of truth, structurally immune to the objective↔gradient desync
36//! class. `T_d′(ψ̃) = d·U_{d−1}(ψ̃) · dψ̃/dψ` is closed-form.
37//!
38//! ## Certification, not approximation-by-fiat
39//!
40//! Same discipline as [`gam_terms::basis::radial_profile`]: [`PsiGramTensor::build`]
41//! returns `None` (callers keep the exact per-trial path) unless BOTH
42//! 1. the Chebyshev coefficient tail of the EXPANDED DESIGN decays below
43//!    [`PSI_GRAM_CERT_RTOL`] of the design scale (geometric-decay certificate
44//!    for analytic interpolands, with node-count escalation), and
45//! 2. deterministic off-node spot checks of the ASSEMBLED Gram against an
46//!    exactly rebuilt Gram agree to [`PSI_GRAM_SPOT_RTOL`].
47//!
48//! Trials outside `[psi_lo, psi_hi]` are the caller's signal to fall back to
49//! the exact path ([`PsiGramTensor::contains`]).
50
51use ndarray::{Array1, Array2, ArrayView1};
52
53/// Relative ceiling on the per-column Chebyshev coefficient tail (#1216).
54///
55/// This is a cheap NECESSARY-CONDITION pre-filter, not the accuracy gate: the
56/// authoritative accuracy gate is the off-node `spot_check` on the ASSEMBLED
57/// Gram ([`PSI_GRAM_SPOT_RTOL`]). On the WIDE STANDARDIZED geometry default 1-D
58/// fits use (#1215) the realized radial design needs the deeper ladder below to
59/// drive the tail beneath the beta-invariance bar. Keep this as a necessary
60/// pre-filter, not the final beta oracle: shallow 65-node tensors were fine for
61/// cost-only gates, but the weakly penalized radial solve amplified their
62/// residual into visible beta-hat drift across the reduced-basis rotation. A
63/// genuinely non-analytic design (a true kink) still refuses here or at the
64/// assembled-Gram spot check.
65pub const PSI_GRAM_CERT_RTOL: f64 = 1.0e-9;
66
67/// Relative agreement required at the off-node Gram spot checks.
68pub const PSI_GRAM_SPOT_RTOL: f64 = 1.0e-10;
69
70/// Node-count escalation ladder for the expansion build (degree = nodes − 1).
71///
72/// The top rung sizes to WIDE trial windows: Chebyshev coefficients of the
73/// Matérn-type channels decay like Bessel `I_d(σ)` with `σ ≈ s_max·halfwidth`
74/// (s = κr), which only drops below the 1e-12 tail tolerance for `d ≳ 2σ` —
75/// e.g. σ ≈ 9 (s_max ≈ 8, ±1.1 window) needs degree ≳ 40, so 33 nodes refuse
76/// and 65 certify. Node counts stay trivially cheap (one design eval each).
77///
78/// On the WIDE STANDARDIZED geometry default 1-D fits use (#1215) the tail
79/// decays cleanly but GEOMETRICALLY-slowly: measured per-column worst tail rel
80/// is ~3.2e-8 at m=33 and ~2.3e-11 at m=65 (a clean ~1300×/doubling decay, NOT a
81/// floor). The old 65-node acceptance was fine for the cost lane but not for the
82/// beta-hat soundness gate: the inner penalized solve `β̂ = (G+λS)⁻¹r`
83/// amplifies Gram residuals by the radial-kernel conditioning, especially after
84/// the production skip was relaxed to cross a reduced-basis rotation. Start at
85/// 513 nodes so the production gate no longer accepts the shallower tensors that
86/// pass the Gram spot check but still move the weakly-penalized beta solve.
87pub const PSI_GRAM_NODE_LADDER: [usize; 1] = [513];
88
89/// Number of deterministic off-node spot-check ψ values.
90pub const PSI_GRAM_SPOT_POINTS: usize = 3;
91
92/// Rank-revealing relative eigenvalue cutoff for the reduced-basis (range)
93/// projector witness [`PsiGramTensor::reduced_basis_equal`] (#1264). An
94/// eigendirection of the conditioned Gram `XᵀWX(ψ)` is counted in the range
95/// (reduced) basis when its eigenvalue exceeds `PSI_GRAM_SKIP_RANK_RTOL · λ_max`.
96/// Sized to match the inner solve's effective rank-revealing scale on the
97/// standardized radial-kernel Gram, whose conditioning sweeps several orders of
98/// magnitude across the ψ-window; a directly-below-cutoff direction is exactly
99/// the one whose inclusion flips with ψ and silently rotates the frozen reduced
100/// basis, which this witness must catch.
101pub const PSI_GRAM_SKIP_RANK_RTOL: f64 = 1.0e-10;
102
103/// Max-norm tolerance on the range-PROJECTOR agreement between the pinning ψ and
104/// the candidate ψ in [`PsiGramTensor::reduced_basis_equal`] (#1264). The
105/// orthogonal projector onto the reduced subspace is gauge-invariant and O(1) in
106/// scale, so a tight absolute tolerance certifies the two reduced bases span the
107/// SAME subspace. A subspace that has measurably rotated (the basis the frozen
108/// fast-path surface would mis-pair with a re-keyed Gram) exceeds this by orders
109/// of magnitude, so it refuses the skip well before the ~1e-6 β̂ bar is at risk.
110pub const PSI_GRAM_SKIP_PROJ_ATOL: f64 = 1.0e-7;
111
112/// Certified Chebyshev-in-ψ expansion of a design-moving Gram (#1033b).
113///
114/// Holds the one-time Chebyshev sufficient-statistic series; every per-trial
115/// accessor is O(Dk²) or cheaper and never touches n rows again.
116pub struct PsiGramTensor {
117    psi_lo: f64,
118    psi_hi: f64,
119    /// Certified gradient window over which the ANALYTIC ψ-derivative
120    /// `dgram_dpsi` reproduces the exact design derivative. The gradient lane
121    /// rides the value-lane off-node certificate [`PSI_GRAM_SPOT_RTOL`]
122    /// (#1033b gradient lane). For the #1033
123    /// sufficient-statistic outer loop this must cover the full optimizer
124    /// window; otherwise callers do not arm the n-free kappa search.
125    ///
126    /// The value reconstruction `gram_at` is certified over the FULL window
127    /// (`T_d ≤ 1` everywhere), but the derivative reconstruction amplifies the
128    /// coefficient-tail error by `T_d′ ∼ d²`. The n-free kappa search is armed
129    /// only when endpoint-aware checks certify this whole interval.
130    grad_psi_lo: f64,
131    grad_psi_hi: f64,
132    /// Number of Chebyshev coefficients (degree + 1).
133    n_coeff: usize,
134    k: usize,
135    /// Chebyshev coefficients of `X(ψ)ᵀ W X(ψ)`, obtained by a first-kind DCT of
136    /// the exact node sufficient statistics. This keeps the per-trial path to a
137    /// single O(Dk²) series and avoids product-truncation drift in β̂.
138    gram: Vec<Array2<f64>>,
139    /// Chebyshev coefficients of `X(ψ)ᵀ W z`.
140    rhs: Vec<Array1<f64>>,
141    /// `zᵀWz` — ψ-free, captured at build so the Gaussian sufficient-statistic
142    /// triple can be assembled per trial without any row access.
143    zt_w_z: f64,
144}
145
146/// One ladder rung's outcome: a hard evaluation failure aborts the whole
147/// build (no larger rung can fix a non-finite design) and carries the reason,
148/// an uncertified tail escalates to the next rung, and a candidate proceeds to
149/// the spot check.
150enum BuildOutcome {
151    EvalFailed(String),
152    TailNotCertified,
153    Candidate(PsiGramTensor),
154}
155
156/// Chebyshev values `T_0..T_{n−1}` at `x ∈ [−1, 1]`.
157fn cheb_t(x: f64, n: usize) -> Vec<f64> {
158    let mut t = vec![0.0; n];
159    if n > 0 {
160        t[0] = 1.0;
161    }
162    if n > 1 {
163        t[1] = x;
164    }
165    for d in 2..n {
166        t[d] = 2.0 * x * t[d - 1] - t[d - 2];
167    }
168    t
169}
170
171/// Chebyshev derivative values `T_0′..T_{n−1}′` at `x ∈ [−1, 1]` in the
172/// MAPPED coordinate (multiply by `dx/dψ` for the ψ-derivative):
173/// `T_d′ = d · U_{d−1}` with the Chebyshev-U recurrence.
174fn cheb_t_prime(x: f64, n: usize) -> Vec<f64> {
175    let mut u = vec![0.0; n.max(1)];
176    // U_0 = 1, U_1 = 2x, U_d = 2x U_{d−1} − U_{d−2}.
177    if !u.is_empty() {
178        u[0] = 1.0;
179    }
180    if n > 1 {
181        u[1] = 2.0 * x;
182    }
183    for d in 2..n {
184        u[d] = 2.0 * x * u[d - 1] - u[d - 2];
185    }
186    let mut tp = vec![0.0; n];
187    for d in 1..n {
188        tp[d] = d as f64 * u[d - 1];
189    }
190    tp
191}
192
193/// Chebyshev SECOND-derivative values `T_0″..T_{n−1}″` at `x ∈ [−1, 1]` in the
194/// MAPPED coordinate (multiply by `(dx/dψ)²` for the ψ-second-derivative).
195///
196/// Differentiating the value recurrence `T_d = 2x T_{d−1} − T_{d−2}` twice in
197/// `x` gives a singularity-free three-term recurrence in lock-step with `cheb_t`
198/// / `cheb_t_prime`:
199///   `T_d′  = 2 T_{d−1} + 2x T_{d−1}′ − T_{d−2}′`,
200///   `T_d″  = 4 T_{d−1}′ + 2x T_{d−1}″ − T_{d−2}″`,
201/// with `T_0 = T_0′ = T_0″ = 0`-seeds as below. Unlike the closed form
202/// `T_n″ = n((n+1)T_n − U_n)/(x²−1)` this never divides by `x²−1`, so it stays
203/// exact at the window edges `x = ±1`.
204fn cheb_t_double_prime(x: f64, n: usize) -> Vec<f64> {
205    let mut t = vec![0.0; n];
206    let mut tp = vec![0.0; n];
207    let mut tpp = vec![0.0; n];
208    if n > 0 {
209        t[0] = 1.0; // T_0 = 1, T_0′ = T_0″ = 0
210    }
211    if n > 1 {
212        t[1] = x; // T_1 = x, T_1′ = 1, T_1″ = 0
213        tp[1] = 1.0;
214    }
215    for d in 2..n {
216        t[d] = 2.0 * x * t[d - 1] - t[d - 2];
217        tp[d] = 2.0 * t[d - 1] + 2.0 * x * tp[d - 1] - tp[d - 2];
218        tpp[d] = 4.0 * tp[d - 1] + 2.0 * x * tpp[d - 1] - tpp[d - 2];
219    }
220    tpp
221}
222
223fn kahan_scaled_add_array2(
224    out: &mut Array2<f64>,
225    comp: &mut Array2<f64>,
226    scale: f64,
227    x: &Array2<f64>,
228) {
229    for ((slot, c), &value) in out.iter_mut().zip(comp.iter_mut()).zip(x.iter()) {
230        let y = scale * value - *c;
231        let t = *slot + y;
232        *c = (t - *slot) - y;
233        *slot = t;
234    }
235}
236
237fn kahan_scaled_add_array1(
238    out: &mut Array1<f64>,
239    comp: &mut Array1<f64>,
240    scale: f64,
241    x: &Array1<f64>,
242) {
243    for ((slot, c), &value) in out.iter_mut().zip(comp.iter_mut()).zip(x.iter()) {
244        let y = scale * value - *c;
245        let t = *slot + y;
246        *c = (t - *slot) - y;
247        *slot = t;
248    }
249}
250
251/// Spectral norm of a SYMMETRIC matrix `m` (here the difference of two
252/// orthogonal range projectors), i.e. `max|eigenvalue|`. For two equal-rank
253/// orthogonal projectors `P_ref`, `P_new` this equals `sin θ_max`, the sine of
254/// the largest principal angle between their ranges — the canonical, gauge- and
255/// basis-invariant distance between the two subspaces (#1033). Returns `None`
256/// if the matrix is non-finite or the symmetric eigendecomposition fails (the
257/// caller then refuses the skip, the sound fallback).
258fn subspace_spectral_distance(m: &Array2<f64>) -> Option<f64> {
259    use gam_linalg::faer_ndarray::FaerEigh;
260    if m.iter().any(|v| !v.is_finite()) {
261        return None;
262    }
263    // Symmetrize defensively against rounding (P_ref − P_new is symmetric in
264    // exact arithmetic) so the symmetric eigensolver sees a genuinely Hermitian
265    // operand and returns real eigenvalues.
266    let msym = 0.5 * (m + &m.t());
267    let (evals, _evecs) = msym.eigh(faer::Side::Lower).ok()?;
268    Some(evals.iter().fold(0.0_f64, |acc, &v| acc.max(v.abs())))
269}
270
271impl PsiGramTensor {
272    /// Build and certify the tensor over `psi ∈ [psi_lo, psi_hi]`.
273    ///
274    /// `eval_design(psi)` must return the EXACT n×k design at `psi` (the same
275    /// builder the per-trial path uses — exactness of the expansion is judged
276    /// against it). `weights` are the fixed observation weights, `z` the fixed
277    /// weighted-response target (e.g. `y − offset`). Returns `None` when the
278    /// window is degenerate, any evaluation fails/has non-finite entries, or
279    /// no ladder rung certifies — callers then keep the exact per-trial path.
280    pub fn build(
281        mut eval_design: impl FnMut(f64) -> Result<Array2<f64>, String>,
282        weights: ArrayView1<'_, f64>,
283        z: ArrayView1<'_, f64>,
284        psi_lo: f64,
285        psi_hi: f64,
286    ) -> Result<Self, String> {
287        if !(psi_lo.is_finite() && psi_hi.is_finite()) || psi_hi <= psi_lo {
288            return Err(format!(
289                "ψ window must be finite with psi_hi > psi_lo (got [{psi_lo}, {psi_hi}])"
290            ));
291        }
292        // Track the largest rung that produced a candidate but failed to
293        // certify (tail or off-node spot check). If the whole ladder is
294        // exhausted without an accepted candidate this drives a reason that
295        // distinguishes "not analytic enough in ψ" from "evaluation failed".
296        let mut last_uncertified: Option<usize> = None;
297        for &m in PSI_GRAM_NODE_LADDER.iter() {
298            match Self::build_at(&mut eval_design, weights, z, psi_lo, psi_hi, m) {
299                // An exact evaluation failed or was non-finite somewhere in
300                // the window — no larger rung can fix that, so abort with the
301                // underlying reason rather than swallowing it as a bare refusal.
302                BuildOutcome::EvalFailed(why) => {
303                    return Err(format!(
304                        "exact design evaluation failed at ladder rung m={m}: {why}"
305                    ));
306                }
307                // Tail not yet below the certificate at this rung: escalate.
308                // (Conflating this with EvalFailed would kill the ladder at
309                // its first — intentionally coarse — rung.)
310                BuildOutcome::TailNotCertified => {
311                    last_uncertified = Some(m);
312                    continue;
313                }
314                BuildOutcome::Candidate(mut candidate) => {
315                    if candidate.spot_check(&mut eval_design, weights) {
316                        candidate.grad_psi_lo = psi_lo;
317                        candidate.grad_psi_hi = psi_hi;
318                        return Ok(candidate);
319                    }
320                    // The assembled Gram disagreed with an exact off-node
321                    // rebuild at this rung; a denser rung may still certify, so
322                    // escalate rather than abort.
323                    last_uncertified = Some(m);
324                }
325            }
326        }
327        let top_rung = PSI_GRAM_NODE_LADDER.last().copied().unwrap_or(0);
328        Err(match last_uncertified {
329            Some(m) => format!(
330                "Chebyshev series did not certify within the node ladder (reached rung \
331                 m={m}, top rung {top_rung}): the design is not analytic enough in ψ over \
332                 [{psi_lo}, {psi_hi}] (a kink or non-finite curvature), so the n-free \
333                 tensor is refused and the exact per-trial path must be used"
334            ),
335            None => "empty Chebyshev node ladder".to_string(),
336        })
337    }
338
339    fn build_at(
340        eval_design: &mut impl FnMut(f64) -> Result<Array2<f64>, String>,
341        weights: ArrayView1<'_, f64>,
342        z: ArrayView1<'_, f64>,
343        psi_lo: f64,
344        psi_hi: f64,
345        m: usize,
346    ) -> BuildOutcome {
347        // First-kind Chebyshev nodes (no endpoints) and exact design slabs.
348        let mut nodes_x = vec![0.0_f64; m];
349        let mut designs: Vec<Array2<f64>> = Vec::with_capacity(m);
350        for (i, x_slot) in nodes_x.iter_mut().enumerate() {
351            let x = (std::f64::consts::PI * (2 * i + 1) as f64 / (2 * m) as f64).cos();
352            *x_slot = x;
353            let psi = 0.5 * (psi_lo + psi_hi) + 0.5 * (psi_hi - psi_lo) * x;
354            let design = match eval_design(psi) {
355                Ok(design) => design,
356                Err(why) => {
357                    return BuildOutcome::EvalFailed(format!(
358                        "design evaluation refused at node ψ={psi:.6}: {why}"
359                    ));
360                }
361            };
362            if design.iter().any(|v| !v.is_finite()) {
363                return BuildOutcome::EvalFailed(format!(
364                    "design at node ψ={psi:.6} contains a non-finite entry"
365                ));
366            }
367            designs.push(design);
368        }
369        let (n, k) = designs[0].dim();
370        if designs.iter().any(|d| d.dim() != (n, k)) {
371            return BuildOutcome::EvalFailed(format!(
372                "design dimensions vary across ψ nodes (first node is {n}×{k})"
373            ));
374        }
375        if weights.len() != n || z.len() != n || n == 0 || k == 0 {
376            return BuildOutcome::EvalFailed(format!(
377                "incompatible build inputs: design {n}×{k}, weights.len()={}, z.len()={}",
378                weights.len(),
379                z.len()
380            ));
381        }
382        // First-kind discrete orthogonality: coefficient slabs
383        //   X_d = (γ_d / m) Σ_i X(ψ_i) T_d(x_i),  γ_0 = 1, γ_d = 2.
384        let t_at_nodes: Vec<Vec<f64>> = nodes_x.iter().map(|&x| cheb_t(x, m)).collect();
385        let mut coeff_slabs: Vec<Array2<f64>> = Vec::with_capacity(m);
386        for d in 0..m {
387            let gamma = if d == 0 { 1.0 } else { 2.0 };
388            let mut slab = Array2::<f64>::zeros((n, k));
389            let mut slab_comp = Array2::<f64>::zeros((n, k));
390            for (i, design) in designs.iter().enumerate() {
391                let wgt = gamma / m as f64 * t_at_nodes[i][d];
392                kahan_scaled_add_array2(&mut slab, &mut slab_comp, wgt, design);
393            }
394            coeff_slabs.push(slab);
395        }
396        // Tail-decay certificate per design column: the trailing quarter of the
397        // coefficient slabs must fall below [`PSI_GRAM_CERT_RTOL`] × column
398        // scale.
399        //
400        // #1216: on the WIDE STANDARDIZED geometry default 1-D fits use (#1215)
401        // the per-column Chebyshev tail decays cleanly but GEOMETRICALLY-SLOWLY
402        // (measured ~3.2e-8 at m=33, ~2.3e-11 at m=65 — a ~1300×/doubling decay,
403        // NOT a floor). The previous over-tight 1e-12 bar refused at m=65 and the
404        // n-free fast path never attached. The certificate is a cheap
405        // NECESSARY-CONDITION pre-filter whose job is to guarantee the ASSEMBLED
406        // Gram is accurate enough; that accuracy is authoritatively enforced by
407        // the off-node `spot_check` (`PSI_GRAM_SPOT_RTOL`, on the assembled Gram
408        // against an exact rebuild). Sizing the pre-filter to
409        // [`PSI_GRAM_CERT_RTOL`] = 1e-9 lets the (geometrically-decaying) design
410        // certify at m=65, and the ladder's m=129 rung drives the residual to
411        // ~1.7e-14 so the inner penalized solve's conditioning-amplified β̂ stays
412        // bit-tight. A design that is genuinely non-analytic (a true kink) floors
413        // ORDERS above this and is refused, with the spot-check as the hard
414        // backstop.
415        let mut col_scale = vec![0.0_f64; k];
416        for slab in &coeff_slabs {
417            for (j, scale) in col_scale.iter_mut().enumerate() {
418                for i in 0..n {
419                    *scale = scale.max(slab[[i, j]].abs());
420                }
421            }
422        }
423        let tail_start = m - (m / 4).max(1);
424        for slab in coeff_slabs.iter().skip(tail_start) {
425            for (j, &scale) in col_scale.iter().enumerate() {
426                let bound = PSI_GRAM_CERT_RTOL * scale.max(1e-300);
427                for i in 0..n {
428                    if slab[[i, j]].abs() > bound {
429                        return BuildOutcome::TailNotCertified;
430                    }
431                }
432            }
433        }
434        let mut wz = Array1::<f64>::zeros(z.len());
435        let mut zt_w_z = 0.0_f64;
436        let mut zt_w_z_comp = 0.0_f64;
437        for ((slot, &w), &zv) in wz.iter_mut().zip(weights.iter()).zip(z.iter()) {
438            *slot = w * zv;
439            let add = w * zv * zv;
440            let y = add - zt_w_z_comp;
441            let t = zt_w_z + y;
442            zt_w_z_comp = (t - zt_w_z) - y;
443            zt_w_z = t;
444        }
445        // One-time exact node sufficient statistics, then a first-kind DCT in ψ.
446        // This still pays all row work only during build, but the retained series
447        // approximates G(ψ) and c(ψ) directly instead of multiplying two
448        // separately truncated design series.
449        let mut node_grams: Vec<Array2<f64>> = Vec::with_capacity(m);
450        let mut node_rhs: Vec<Array1<f64>> = Vec::with_capacity(m);
451        for design in &designs {
452            let mut wd = design.clone();
453            for (mut row, &w) in wd.outer_iter_mut().zip(weights.iter()) {
454                row.mapv_inplace(|v| v * w);
455            }
456            node_grams.push(design.t().dot(&wd));
457            node_rhs.push(design.t().dot(&wz));
458        }
459        let mut gram: Vec<Array2<f64>> = (0..m).map(|_| Array2::<f64>::zeros((k, k))).collect();
460        let mut gram_comp: Vec<Array2<f64>> =
461            (0..m).map(|_| Array2::<f64>::zeros((k, k))).collect();
462        let mut rhs: Vec<Array1<f64>> = (0..m).map(|_| Array1::<f64>::zeros(k)).collect();
463        let mut rhs_comp: Vec<Array1<f64>> = (0..m).map(|_| Array1::<f64>::zeros(k)).collect();
464        for d in 0..m {
465            let gamma = if d == 0 { 1.0 } else { 2.0 };
466            for i in 0..m {
467                let wgt = gamma / m as f64 * t_at_nodes[i][d];
468                kahan_scaled_add_array2(&mut gram[d], &mut gram_comp[d], wgt, &node_grams[i]);
469                kahan_scaled_add_array1(&mut rhs[d], &mut rhs_comp[d], wgt, &node_rhs[i]);
470            }
471        }
472        drop(node_grams);
473        drop(node_rhs);
474        drop(designs);
475        drop(coeff_slabs);
476        BuildOutcome::Candidate(Self {
477            psi_lo,
478            psi_hi,
479            // Provisional: `build` promotes these to the certified value window
480            // after the value spot-check passes.
481            grad_psi_lo: psi_lo,
482            grad_psi_hi: psi_hi,
483            n_coeff: m,
484            k,
485            gram,
486            rhs,
487            zt_w_z,
488        })
489    }
490
491    /// Off-node certification: the ASSEMBLED Gram must reproduce the exactly
492    /// rebuilt Gram at deterministic interior ψ values.
493    fn spot_check(
494        &self,
495        eval_design: &mut impl FnMut(f64) -> Result<Array2<f64>, String>,
496        weights: ArrayView1<'_, f64>,
497    ) -> bool {
498        for s in 0..PSI_GRAM_SPOT_POINTS {
499            // Golden-ratio low-discrepancy interior points — never the nodes.
500            let frac = ((s as f64 + 1.0) * 0.618_033_988_749_894_9).fract();
501            let psi = self.psi_lo + frac * (self.psi_hi - self.psi_lo);
502            let Ok(design) = eval_design(psi) else {
503                return false;
504            };
505            let mut wd = design.clone();
506            for (mut row, &w) in wd.outer_iter_mut().zip(weights.iter()) {
507                row.mapv_inplace(|v| v * w);
508            }
509            let exact = design.t().dot(&wd);
510            let assembled = self.gram_at(psi);
511            let scale = exact
512                .iter()
513                .fold(0.0_f64, |acc, &v| acc.max(v.abs()))
514                .max(1e-300);
515            for (a, b) in assembled.iter().zip(exact.iter()) {
516                if (a - b).abs() > PSI_GRAM_SPOT_RTOL * scale {
517                    return false;
518                }
519            }
520        }
521        true
522    }
523
524    /// Range (reduced-basis) projector of the conditioned Gram `XᵀWX(ψ)` and the
525    /// numerical rank, computed n-free from the k-space tensor. The reduced basis
526    /// the inner penalized solve forms is the column span of the eigenvectors of
527    /// the (symmetric PSD) Gram whose eigenvalue exceeds a rank-revealing cutoff
528    /// relative to the largest eigenvalue. The orthogonal projector `P = U_r U_rᵀ`
529    /// onto that span is a frame-INVARIANT witness of the reduced basis: two ψ's
530    /// share a reduced basis iff their range projectors coincide (the projector
531    /// is invariant to the orthonormal-basis gauge freedom within the range, so
532    /// it isolates exactly the subspace identity the skip needs, not an arbitrary
533    /// eigenvector rotation). Returns `None` if the Gram is non-finite or its
534    /// symmetric eigendecomposition fails.
535    fn range_projector(&self, psi: f64, rank_rtol: f64) -> Option<(Array2<f64>, usize)> {
536        use gam_linalg::faer_ndarray::FaerEigh;
537        let g = self.gram_at(psi);
538        if g.iter().any(|v| !v.is_finite()) {
539            return None;
540        }
541        // Symmetrize defensively (gram_at is symmetric up to rounding).
542        let gsym = 0.5 * (&g + &g.t());
543        let (evals, evecs) = gsym.eigh(faer::Side::Lower).ok()?;
544        // `eigh` returns ascending eigenvalues; the Gram is PSD so the largest is
545        // the trailing one. The rank cutoff is relative to that maximum.
546        let lambda_max = evals.iter().cloned().fold(0.0_f64, f64::max);
547        if !(lambda_max > 0.0) {
548            return None;
549        }
550        let cutoff = rank_rtol * lambda_max;
551        let mut proj = Array2::<f64>::zeros((self.k, self.k));
552        let mut rank = 0usize;
553        for (col, &lam) in evals.iter().enumerate() {
554            if lam > cutoff {
555                let u = evecs.column(col);
556                // P += u uᵀ.
557                for a in 0..self.k {
558                    for b in 0..self.k {
559                        proj[[a, b]] += u[a] * u[b];
560                    }
561                }
562                rank += 1;
563            }
564        }
565        Some((proj, rank))
566    }
567
568    /// True when the realized reduced basis the design-revision fast path freezes
569    /// at the pinning `psi_ref` is still valid at `psi_new` — the genuine
570    /// reduced-basis-equality witness the skip requires (#1264, #1216 item 3).
571    ///
572    /// The fast path keeps the reference surface (its conditioned frame and its
573    /// RRQR-reduced / null-space basis) frozen at `psi_ref` while re-keying only
574    /// the Gram `XᵀWX(ψ)` and penalty `S(ψ)` to `psi_new`. That is exact iff the
575    /// reduced basis — the range / null split of the conditioned data Gram — is
576    /// unchanged. A conditioning-ratio or RRQR rank/permutation gate only bounds
577    /// NECESSARY conditions; the reduced SUBSPACE can still rotate while rank and
578    /// pivot order look tame, which is exactly the ~7.8e-2 β̂ regression a cluster run
579    /// found. This witness compares the orthogonal RANGE PROJECTORS of the
580    /// conditioned Gram at `psi_ref` and `psi_new` (both assembled n-free from the
581    /// tensor): the skip is sound only when the numerical ranks match AND the
582    /// projectors agree to `proj_atol` in max-norm — i.e. the two reduced bases
583    /// span the SAME subspace. The projector identity is gauge-invariant, so it
584    /// certifies subspace equality directly rather than a particular basis choice.
585    ///
586    /// `psi_ref == psi_new` (a repeat trial at the same ψ) is trivially sound.
587    /// Off-window ψ's, a non-finite / rank-degenerate Gram, or any eigendecomp
588    /// failure return `false` (refuse the skip → caller takes the slow path).
589    ///
590    /// ROTATION WALL (#1033). On production spatial geometry the conditioned
591    /// data-Gram range subspace can ROTATE with ψ at fixed rank — the wall on
592    /// which the earlier RRQR-pivot / entrywise-projector gates kept refusing the
593    /// skip. The fix is the SUBSPACE-DISTANCE certificate below: the skip is sound
594    /// exactly when the two equal-rank ranges coincide as SUBSPACES, measured by
595    /// the spectral norm of the projector difference (the principal angle), which
596    /// is invariant to any orthonormal-basis rotation WITHIN the range. So a pure
597    /// gauge rotation that left the entrywise max-abs above tolerance — and
598    /// therefore used to be refused — now certifies, letting the n-free skip fire
599    /// across the rotation. A genuine subspace MOVE (different rank, or a real
600    /// principal-angle separation) still refuses; refusing is the SOUND fallback
601    /// (the caller takes the exact slow path). Do not weaken
602    /// `PSI_GRAM_SKIP_PROJ_ATOL` / `PSI_GRAM_SKIP_RANK_RTOL`: the spectral gate is
603    /// already the tightest correct subspace metric, and loosening it past a true
604    /// principal-angle separation reintroduces the ~7.8e-2 β̂ regression this
605    /// witness exists to prevent.
606    pub fn reduced_basis_equal(&self, psi_ref: f64, psi_new: f64) -> bool {
607        if !(self.contains(psi_ref) && self.contains(psi_new)) {
608            return false;
609        }
610        if psi_ref == psi_new {
611            return true;
612        }
613        let Some((p_ref, r_ref)) = self.range_projector(psi_ref, PSI_GRAM_SKIP_RANK_RTOL) else {
614            return false;
615        };
616        let Some((p_new, r_new)) = self.range_projector(psi_new, PSI_GRAM_SKIP_RANK_RTOL) else {
617            return false;
618        };
619        if r_ref != r_new {
620            return false;
621        }
622        // Subspace-distance certificate (#1033). The two reduced bases span the
623        // SAME subspace iff their orthogonal range projectors coincide. The
624        // correct, gauge-invariant measure of "how far apart" two equal-rank
625        // subspaces are is the SPECTRAL NORM ‖P_ref − P_new‖₂ = sin θ_max, the
626        // sine of the largest principal angle between the ranges — NOT the
627        // entrywise max-abs of the projector difference. The old entrywise test
628        // is a strictly weaker proxy: across a basis ROTATION (the radial-Gram
629        // rotation wall this skip kept tripping on) the projector entries can
630        // each drift while the spanned subspace is numerically identical, so the
631        // entrywise max could exceed tolerance and FALSELY refuse a sound skip.
632        // P_ref − P_new is symmetric, so its spectral norm is max|eigenvalue|;
633        // compute it via the symmetric eigensolver and gate on the principal
634        // angle directly. This certifies subspace identity across the rotation,
635        // letting the n-free skip fire whenever the range genuinely coincides,
636        // while still refusing (the SOUND fallback) the instant the subspaces
637        // separate by more than PSI_GRAM_SKIP_PROJ_ATOL in true subspace
638        // distance. An eigendecomp failure on the (small k×k) difference refuses.
639        let diff = &p_ref - &p_new;
640        subspace_spectral_distance(&diff)
641            .map(|d| d <= PSI_GRAM_SKIP_PROJ_ATOL)
642            .unwrap_or(false)
643    }
644
645    /// True when `psi` lies inside the certified window.
646    pub fn contains(&self, psi: f64) -> bool {
647        psi.is_finite() && psi >= self.psi_lo && psi <= self.psi_hi
648    }
649
650    /// True when `psi` lies inside the certified gradient window where the
651    /// analytic ψ-derivative is bit-tight against the exact design derivative
652    /// (#1033b). The n-free kappa outer loop is armed only when this covers the
653    /// full optimizer bounds.
654    pub fn contains_for_gradient(&self, psi: f64) -> bool {
655        psi.is_finite()
656            && self.grad_psi_lo.is_finite()
657            && self.grad_psi_hi.is_finite()
658            && psi >= self.grad_psi_lo
659            && psi <= self.grad_psi_hi
660    }
661
662    fn mapped(&self, psi: f64) -> f64 {
663        (2.0 * psi - (self.psi_lo + self.psi_hi)) / (self.psi_hi - self.psi_lo)
664    }
665
666    /// `XᵀWX(ψ)` assembled n-free in O(Dk²) from the direct Gram series.
667    pub fn gram_at(&self, psi: f64) -> Array2<f64> {
668        let x = self.mapped(psi);
669        let t = cheb_t(x, self.gram.len());
670        let mut out = Array2::<f64>::zeros((self.k, self.k));
671        let mut comp = Array2::<f64>::zeros((self.k, self.k));
672        for (d, td) in t.iter().enumerate() {
673            kahan_scaled_add_array2(&mut out, &mut comp, *td, &self.gram[d]);
674        }
675        out
676    }
677
678    /// `XᵀWz(ψ)` assembled n-free in O(Dk).
679    pub fn rhs_at(&self, psi: f64) -> Array1<f64> {
680        let x = self.mapped(psi);
681        let t = cheb_t(x, self.n_coeff);
682        let mut out = Array1::<f64>::zeros(self.k);
683        let mut comp = Array1::<f64>::zeros(self.k);
684        for (d, td) in t.iter().enumerate() {
685            kahan_scaled_add_array1(&mut out, &mut comp, *td, &self.rhs[d]);
686        }
687        out
688    }
689
690    /// Exact `∂(XᵀWX)/∂ψ` from the SAME representation as the value — the
691    /// structural cure for the objective↔gradient desync class on this
692    /// channel. n-free, O(Dk²) from the direct Gram series.
693    pub fn dgram_dpsi(&self, psi: f64) -> Array2<f64> {
694        let x = self.mapped(psi);
695        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
696        let tp = cheb_t_prime(x, self.gram.len());
697        let mut out = Array2::<f64>::zeros((self.k, self.k));
698        let mut comp = Array2::<f64>::zeros((self.k, self.k));
699        for (d, tpd) in tp.iter().enumerate() {
700            kahan_scaled_add_array2(&mut out, &mut comp, *tpd * dx_dpsi, &self.gram[d]);
701        }
702        out
703    }
704
705    /// Exact `∂(XᵀWz)/∂ψ`, n-free.
706    pub fn drhs_dpsi(&self, psi: f64) -> Array1<f64> {
707        let x = self.mapped(psi);
708        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
709        let tp = cheb_t_prime(x, self.n_coeff);
710        let mut out = Array1::<f64>::zeros(self.k);
711        let mut comp = Array1::<f64>::zeros(self.k);
712        for (d, tpd) in tp.iter().enumerate() {
713            kahan_scaled_add_array1(&mut out, &mut comp, *tpd * dx_dpsi, &self.rhs[d]);
714        }
715        out
716    }
717
718    /// Exact `∂²(XᵀWX)/∂ψ²` from the SAME representation as the value/gradient —
719    /// the n-free curvature that lets the outer Newton/ARC step read the τ-τ
720    /// Hessian's design-moving block without re-streaming an O(n) slab Gram
721    /// (#1033, Gaussian-identity single-ψ Hessian channel). O(Dk²) from the
722    /// direct Gram series.
723    ///
724    /// `XᵀWX(ψ) = Σ_d T_d(x) G_d` with `x = mapped(ψ)`, so by the chain rule
725    /// `d²/dψ² = T_d″(x) · (dx/dψ)²`.
726    pub fn d2gram_dpsi2(&self, psi: f64) -> Array2<f64> {
727        let x = self.mapped(psi);
728        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
729        let dx_dpsi_sq = dx_dpsi * dx_dpsi;
730        let tpp = cheb_t_double_prime(x, self.gram.len());
731        let mut out = Array2::<f64>::zeros((self.k, self.k));
732        let mut comp = Array2::<f64>::zeros((self.k, self.k));
733        for (d, tppd) in tpp.iter().enumerate() {
734            kahan_scaled_add_array2(&mut out, &mut comp, *tppd * dx_dpsi_sq, &self.gram[d]);
735        }
736        out
737    }
738
739    /// Exact `∂²(XᵀWz)/∂ψ²`, n-free. `T_d″·(dx/dψ)²` against the rhs slabs.
740    pub fn d2rhs_dpsi2(&self, psi: f64) -> Array1<f64> {
741        let x = self.mapped(psi);
742        let dx_dpsi = 2.0 / (self.psi_hi - self.psi_lo);
743        let dx_dpsi_sq = dx_dpsi * dx_dpsi;
744        let tpp = cheb_t_double_prime(x, self.n_coeff);
745        let mut out = Array1::<f64>::zeros(self.k);
746        let mut comp = Array1::<f64>::zeros(self.k);
747        for (d, tppd) in tpp.iter().enumerate() {
748            kahan_scaled_add_array1(&mut out, &mut comp, *tppd * dx_dpsi_sq, &self.rhs[d]);
749        }
750        out
751    }
752
753    /// Assemble the Gaussian-identity sufficient-statistic cache at `psi`
754    /// without touching a single data row — the bridge from this tensor into
755    /// the inner PLS solver's fast path (#1033b → `GaussianFixedCache`).
756    ///
757    /// `(XᵀWX, XᵀWz, zᵀWz)` is everything the Gaussian penalized solve needs
758    /// at any λ, so a ψ-trial that holds a certified tensor can hand the
759    /// inner solver this cache instead of realizing the n×k design. The
760    /// caller is responsible for `contains(psi)` (off-window trials fall back
761    /// to the exact realizer path). Dense-path bridge only: the sparse
762    /// scatter cache stays `None`.
763    pub fn gaussian_fixed_cache_at(&self, psi: f64) -> crate::pirls::GaussianFixedCache {
764        crate::pirls::GaussianFixedCache {
765            xtwx_orig: self.gram_at(psi),
766            xtwy_orig: self.rhs_at(psi),
767            centered_weighted_y_sq: self.zt_w_z,
768            row_prediction_is_stale: true,
769            xtwx_sparse_orig: None,
770        }
771    }
772}
773
774#[cfg(test)]
775mod tests {
776    use super::*;
777
778    /// Analytic Matérn-shaped synthetic design: entries g(e^{u_ij + ψ}) with
779    /// g(s) = (1 + s)·exp(−s) (the ν = 3/2 Matérn shape) plus a ψ-free power
780    /// column — the exact structural mix of the production radial designs.
781    fn synth_design(psi: f64, n: usize, k: usize) -> Result<Array2<f64>, String> {
782        let mut x = Array2::<f64>::zeros((n, k));
783        for i in 0..n {
784            for j in 0..k {
785                let r = 0.05 + (i as f64 + 1.0) * (j as f64 + 1.0) / (n as f64 * k as f64) * 3.0;
786                if j == k - 1 {
787                    // ψ-free polynomial block column.
788                    x[[i, j]] = r * r * r;
789                } else {
790                    let s = r * psi.exp();
791                    x[[i, j]] = (1.0 + s) * (-s).exp();
792                }
793            }
794        }
795        Ok(x)
796    }
797
798    fn exact_gram(psi: f64, n: usize, k: usize, w: &Array1<f64>) -> Array2<f64> {
799        let design = synth_design(psi, n, k).unwrap();
800        let mut wd = design.clone();
801        for (mut row, &wi) in wd.outer_iter_mut().zip(w.iter()) {
802            row.mapv_inplace(|v| v * wi);
803        }
804        design.t().dot(&wd)
805    }
806
807    /// #1033b primitive gate: the certified tensor must reproduce the exact
808    /// Gram/rhs at arbitrary in-window ψ to certification accuracy, and its
809    /// analytic ψ-derivative must match central finite differences of the
810    /// exact Gram — value and gradient from one representation.
811    #[test]
812    fn psi_gram_tensor_matches_exact_gram_and_fd_gradient() {
813        let (n, k) = (160usize, 7usize);
814        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
815        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.37).sin()));
816        let (psi_lo, psi_hi) = (-1.2_f64, 1.0_f64);
817
818        let tensor = PsiGramTensor::build(
819            |psi| synth_design(psi, n, k),
820            w.view(),
821            z.view(),
822            psi_lo,
823            psi_hi,
824        )
825        .expect("analytic synthetic design must certify");
826
827        for &psi in &[-1.1, -0.63, 0.0, 0.41, 0.97] {
828            assert!(tensor.contains(psi));
829            let exact = exact_gram(psi, n, k, &w);
830            let fast = tensor.gram_at(psi);
831            let scale = exact.iter().fold(0.0_f64, |a, &v| a.max(v.abs()));
832            for (a, b) in fast.iter().zip(exact.iter()) {
833                assert!(
834                    (a - b).abs() <= 1e-9 * scale,
835                    "gram mismatch at psi={psi}: fast={a}, exact={b}"
836                );
837            }
838            // rhs check against the exact weighted cross-product.
839            let design = synth_design(psi, n, k).unwrap();
840            let mut wz = Array1::<f64>::zeros(n);
841            for ((slot, &wi), &zi) in wz.iter_mut().zip(w.iter()).zip(z.iter()) {
842                *slot = wi * zi;
843            }
844            let exact_rhs = design.t().dot(&wz);
845            let fast_rhs = tensor.rhs_at(psi);
846            let rscale = exact_rhs.iter().fold(0.0_f64, |a, &v| a.max(v.abs()));
847            for (a, b) in fast_rhs.iter().zip(exact_rhs.iter()) {
848                assert!(
849                    (a - b).abs() <= 1e-9 * rscale,
850                    "rhs mismatch at psi={psi}: fast={a}, exact={b}"
851                );
852            }
853            // Analytic ψ-gradient vs central FD of the EXACT gram.
854            let h = 1e-5;
855            let g_plus = exact_gram(psi + h, n, k, &w);
856            let g_minus = exact_gram(psi - h, n, k, &w);
857            let dg = tensor.dgram_dpsi(psi);
858            let dscale = dg.iter().fold(0.0_f64, |a, &v| a.max(v.abs())).max(1e-12);
859            for ((a, p), m_) in dg.iter().zip(g_plus.iter()).zip(g_minus.iter()) {
860                let fd = (p - m_) / (2.0 * h);
861                assert!(
862                    (a - fd).abs() <= 1e-5 * dscale,
863                    "dgram/dpsi mismatch at psi={psi}: analytic={a}, fd={fd}"
864                );
865            }
866        }
867        // Outside the window the caller must fall back to the exact path.
868        assert!(!tensor.contains(psi_hi + 0.5));
869        assert!(!tensor.contains(psi_lo - 0.5));
870
871        // Bridge gate (#1033b → GaussianFixedCache): the n-free cache must
872        // reproduce the exactly streamed sufficient statistics, and the
873        // ridge-penalized solves through both must agree — the inner PLS
874        // consumes nothing else, so this certifies the trial-loop handoff.
875        for &psi in &[-0.9, 0.2, 0.8] {
876            let cache = tensor.gaussian_fixed_cache_at(psi);
877            let design = synth_design(psi, n, k).unwrap();
878            let mut wd = design.clone();
879            for (mut row, &wi) in wd.outer_iter_mut().zip(w.iter()) {
880                row.mapv_inplace(|v| v * wi);
881            }
882            let exact_gram = design.t().dot(&wd);
883            let exact_rhs = wd.t().dot(&z);
884            let exact_ztwz: f64 = w.iter().zip(z.iter()).map(|(&wi, &zi)| wi * zi * zi).sum();
885            assert!(
886                (cache.centered_weighted_y_sq - exact_ztwz).abs()
887                    <= 1e-12 * exact_ztwz.abs().max(1e-300),
888                "zᵀWz drift: cache={}, exact={exact_ztwz}",
889                cache.centered_weighted_y_sq
890            );
891            // Ridge-penalized solve agreement: (G + I)β = r on both sides.
892            let solve = |g: &Array2<f64>, r: &Array1<f64>| -> Array1<f64> {
893                let mut a = g.clone();
894                for i in 0..k {
895                    a[[i, i]] += 1.0;
896                }
897                // Small dense Gauss elimination (k = 7 in this test).
898                let mut aug = Array2::<f64>::zeros((k, k + 1));
899                aug.slice_mut(ndarray::s![.., ..k]).assign(&a);
900                aug.slice_mut(ndarray::s![.., k]).assign(r);
901                for col in 0..k {
902                    let piv = (col..k)
903                        .max_by(|&p, &q| aug[[p, col]].abs().total_cmp(&aug[[q, col]].abs()))
904                        .unwrap();
905                    if piv != col {
906                        for j in 0..=k {
907                            let tmp = aug[[col, j]];
908                            aug[[col, j]] = aug[[piv, j]];
909                            aug[[piv, j]] = tmp;
910                        }
911                    }
912                    let p = aug[[col, col]];
913                    for row in 0..k {
914                        if row == col {
915                            continue;
916                        }
917                        let f = aug[[row, col]] / p;
918                        for j in col..=k {
919                            aug[[row, j]] -= f * aug[[col, j]];
920                        }
921                    }
922                }
923                Array1::from_iter((0..k).map(|i| aug[[i, k]] / aug[[i, i]]))
924            };
925            let beta_fast = solve(&cache.xtwx_orig, &cache.xtwy_orig);
926            let beta_exact = solve(&exact_gram, &exact_rhs);
927            let bscale = beta_exact
928                .iter()
929                .fold(0.0_f64, |a, &v| a.max(v.abs()))
930                .max(1e-300);
931            for (a, b) in beta_fast.iter().zip(beta_exact.iter()) {
932                assert!(
933                    (a - b).abs() <= 1e-8 * bscale,
934                    "penalized solve drift at psi={psi}: fast={a}, exact={b}"
935                );
936            }
937        }
938    }
939
940    /// #1033 n-independence invariant (structural, build-free, bit-tight):
941    /// after the one-time `build` n-pass, EVERY per-trial accessor the certified
942    /// κ/ψ outer-loop hot path consumes — the value `(gram_at, rhs_at)`, the
943    /// gradient `(dgram_dpsi, drhs_dpsi)`, the Hessian-channel curvature
944    /// `(d2gram_dpsi2, d2rhs_dpsi2)`, and the inner-solver bridge
945    /// `gaussian_fixed_cache_at` — must touch ZERO data rows. We prove this by
946    /// instrumenting the `eval_design` closure with an invocation counter (the
947    /// closure is the ONLY route to the n×k design): the counter advances during
948    /// `build` (the certified node ladder + off-node spot checks) and must then
949    /// stay FROZEN across an entire ψ-trial sweep. This is the
950    /// "no surface rebuild / no n×k re-realization on a cache-hit trial"
951    /// invariant the outer-loop seam (`SpatialJointContext::eval_full`,
952    /// `skip_design_realization`) relies on — asserted here at the tensor source
953    /// of truth, independent of whether the full κ-fit converges or of any
954    /// wall-clock measurement.
955    #[test]
956    fn psi_gram_tensor_trial_accessors_touch_no_data_rows() {
957        use std::cell::Cell;
958
959        let (n, k) = (256usize, 6usize);
960        let w = Array1::from_iter((0..n).map(|i| 0.8 + 0.4 * ((i % 4) as f64) / 3.0));
961        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.21).sin() + 0.2));
962        let (psi_lo, psi_hi) = (-1.1_f64, 0.95_f64);
963
964        // The closure is the SOLE path to the n×k design; count every call.
965        let calls = Cell::new(0usize);
966        let tensor = PsiGramTensor::build(
967            |psi| {
968                calls.set(calls.get() + 1);
969                synth_design(psi, n, k)
970            },
971            w.view(),
972            z.view(),
973            psi_lo,
974            psi_hi,
975        )
976        .expect("analytic synthetic design must certify");
977
978        // The one-time build necessarily streamed the design at the Chebyshev
979        // nodes plus off-node spot checks. Freeze the count.
980        let build_calls = calls.get();
981        assert!(
982            build_calls > 0,
983            "build must have streamed the design at least once (sanity)"
984        );
985
986        // A dense ψ-trial sweep strictly inside the certified window. Every
987        // accessor below is what a per-trial outer-loop eval consumes.
988        let m = 64usize;
989        let lo = psi_lo + 0.05;
990        let hi = psi_hi - 0.05;
991        for i in 0..m {
992            let psi = lo + (hi - lo) * (i as f64) / (m as f64 - 1.0);
993            assert!(tensor.contains(psi));
994            // Value lane.
995            let _g = tensor.gram_at(psi);
996            let _r = tensor.rhs_at(psi);
997            // Gradient lane.
998            let _dg = tensor.dgram_dpsi(psi);
999            let _dr = tensor.drhs_dpsi(psi);
1000            // Hessian-channel curvature.
1001            let _d2g = tensor.d2gram_dpsi2(psi);
1002            let _d2r = tensor.d2rhs_dpsi2(psi);
1003            // Inner-solver bridge (the GaussianFixedCache the PLS fast path reads).
1004            let _cache = tensor.gaussian_fixed_cache_at(psi);
1005        }
1006
1007        assert_eq!(
1008            calls.get(),
1009            build_calls,
1010            "n-independence VIOLATED: a per-trial accessor re-streamed the n×k \
1011             design ({} extra eval_design calls across {m} ψ-trials). The certified \
1012             κ/ψ outer loop must serve value + gradient + Hessian curvature + the \
1013             inner-solver cache from k-space sufficient statistics only, with NO \
1014             per-trial n-row work.",
1015            calls.get() - build_calls
1016        );
1017    }
1018
1019    /// #1033 Hessian-channel primitive gate: the n-free second ψ-derivatives
1020    /// `d2gram_dpsi2` / `d2rhs_dpsi2` must match central FD of the analytic FIRST
1021    /// derivatives (`dgram_dpsi` / `drhs_dpsi`) — the curvature the outer Newton
1022    /// /ARC step reads when the Gaussian Hessian channel is served from the
1023    /// tensor instead of a re-streamed O(n) slab. Differencing the analytic first
1024    /// derivative (not the exact gram) keeps this a pure check of the
1025    /// second-derivative recurrence, isolated from the build's value-lane tol.
1026    #[test]
1027    fn psi_gram_tensor_second_derivative_matches_fd_of_gradient() {
1028        let (n, k) = (160usize, 7usize);
1029        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.5 * ((i % 3) as f64)));
1030        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.37).sin()));
1031        let (psi_lo, psi_hi) = (-1.2_f64, 1.0_f64);
1032
1033        let tensor = PsiGramTensor::build(
1034            |psi| synth_design(psi, n, k),
1035            w.view(),
1036            z.view(),
1037            psi_lo,
1038            psi_hi,
1039        )
1040        .expect("analytic synthetic design must certify");
1041
1042        let h = 1e-5;
1043        for &psi in &[-1.0, -0.5, 0.0, 0.4, 0.9] {
1044            // ∂²G/∂ψ² vs central FD of the analytic ∂G/∂ψ.
1045            let dg_plus = tensor.dgram_dpsi(psi + h);
1046            let dg_minus = tensor.dgram_dpsi(psi - h);
1047            let d2g = tensor.d2gram_dpsi2(psi);
1048            let gscale = d2g.iter().fold(0.0_f64, |a, &v| a.max(v.abs())).max(1e-9);
1049            for ((a, p), m_) in d2g.iter().zip(dg_plus.iter()).zip(dg_minus.iter()) {
1050                let fd = (p - m_) / (2.0 * h);
1051                assert!(
1052                    (a - fd).abs() <= 1e-4 * gscale,
1053                    "d2gram/dpsi2 mismatch at psi={psi}: analytic={a}, fd={fd}"
1054                );
1055            }
1056            // ∂²(XᵀWz)/∂ψ² vs central FD of the analytic ∂(XᵀWz)/∂ψ.
1057            let dr_plus = tensor.drhs_dpsi(psi + h);
1058            let dr_minus = tensor.drhs_dpsi(psi - h);
1059            let d2r = tensor.d2rhs_dpsi2(psi);
1060            let rscale = d2r.iter().fold(0.0_f64, |a, &v| a.max(v.abs())).max(1e-9);
1061            for ((a, p), m_) in d2r.iter().zip(dr_plus.iter()).zip(dr_minus.iter()) {
1062                let fd = (p - m_) / (2.0 * h);
1063                assert!(
1064                    (a - fd).abs() <= 1e-4 * rscale,
1065                    "d2rhs/dpsi2 mismatch at psi={psi}: analytic={a}, fd={fd}"
1066                );
1067            }
1068        }
1069    }
1070
1071    /// The penalized Gaussian profile deviance at a fixed ridge λ, assembled
1072    /// PURELY from the sufficient-statistic triple `(G, r, c) = (XᵀWX, XᵀWz, zᵀWz)`:
1073    ///
1074    /// ```text
1075    ///   β(λ) = (G + λS)⁻¹ r,   D(ψ;λ) = c − 2 βᵀr + βᵀ(G + λS)β = c − βᵀr
1076    /// ```
1077    ///
1078    /// (the second equality uses the normal equations `(G + λS)β = r`). This is
1079    /// EXACTLY the object the inner Gaussian PLS minimizes over β, and it is a
1080    /// pure function of `(G, r, c)` — n-free. Returns `(D, β)` so the caller can
1081    /// also probe the coefficient lane. `s_ridge` is the ridge penalty matrix.
1082    fn profile_deviance(
1083        g: &Array2<f64>,
1084        r: &Array1<f64>,
1085        c: f64,
1086        s_ridge: &Array2<f64>,
1087        lambda: f64,
1088        k: usize,
1089    ) -> (f64, Array1<f64>) {
1090        // Dense (G + λS) β = r via partial-pivot Gauss elimination (small k).
1091        let mut a = g.clone();
1092        a.scaled_add(lambda, s_ridge);
1093        let mut aug = Array2::<f64>::zeros((k, k + 1));
1094        aug.slice_mut(ndarray::s![.., ..k]).assign(&a);
1095        aug.slice_mut(ndarray::s![.., k]).assign(r);
1096        for col in 0..k {
1097            let piv = (col..k)
1098                .max_by(|&p, &q| aug[[p, col]].abs().total_cmp(&aug[[q, col]].abs()))
1099                .unwrap();
1100            if piv != col {
1101                for j in 0..=k {
1102                    let tmp = aug[[col, j]];
1103                    aug[[col, j]] = aug[[piv, j]];
1104                    aug[[piv, j]] = tmp;
1105                }
1106            }
1107            let p = aug[[col, col]];
1108            for row in 0..k {
1109                if row == col {
1110                    continue;
1111                }
1112                let f = aug[[row, col]] / p;
1113                for j in col..=k {
1114                    aug[[row, j]] -= f * aug[[col, j]];
1115                }
1116            }
1117        }
1118        let beta = Array1::from_iter((0..k).map(|i| aug[[i, k]] / aug[[i, i]]));
1119        let deviance = c - beta.dot(r);
1120        (deviance, beta)
1121    }
1122
1123    /// #1033 bit-tight Hessian + κ-optimum gate. The fast path's promise is not
1124    /// merely that the Gram VALUE matches at sampled ψ — it is that the WHOLE
1125    /// outer κ search (objective, its ψ-curvature, and therefore the located
1126    /// optimum) is reproduced by the n-free sufficient-statistic representation
1127    /// to machine precision. This harness certifies exactly that:
1128    ///
1129    ///   1. **Objective**: the penalized profile deviance `D(ψ)` assembled from
1130    ///      the tensor's `(gram_at, rhs_at, zᵀWz)` matches the exactly streamed
1131    ///      `XᵀWX/XᵀWz/zᵀWz` deviance bit-tight at every ψ on a fine grid.
1132    ///   2. **Curvature (Hessian)**: the second ψ-derivative `D''(ψ)` of the
1133    ///      fast-path objective matches the second ψ-derivative of the EXACT
1134    ///      objective (central FD of the streamed deviance) — the curvature the
1135    ///      outer Newton step reads must be the true curvature, not an
1136    ///      approximation that drifts off the value (the objective↔gradient
1137    ///      desync class, now extended to the second order).
1138    ///   3. **κ-optimum**: the argmin of `D(ψ)` over the grid is IDENTICAL
1139    ///      between the two assemblies — the fast path lands on the same κ as the
1140    ///      exact streamed search, to the grid resolution AND bit-tight in the
1141    ///      objective value at that node.
1142    #[test]
1143    fn psi_gram_tensor_bit_tight_hessian_and_kappa_optimum() {
1144        let (n, k) = (200usize, 6usize);
1145        // Heterogeneous weights + a response with genuine ψ-dependent curvature
1146        // so the deviance has a non-degenerate interior minimum in ψ.
1147        let w = Array1::from_iter((0..n).map(|i| 0.7 + 0.6 * (((i * 7) % 5) as f64) / 4.0));
1148        let z = Array1::from_iter((0..n).map(|i| {
1149            let t = (i as f64) / (n as f64 - 1.0);
1150            (3.0 * t).sin() + 0.3 * (7.0 * t).cos()
1151        }));
1152        let (psi_lo, psi_hi) = (-1.0_f64, 0.9_f64);
1153        // Fixed ridge λ over the search — the κ optimizer profiles ψ at fixed
1154        // smoothing here; identity-S ridge keeps the profile well-posed.
1155        let s_ridge = Array2::<f64>::eye(k);
1156        let lambda = 0.5_f64;
1157
1158        let tensor = PsiGramTensor::build(
1159            |psi| synth_design(psi, n, k),
1160            w.view(),
1161            z.view(),
1162            psi_lo,
1163            psi_hi,
1164        )
1165        .expect("analytic synthetic design must certify");
1166
1167        let exact_ztwz: f64 = w.iter().zip(z.iter()).map(|(&wi, &zi)| wi * zi * zi).sum();
1168
1169        // Exact streamed deviance at arbitrary ψ — the ground truth the n-free
1170        // path must reproduce.
1171        let exact_deviance = |psi: f64| -> f64 {
1172            let design = synth_design(psi, n, k).unwrap();
1173            let mut wd = design.clone();
1174            for (mut row, &wi) in wd.outer_iter_mut().zip(w.iter()) {
1175                row.mapv_inplace(|v| v * wi);
1176            }
1177            let g = design.t().dot(&wd);
1178            let r = wd.t().dot(&z);
1179            profile_deviance(&g, &r, exact_ztwz, &s_ridge, lambda, k).0
1180        };
1181
1182        // Fast n-free deviance from the certified tensor.
1183        let fast_deviance = |psi: f64| -> f64 {
1184            let g = tensor.gram_at(psi);
1185            let r = tensor.rhs_at(psi);
1186            profile_deviance(&g, &r, exact_ztwz, &s_ridge, lambda, k).0
1187        };
1188
1189        // Dense grid strictly inside the certified window (away from the edges,
1190        // where the build's value lane is still certified but we want a clean
1191        // central-FD second derivative to exist on both sides).
1192        let m = 81usize;
1193        let lo = psi_lo + 0.06;
1194        let hi = psi_hi - 0.06;
1195        let grid: Vec<f64> = (0..m)
1196            .map(|i| lo + (hi - lo) * (i as f64) / (m as f64 - 1.0))
1197            .collect();
1198
1199        // (1) Objective bit-tight across the whole grid; track argmin on both.
1200        let mut worst_value_rel = 0.0_f64;
1201        let (mut fast_argmin, mut fast_min) = (f64::NAN, f64::INFINITY);
1202        let (mut exact_argmin, mut exact_min) = (f64::NAN, f64::INFINITY);
1203        for &psi in &grid {
1204            let de = exact_deviance(psi);
1205            let df = fast_deviance(psi);
1206            let rel = (de - df).abs() / de.abs().max(1e-300);
1207            worst_value_rel = worst_value_rel.max(rel);
1208            if df < fast_min {
1209                fast_min = df;
1210                fast_argmin = psi;
1211            }
1212            if de < exact_min {
1213                exact_min = de;
1214                exact_argmin = psi;
1215            }
1216        }
1217        assert!(
1218            worst_value_rel <= 1e-9,
1219            "penalized profile deviance: fast n-free assembly diverged from exact \
1220             streamed by rel {worst_value_rel:.3e} (> 1e-9) somewhere on the ψ grid"
1221        );
1222
1223        // (3) κ-optimum: identical grid node AND bit-tight value there. The
1224        // argmin must be a true interior minimum (not a window edge) for this to
1225        // certify the OUTER search rather than a boundary artifact.
1226        assert_eq!(
1227            fast_argmin.to_bits(),
1228            exact_argmin.to_bits(),
1229            "κ-optimum mismatch: fast argmin ψ={fast_argmin}, exact argmin ψ={exact_argmin} \
1230             — the n-free objective located a different optimum"
1231        );
1232        assert!(
1233            fast_argmin > lo + 1e-9 && fast_argmin < hi - 1e-9,
1234            "κ-optimum landed on the grid edge ψ={fast_argmin}; the fixture must have \
1235             an INTERIOR minimum for this to test the outer search, not a boundary"
1236        );
1237        let opt_rel = (exact_min - fast_min).abs() / exact_min.abs().max(1e-300);
1238        assert!(
1239            opt_rel <= 1e-9,
1240            "κ-optimum objective value drift at ψ={fast_argmin}: fast={fast_min}, \
1241             exact={exact_min}, rel={opt_rel:.3e}"
1242        );
1243
1244        // (2) Gradient + curvature from the tensor's ANALYTIC ψ-derivatives.
1245        //
1246        // Differencing two objectives that agree only to ~1e-9 in VALUE cannot
1247        // certify their curvature: the central second difference divides by h²,
1248        // so the ~1e-9 value gap (which is NOT common-mode — they are different
1249        // assemblies) is amplified by 1/h² and swamps any real curvature signal.
1250        // The principled bit-tight curvature check uses the tensor's OWN analytic
1251        // ψ-derivatives `dgram_dpsi`/`drhs_dpsi`: the envelope gradient of the
1252        // profile deviance `D(ψ) = c − rᵀA⁻¹r`, `A = G + λS`, is
1253        //
1254        //   D'(ψ) = −2 βᵀ(∂r/∂ψ) + βᵀ(∂G/∂ψ)β,   β = A⁻¹r,
1255        //
1256        // assembled n-free from `(dgram_dpsi, drhs_dpsi)`. We certify this
1257        // analytic gradient against a central FD of the EXACT streamed objective
1258        // (first order ⇒ only 1/h amplification, so the ~1e-9 value agreement is
1259        // not destroyed), and certify the curvature by central-differencing the
1260        // ANALYTIC gradient (again 1/h, not 1/h²). This is the same one-
1261        // representation value↔gradient↔curvature consistency the production fast
1262        // path relies on for the outer Newton step.
1263        let solve_a = |g: &Array2<f64>, r: &Array1<f64>| -> Array1<f64> {
1264            profile_deviance(g, r, exact_ztwz, &s_ridge, lambda, k).1
1265        };
1266        // Analytic n-free ψ-gradient of the penalized profile deviance, valid on
1267        // the certified gradient sub-window where `dgram_dpsi` is bit-tight.
1268        let analytic_grad = |psi: f64| -> f64 {
1269            let g = tensor.gram_at(psi);
1270            let r = tensor.rhs_at(psi);
1271            let beta = solve_a(&g, &r);
1272            let dg = tensor.dgram_dpsi(psi);
1273            let dr = tensor.drhs_dpsi(psi);
1274            -2.0 * beta.dot(&dr) + beta.dot(&dg.dot(&beta))
1275        };
1276
1277        // Two finite-difference steps, each near the optimum of its own
1278        // truncation/rounding trade-off:
1279        //   * `h_grad = 1e-6` for the FIRST derivative (central FD ⇒ O(h²)
1280        //     truncation, O(ε/h) rounding ⇒ optimum near 1e-5..1e-6);
1281        //   * `h_curv = 2e-4` for the curvature. A SECOND difference divides by
1282        //     h², so its rounding floor is O(ε·|D|/h²): at h=1e-6 that is
1283        //     ~1e-16/1e-12 = 1e-4 of |D|, comparable to the curvature itself —
1284        //     useless. h≈2e-4 puts the rounding floor at ~1e-16/4e-8 ≈ 2.5e-9·|D|
1285        //     and the O(h²·D⁗) truncation around the same scale, so the second
1286        //     difference is meaningful. The analytic-gradient curvature is
1287        //     differenced at the SAME h_curv so the two carry the same
1288        //     truncation order and the comparison is apples-to-apples.
1289        let h_grad = 1e-6_f64;
1290        let h_curv = 2e-4_f64;
1291        let mut worst_grad_rel = 0.0_f64;
1292        let mut worst_hess_rel = 0.0_f64;
1293        let mut tested = 0usize;
1294        for &psi in &grid {
1295            // The exact-objective curvature stencil reaches ±2·h_curv; require the
1296            // whole stencil to stay inside the certified gradient sub-window so the
1297            // analytic-gradient differences are all bit-tight.
1298            if !tensor.contains_for_gradient(psi - 2.0 * h_curv)
1299                || !tensor.contains_for_gradient(psi + 2.0 * h_curv)
1300            {
1301                continue;
1302            }
1303            tested += 1;
1304            // Analytic gradient vs central FD of the EXACT streamed objective.
1305            let exact_g1 =
1306                (exact_deviance(psi + h_grad) - exact_deviance(psi - h_grad)) / (2.0 * h_grad);
1307            let ag = analytic_grad(psi);
1308            let gscale = exact_g1.abs().max(1e-6);
1309            worst_grad_rel = worst_grad_rel.max((exact_g1 - ag).abs() / gscale);
1310            // Curvature: central FD of the ANALYTIC gradient (n-free) vs central
1311            // second difference of the EXACT objective, both at h_curv.
1312            let analytic_h2 =
1313                (analytic_grad(psi + h_curv) - analytic_grad(psi - h_curv)) / (2.0 * h_curv);
1314            let exact_h2 = (exact_deviance(psi + h_curv) - 2.0 * exact_deviance(psi)
1315                + exact_deviance(psi - h_curv))
1316                / (h_curv * h_curv);
1317            let hscale = exact_h2.abs().max(1e-3);
1318            worst_hess_rel = worst_hess_rel.max((analytic_h2 - exact_h2).abs() / hscale);
1319        }
1320        assert!(
1321            tested > 0,
1322            "no ψ on the grid lay inside the certified gradient sub-window"
1323        );
1324        assert!(
1325            worst_grad_rel <= 1e-5,
1326            "ψ-gradient mismatch: the tensor's analytic n-free objective gradient diverged \
1327             from the exact streamed objective by rel {worst_grad_rel:.3e} (> 1e-5)"
1328        );
1329        // The curvature compares an analytic-gradient central difference against
1330        // an exact-objective second difference; the residual O(h²) truncation +
1331        // O(ε/h²) rounding floor at h_curv=2e-4 sets a realistic bit-tight bar of
1332        // ~1e-3 relative (any larger gap is a genuine curvature divergence, not FD
1333        // noise — the value/gradient lanes already certify the objective itself to
1334        // ~1e-9/1e-5).
1335        assert!(
1336            worst_hess_rel <= 1e-3,
1337            "ψ-curvature (Hessian) mismatch: fast n-free objective curvature diverged \
1338             from the exact streamed objective by rel {worst_hess_rel:.3e} (> 1e-3) — \
1339             the outer Newton step would read a different curvature than the truth"
1340        );
1341
1342        eprintln!(
1343            "[psi-gram-bittight] n={n} k={k} grid={m} grad-tested={tested}  \
1344             worst |ΔD|/D={worst_value_rel:.2e}  worst |ΔD'|/D'={worst_grad_rel:.2e}  \
1345             worst |ΔD''|/D''={worst_hess_rel:.2e}  κ-opt ψ={fast_argmin:.6} (interior, bit-identical)"
1346        );
1347    }
1348
1349    /// Certification negative: a NON-analytic (kinked) design must refuse to
1350    /// certify rather than silently approximate.
1351    #[test]
1352    fn psi_gram_tensor_refuses_non_analytic_design() {
1353        let (n, k) = (40usize, 3usize);
1354        let w = Array1::from_elem(n, 1.0);
1355        let z = Array1::from_elem(n, 0.5);
1356        let tensor = PsiGramTensor::build(
1357            |psi| {
1358                let mut x = Array2::<f64>::zeros((n, k));
1359                for i in 0..n {
1360                    for j in 0..k {
1361                        // |ψ| kink at 0 inside the window: not analytic.
1362                        x[[i, j]] = psi.abs() + (i + j) as f64 / (n + k) as f64;
1363                    }
1364                }
1365                Ok(x)
1366            },
1367            w.view(),
1368            z.view(),
1369            -1.0,
1370            1.0,
1371        );
1372        assert!(
1373            tensor.is_err(),
1374            "kinked design must fail the tail-decay/spot-check certificates"
1375        );
1376    }
1377
1378    /// #1264 reduced-basis-equality witness — REFLEXIVITY + GAUGE INVARIANCE.
1379    ///
1380    /// `reduced_basis_equal(ψ, ψ)` is trivially sound (the surface is its own
1381    /// reference), and the witness must accept two ψ's whose RANGE subspace is
1382    /// identical even when the per-ψ eigenvECTORS differ (the projector is
1383    /// gauge-invariant). The synthetic full-rank Matérn-shaped design's range is
1384    /// the whole k-space for every ψ, so every in-window pair shares a reduced
1385    /// basis and must certify.
1386    #[test]
1387    fn reduced_basis_witness_reflexive_and_gauge_invariant() {
1388        let (n, k) = (160usize, 6usize);
1389        let w = Array1::from_iter((0..n).map(|i| 1.0 + 0.3 * ((i % 5) as f64)));
1390        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.29).sin()));
1391        let (psi_lo, psi_hi) = (-1.0_f64, 0.8_f64);
1392        let tensor = PsiGramTensor::build(
1393            |psi| synth_design(psi, n, k),
1394            w.view(),
1395            z.view(),
1396            psi_lo,
1397            psi_hi,
1398        )
1399        .expect("analytic synthetic design must certify");
1400
1401        // Reflexive: same ψ is always sound.
1402        for &psi in &[-0.9, -0.2, 0.0, 0.5, 0.79] {
1403            assert!(
1404                tensor.reduced_basis_equal(psi, psi),
1405                "witness must be reflexive at psi={psi}"
1406            );
1407        }
1408        // The full-rank synthetic design spans all of k-space at every ψ, so the
1409        // range projector is the identity for all ψ → every pair certifies.
1410        let grid: Vec<f64> = (0..=12).map(|i| psi_lo + 0.05 + 0.06 * i as f64).collect();
1411        for &a in &grid {
1412            for &b in &grid {
1413                assert!(
1414                    tensor.reduced_basis_equal(a, b),
1415                    "full-rank design: range is ψ-invariant (identity projector), \
1416                     so the skip witness must certify (ψ_ref={a}, ψ_new={b})"
1417                );
1418            }
1419        }
1420        // Off-window ψ refuses.
1421        assert!(!tensor.reduced_basis_equal(psi_lo - 0.5, 0.0));
1422        assert!(!tensor.reduced_basis_equal(0.0, psi_hi + 0.5));
1423    }
1424
1425    /// #1264 reduced-basis-equality witness — REFUSES across a genuine subspace
1426    /// change (the exact failure mode of the old RRQR-only gate).
1427    ///
1428    /// Construct a design whose first two columns are fixed (ψ-invariant) profiles
1429    /// and whose third column's AMPLITUDE `ε(ψ) = e^{αψ}` analytically sweeps the
1430    /// third eigendirection's eigenvalue `∝ ε²` across the rank-revealing cutoff.
1431    /// Below the cutoff the reduced (range) basis is the 2-D span of the first two
1432    /// profiles; above it the range is 3-D. Two ψ's on the SAME side of the
1433    /// threshold share a reduced basis (witness accepts); two ψ's STRADDLING it do
1434    /// not (witness refuses) — exactly the stale-basis pairing the design-revision
1435    /// fast path must not perform. The amplitude is smooth/analytic so the tensor
1436    /// still certifies (this is a reduced-basis change, not a non-analytic kink).
1437    #[test]
1438    fn reduced_basis_witness_refuses_across_subspace_change() {
1439        let (n, k) = (200usize, 3usize);
1440        // Three fixed, well-separated column profiles (full column rank when all
1441        // present). The third is scaled by ε(ψ).
1442        let base = |i: usize, j: usize| -> f64 {
1443            let t = (i as f64 + 0.5) / n as f64;
1444            match j {
1445                0 => 1.0,
1446                1 => (2.0 * std::f64::consts::PI * t).sin(),
1447                _ => (4.0 * std::f64::consts::PI * t).cos(),
1448            }
1449        };
1450        // ε(ψ) crosses √cutoff (relative to λ_max ~ O(n)) within the window: at
1451        // λ_max ≈ n the cutoff is rank_rtol·n ≈ 1e-10·200 = 2e-8, so the third
1452        // eigenvalue ε²·‖c3‖² ≈ ε²·(n/2) crosses it at ε ≈ sqrt(4e-8/n) ≈ 1.4e-5,
1453        // i.e. ψ* ≈ ln(1.4e-5)/α. With α = 10 and window [−1.6,−0.8], ψ* ≈ −1.12
1454        // sits inside the window, giving a clean below/above split.
1455        let alpha = 10.0_f64;
1456        let design = move |psi: f64| -> Result<Array2<f64>, String> {
1457            let eps = (alpha * psi).exp();
1458            let mut x = Array2::<f64>::zeros((n, k));
1459            for i in 0..n {
1460                x[[i, 0]] = base(i, 0);
1461                x[[i, 1]] = base(i, 1);
1462                x[[i, 2]] = eps * base(i, 2);
1463            }
1464            Ok(x)
1465        };
1466        let w = Array1::from_elem(n, 1.0);
1467        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.13).sin()));
1468        let (psi_lo, psi_hi) = (-1.6_f64, -0.8_f64);
1469        let tensor = PsiGramTensor::build(design, w.view(), z.view(), psi_lo, psi_hi)
1470            .expect("smooth ε(ψ) design must still certify (analytic, no kink)");
1471
1472        // Find the actual threshold by scanning the rank.
1473        let rank_at = |psi: f64| -> usize {
1474            tensor
1475                .range_projector(psi, PSI_GRAM_SKIP_RANK_RTOL)
1476                .map(|(_, r)| r)
1477                .unwrap_or(0)
1478        };
1479        let lo_rank = rank_at(psi_lo + 0.02);
1480        let hi_rank = rank_at(psi_hi - 0.02);
1481        assert_eq!(
1482            lo_rank, 2,
1483            "low-ψ end must be rank-2 (third column below cutoff)"
1484        );
1485        assert_eq!(
1486            hi_rank, 3,
1487            "high-ψ end must be rank-3 (third column above cutoff)"
1488        );
1489
1490        // Same-side pairs (both rank-2) certify; straddling pairs refuse.
1491        let psi_low_a = psi_lo + 0.05;
1492        let psi_low_b = psi_lo + 0.10;
1493        assert_eq!(rank_at(psi_low_a), 2);
1494        assert_eq!(rank_at(psi_low_b), 2);
1495        assert!(
1496            tensor.reduced_basis_equal(psi_low_a, psi_low_b),
1497            "two low-ψ trials share the rank-2 reduced basis → skip is sound"
1498        );
1499        let psi_high_a = psi_hi - 0.05;
1500        let psi_high_b = psi_hi - 0.10;
1501        assert_eq!(rank_at(psi_high_a), 3);
1502        assert_eq!(rank_at(psi_high_b), 3);
1503        // High-side: the range is the full 3-D space at both, so the projector is
1504        // the identity at both → still a shared reduced basis.
1505        assert!(
1506            tensor.reduced_basis_equal(psi_high_a, psi_high_b),
1507            "two high-ψ trials share the rank-3 reduced basis → skip is sound"
1508        );
1509        // Straddling the rank change: the reduced basis MOVED (2-D → 3-D). The
1510        // witness MUST refuse — this is precisely the stale-basis pairing the old
1511        // RRQR-only gate let through.
1512        assert!(
1513            !tensor.reduced_basis_equal(psi_low_a, psi_high_a),
1514            "witness must REFUSE a skip that straddles the reduced-basis (rank) \
1515             change — freezing the low-ψ rank-2 basis and re-keying the high-ψ \
1516             rank-3 Gram is the exact ~7.8e-2 β̂ regression #1264 guards"
1517        );
1518        assert!(
1519            !tensor.reduced_basis_equal(psi_high_a, psi_low_a),
1520            "witness must refuse symmetrically (high pin, low trial)"
1521        );
1522    }
1523
1524    /// #1033 ROTATION WALL — the subspace-distance certificate must CERTIFY a
1525    /// skip across a pure basis ROTATION at fixed rank, where the old entrywise
1526    /// max-abs projector gate would have refused.
1527    ///
1528    /// Build a rank-2 (in a k=3 space) design whose 2-D range ROTATES smoothly
1529    /// with ψ but whose RANK stays 2: two ψ-dependent in-plane directions span the
1530    /// same fixed 2-plane (cols 0,1 of a fixed orthonormal pair) rotated by an
1531    /// analytic angle φ(ψ). The SUBSPACE (the 2-plane) is ψ-invariant — only the
1532    /// basis within it rotates — so the range projector is mathematically
1533    /// IDENTICAL at every ψ, but its eigenVECTORS rotate. A correct
1534    /// subspace-identity witness must certify every in-window pair; the spectral
1535    /// (principal-angle) distance is ~0 throughout while a naive entrywise
1536    /// comparison of rotated eigenbases would not be guaranteed to.
1537    #[test]
1538    fn reduced_basis_witness_certifies_across_pure_rotation_1033() {
1539        let (n, k) = (240usize, 3usize);
1540        // Two fixed orthogonal ambient profiles spanning a fixed 2-plane; the
1541        // third ambient direction is left empty so the range is exactly that
1542        // 2-plane (rank 2) for every ψ.
1543        let p0 = |i: usize| -> f64 {
1544            let t = (i as f64 + 0.5) / n as f64;
1545            (2.0 * std::f64::consts::PI * t).sin()
1546        };
1547        let p1 = |i: usize| -> f64 {
1548            let t = (i as f64 + 0.5) / n as f64;
1549            (2.0 * std::f64::consts::PI * t).cos()
1550        };
1551        // Within the fixed 2-plane, rotate the two design columns by φ(ψ): the
1552        // SPAN is unchanged (still the {p0,p1} plane) but the basis rotates, so
1553        // the per-ψ eigenvectors of the Gram rotate while the range projector is
1554        // ψ-invariant.
1555        let design = move |psi: f64| -> Result<Array2<f64>, String> {
1556            let phi = 0.7 * psi; // analytic angle sweep
1557            let (c, s) = (phi.cos(), phi.sin());
1558            let mut x = Array2::<f64>::zeros((n, k));
1559            for i in 0..n {
1560                let (a, b) = (p0(i), p1(i));
1561                x[[i, 0]] = c * a - s * b;
1562                x[[i, 1]] = s * a + c * b;
1563                // column 2 stays zero → range is the fixed 2-plane, rank 2.
1564            }
1565            Ok(x)
1566        };
1567        let w = Array1::from_elem(n, 1.0);
1568        let z = Array1::from_iter((0..n).map(|i| ((i as f64) * 0.17).cos()));
1569        let (psi_lo, psi_hi) = (-1.0_f64, 1.0_f64);
1570        let tensor = PsiGramTensor::build(design, w.view(), z.view(), psi_lo, psi_hi)
1571            .expect("smooth rotation design must certify (analytic, no kink)");
1572
1573        // Rank is a constant 2 across the window (the third direction is empty).
1574        let rank_at = |psi: f64| -> usize {
1575            tensor
1576                .range_projector(psi, PSI_GRAM_SKIP_RANK_RTOL)
1577                .map(|(_, r)| r)
1578                .unwrap_or(0)
1579        };
1580        for &psi in &[-0.95, -0.4, 0.0, 0.4, 0.95] {
1581            assert_eq!(rank_at(psi), 2, "rotation keeps rank 2 at psi={psi}");
1582        }
1583
1584        // Every in-window pair spans the SAME 2-plane (only the basis rotates),
1585        // so the subspace-distance witness MUST certify the skip — this is the
1586        // rotation that the entrywise gate kept refusing (the #1033 wall).
1587        let grid: Vec<f64> = (0..=10).map(|i| psi_lo + 0.05 + 0.09 * i as f64).collect();
1588        for &a in &grid {
1589            for &b in &grid {
1590                assert!(
1591                    tensor.reduced_basis_equal(a, b),
1592                    "pure in-plane rotation preserves the range subspace → the \
1593                     subspace-distance skip witness must certify (#1033) \
1594                     (ψ_ref={a}, ψ_new={b})"
1595                );
1596            }
1597        }
1598    }
1599}