gam 0.3.115

Generalized penalized likelihood engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
//! Behaviorally-anchored SAE head (issue #912).
//!
//! # What this is
//!
//! An unsupervised SAE dictionary — manifold or linear — pins a concept's
//! *shape* and its coordinate up to isometry. Both are functionals of `p(x)`.
//! It cannot pin the map from coordinate to behavior, `p(y|x)`, which is
//! formally independent of `p(x)`: two models with identical activation
//! geometry can read out differently. The behavioral coefficient an atom
//! "gets" from a post-hoc probe is therefore not a refinement of the
//! dictionary — it is the *only* labeled source of the one thing the
//! dictionary structurally lacks.
//!
//! The fix is to make the behavior part of the model. Instead of treating the
//! auxiliary `u` as a fixed covariate in a conditional Gaussian *prior* on the
//! latent codes (`LatentIdMode::AuxPrior`, the iVAE gauge), this module
//! promotes the auxiliary signal to a *modeled outcome*: a GLM behavioral head
//!
//! ```text
//!   g(E[y_n | t_n]) = a + t_n · w
//! ```
//!
//! whose design columns are the latent codes `t_n` themselves. The head's
//! coefficients `(a, w)` live in the β tier and the head log-likelihood enters
//! the *same* Laplace/REML objective as the reconstruction channel, so REML
//! balances reconstruction vs. behavioral fit automatically — no hand-tuned
//! trade-off scalar (magic by default). Because the design depends on `ψ` (the
//! latent codes move during the joint fit), the head couples to the latent
//! block exactly the way the arrow-Schur border already hosts a β-border
//! coupled to per-row latent blocks.
//!
//! # The three pieces
//!
//! 1. [`BehavioralHead`] — the head GLM itself: value + gradient of the head
//!    log-likelihood w.r.t. the head coefficients `(a, w)` AND w.r.t. the
//!    latent codes `t` (the cross-channel coupling), under a [`RowMeasure`]
//!    weighting so unlabeled rows carry zero head weight (semi-supervised).
//!
//! 2. [`LeakageAbsorber`] — the #461 Neyman-orthogonal device. Joint fitting
//!    can sculpt the dictionary to *encode the label* (rediscover your own
//!    probe). The absorber widens the reconstruction design with the head's
//!    score-influence directions so the dictionary update is orthogonalized
//!    against the label channel. The boundary it enforces is precisely
//!    "orient what `p(x)` put there" vs. "hallucinate geometry from the label"
//!    — the novel statistical content of the whole construction.
//!
//! 3. [`head_feature_significance`] — per-feature (per-atom) significance of
//!    the behavioral loading via [`wood_smooth_test`], converted to an
//!    FDR-controlled report through the e-BH multiplicity machinery
//!    ([`e_benjamini_hochberg`]). Features whose behavioral signal survives
//!    orthogonalization AND the multiplicity correction are the reportable
//!    ones.

use crate::inference::smooth_test::{SmoothTestInput, SmoothTestScale, wood_smooth_test};
use crate::inference::structure_evidence::e_benjamini_hochberg;
use crate::linalg::faer_ndarray::FaerSvd;
use crate::solver::row_measure::RowMeasure;
use ndarray::{Array1, Array2, ArrayView1, ArrayView2};

/// Outcome family for the behavioral head.
///
/// `Binomial` is the canonical safety-probe family (a binary label —
/// deception, harmfulness — read out from the latent codes). `Multinomial`
/// covers a categorical behavioral label with `n_classes` levels via a
/// shared-design softmax head; class 0 is the reference. Both are the same
/// families already in `src/families/`; this enum only selects the head's
/// link + log-likelihood, it does not re-implement the families.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AuxOutcomeFamily {
    /// Logistic head: `logit P(y=1 | t) = a + t·w`. A single binary label
    /// pins roughly one gauge dimension, which is why `AuxOutcome` composes
    /// with `DimSelection` ARD + the isometry pin rather than replacing them.
    Binomial,
    /// Softmax head over `n_classes` levels (class 0 reference). The behavioral
    /// subspace it can orient has dimension at most `n_classes − 1`, still
    /// low-dimensional — the geometry does the shape work.
    Multinomial { n_classes: usize },
}

impl AuxOutcomeFamily {
    /// Number of linear-predictor channels the head produces per row.
    /// Binomial = 1; Multinomial with `K` classes = `K − 1` (reference-coded).
    pub fn n_eta_channels(&self) -> usize {
        match self {
            AuxOutcomeFamily::Binomial => 1,
            AuxOutcomeFamily::Multinomial { n_classes } => n_classes.saturating_sub(1),
        }
    }

    /// A single binary label pins ~1 gauge dimension; a `K`-level categorical
    /// label pins at most `K − 1`. This is the honest behavioral-subspace
    /// dimension the head can orient — the load-bearing scope assumption from
    /// the Khemakhem precondition discussion (the geometry does the rest).
    pub fn behavioral_subspace_dim(&self) -> usize {
        self.n_eta_channels()
    }
}

/// The behavioral head GLM.
///
/// Stores the labels `y` (length `n`), the per-row head weights `w_row`
/// (length `n`; 0 ⇒ unlabeled, the semi-supervised seam), and the family.
/// The design is *not* stored: it is the live latent-code matrix `t`
/// (`n × d`) passed at evaluation time, because `t` moves during the joint
/// fit. The head coefficients are `(intercept, loadings)` with one
/// `(1 + d)`-vector per η-channel.
#[derive(Debug, Clone)]
pub struct BehavioralHead {
    family: AuxOutcomeFamily,
    /// For `Binomial`: the 0/1 label per row. For `Multinomial`: the class
    /// index in `0..n_classes` per row (stored as `f64`, integral-valued).
    y: Array1<f64>,
    /// Per-row head-channel weight. `0.0` on unlabeled rows (semi-supervised);
    /// `1.0` on labeled rows by default. Derived from a [`RowMeasure`] via
    /// [`BehavioralHead::with_row_measure`].
    w_row: Array1<f64>,
}

impl BehavioralHead {
    /// Build a head from labels and an explicit per-row weight vector.
    ///
    /// `family.n_eta_channels()` channels each get a `(1 + d)` coefficient
    /// block at fit time. Validates label range against the family.
    pub fn new(
        family: AuxOutcomeFamily,
        y: Array1<f64>,
        w_row: Array1<f64>,
    ) -> Result<Self, String> {
        let n = y.len();
        if w_row.len() != n {
            return Err(format!(
                "BehavioralHead: w_row length {} != labels length {n}",
                w_row.len()
            ));
        }
        for &v in w_row.iter() {
            if !(v.is_finite() && v >= 0.0) {
                return Err(format!(
                    "BehavioralHead: row weights must be finite and ≥ 0, got {v}"
                ));
            }
        }
        match family {
            AuxOutcomeFamily::Binomial => {
                for (i, &label) in y.iter().enumerate() {
                    if label != 0.0 && label != 1.0 {
                        return Err(format!(
                            "BehavioralHead(Binomial): label[{i}] = {label} is not 0/1"
                        ));
                    }
                }
            }
            AuxOutcomeFamily::Multinomial { n_classes } => {
                if n_classes < 2 {
                    return Err(format!(
                        "BehavioralHead(Multinomial): need ≥ 2 classes, got {n_classes}"
                    ));
                }
                for (i, &label) in y.iter().enumerate() {
                    let k = label as usize;
                    if k as f64 != label || k >= n_classes {
                        return Err(format!(
                            "BehavioralHead(Multinomial): label[{i}] = {label} not an \
                             integer class index in 0..{n_classes}"
                        ));
                    }
                }
            }
        }
        Ok(Self { family, y, w_row })
    }

    /// Build a head where every row carries unit head weight (fully supervised).
    pub fn fully_supervised(family: AuxOutcomeFamily, y: Array1<f64>) -> Result<Self, String> {
        let n = y.len();
        Self::new(family, y, Array1::from_elem(n, 1.0))
    }

    /// Build a head whose per-row weights come from a [`RowMeasure`]: rows
    /// outside the measure (unlabeled) get weight 0, rows inside get the
    /// measure's weight. This is the semi-supervised seam — no new mechanism,
    /// the same row-weighting the inner solver already enforces for ρ-coherence.
    pub fn with_row_measure(
        family: AuxOutcomeFamily,
        y: Array1<f64>,
        measure: &RowMeasure,
    ) -> Result<Self, String> {
        let n = y.len();
        let (indices, weights) = measure.indices_and_weights(n);
        let mut w_row = Array1::<f64>::zeros(n);
        for &idx in &indices {
            if idx < n {
                w_row[idx] = weights[idx];
            }
        }
        Self::new(family, y, w_row)
    }

    pub fn family(&self) -> AuxOutcomeFamily {
        self.family
    }

    pub fn n_obs(&self) -> usize {
        self.y.len()
    }

    /// Number of head coefficients given latent dimension `d`: one
    /// `(1 + d)` block (intercept + per-axis loading) per η-channel.
    pub fn n_coeffs(&self, latent_dim: usize) -> usize {
        self.family.n_eta_channels() * (1 + latent_dim)
    }

    /// Total effective head-channel sample size `Σ_n w_row[n]` — the number of
    /// labeled rows (weighted). Zero ⇒ a vacuous head (every row unlabeled),
    /// which the validator rejects: a head with no labels cannot anchor a gauge.
    pub fn effective_labeled_count(&self) -> f64 {
        self.w_row.iter().sum()
    }

    /// Per-row, per-channel linear predictor `η[n, c] = a_c + t_n · w_c`.
    fn eta(&self, t: ArrayView2<'_, f64>, coeffs: ArrayView1<'_, f64>) -> Array2<f64> {
        let (n, d) = t.dim();
        let n_eta = self.family.n_eta_channels();
        let mut eta = Array2::<f64>::zeros((n, n_eta));
        for c in 0..n_eta {
            let base = c * (1 + d);
            let a = coeffs[base];
            for row in 0..n {
                let mut acc = a;
                for axis in 0..d {
                    acc += t[[row, axis]] * coeffs[base + 1 + axis];
                }
                eta[[row, c]] = acc;
            }
        }
        eta
    }

    /// Negative head log-likelihood and its gradient w.r.t. **both** the head
    /// coefficients and the latent codes `t`.
    ///
    /// Returns `(nll, grad_coeffs, grad_t)` where:
    /// * `nll = −Σ_n w_row[n] · log p(y_n | η_n)` (weighted),
    /// * `grad_coeffs` has length `n_coeffs(d)` — the head-coefficient gradient
    ///   that drives the β-tier update,
    /// * `grad_t` is `(n, d)` — the cross-channel coupling that flows into the
    ///   latent-block gradient (the arrow-Schur border coupling).
    ///
    /// Convention matches the rest of the latent objective: this is the
    /// *negative* log-likelihood, so it adds to the joint cost and its gradient
    /// adds to the joint gradient.
    pub fn neg_loglik_and_grad(
        &self,
        t: ArrayView2<'_, f64>,
        coeffs: ArrayView1<'_, f64>,
    ) -> Result<(f64, Array1<f64>, Array2<f64>), String> {
        let (n, d) = t.dim();
        if n != self.y.len() {
            return Err(format!(
                "BehavioralHead: latent rows {n} != labels {}",
                self.y.len()
            ));
        }
        let n_eta = self.family.n_eta_channels();
        if coeffs.len() != n_eta * (1 + d) {
            return Err(format!(
                "BehavioralHead: coeffs length {} != n_eta·(1+d) = {}",
                coeffs.len(),
                n_eta * (1 + d)
            ));
        }
        let eta = self.eta(t, coeffs);
        let mut nll = 0.0_f64;
        let mut grad_coeffs = Array1::<f64>::zeros(n_eta * (1 + d));
        let mut grad_t = Array2::<f64>::zeros((n, d));

        match self.family {
            AuxOutcomeFamily::Binomial => {
                for row in 0..n {
                    let w = self.w_row[row];
                    if w == 0.0 {
                        continue;
                    }
                    let e = eta[[row, 0]];
                    // Numerically-stable logistic NLL: log(1+exp(η)) − y·η.
                    let log1p = if e > 0.0 {
                        e + (-e).exp().ln_1p()
                    } else {
                        e.exp().ln_1p()
                    };
                    let y = self.y[row];
                    nll += w * (log1p - y * e);
                    // dNLL/dη = p − y, p = σ(η).
                    let p = 1.0 / (1.0 + (-e).exp());
                    let r = w * (p - y);
                    grad_coeffs[0] += r;
                    for axis in 0..d {
                        grad_coeffs[1 + axis] += r * t[[row, axis]];
                        grad_t[[row, axis]] += r * coeffs[1 + axis];
                    }
                }
            }
            AuxOutcomeFamily::Multinomial { .. } => {
                for row in 0..n {
                    let w = self.w_row[row];
                    if w == 0.0 {
                        continue;
                    }
                    // Softmax over the K−1 free channels plus the implicit
                    // reference channel (η_0 ≡ 0). LSE includes the 0 term.
                    let mut max_eta = 0.0_f64;
                    for c in 0..n_eta {
                        if eta[[row, c]] > max_eta {
                            max_eta = eta[[row, c]];
                        }
                    }
                    let mut denom = (0.0 - max_eta).exp();
                    for c in 0..n_eta {
                        denom += (eta[[row, c]] - max_eta).exp();
                    }
                    let lse = max_eta + denom.ln();
                    let label = self.y[row] as usize;
                    // log p(y) = η_y − lse, with η_0 = 0 for the reference class.
                    let eta_y = if label == 0 {
                        0.0
                    } else {
                        eta[[row, label - 1]]
                    };
                    nll += w * (lse - eta_y);
                    // dNLL/dη_c = p_c − 1{y = c+1}, for free channel c (class c+1).
                    for c in 0..n_eta {
                        let p_c = (eta[[row, c]] - lse).exp();
                        let indicator = if label == c + 1 { 1.0 } else { 0.0 };
                        let r = w * (p_c - indicator);
                        let base = c * (1 + d);
                        grad_coeffs[base] += r;
                        for axis in 0..d {
                            grad_coeffs[base + 1 + axis] += r * t[[row, axis]];
                            grad_t[[row, axis]] += r * coeffs[base + 1 + axis];
                        }
                    }
                }
            }
        }
        Ok((nll, grad_coeffs, grad_t))
    }

    /// Per-row, per-channel head working weights `s[n, c]` — the diagonal of the
    /// Fisher information in η-space (`p(1−p)` logistic, `p_c(1−p_c)` softmax
    /// diagonal). These weight the score-influence directions the leakage
    /// absorber appends to the reconstruction design (the realized label-channel
    /// leverage per row), times the row's head weight so unlabeled rows
    /// contribute nothing to the absorber, exactly as they contribute nothing
    /// to the head likelihood.
    pub fn head_working_weights(
        &self,
        t: ArrayView2<'_, f64>,
        coeffs: ArrayView1<'_, f64>,
    ) -> Result<Array2<f64>, String> {
        let (n, d) = t.dim();
        let n_eta = self.family.n_eta_channels();
        if coeffs.len() != n_eta * (1 + d) {
            return Err("BehavioralHead::head_working_weights: coeff length mismatch".to_string());
        }
        let eta = self.eta(t, coeffs);
        let mut s = Array2::<f64>::zeros((n, n_eta));
        match self.family {
            AuxOutcomeFamily::Binomial => {
                for row in 0..n {
                    let p = 1.0 / (1.0 + (-eta[[row, 0]]).exp());
                    s[[row, 0]] = self.w_row[row] * p * (1.0 - p);
                }
            }
            AuxOutcomeFamily::Multinomial { .. } => {
                for row in 0..n {
                    let mut max_eta = 0.0_f64;
                    for c in 0..n_eta {
                        if eta[[row, c]] > max_eta {
                            max_eta = eta[[row, c]];
                        }
                    }
                    let mut denom = (0.0 - max_eta).exp();
                    for c in 0..n_eta {
                        denom += (eta[[row, c]] - max_eta).exp();
                    }
                    let lse = max_eta + denom.ln();
                    for c in 0..n_eta {
                        let p_c = (eta[[row, c]] - lse).exp();
                        s[[row, c]] = self.w_row[row] * p_c * (1.0 - p_c);
                    }
                }
            }
        }
        Ok(s)
    }
}

/// The #461 Neyman-orthogonal leakage absorber for the behavioral head.
///
/// # The boundary it enforces
///
/// You want the behavioral channel to *orient* the existing manifold (fix the
/// frame) but *not* to *invent* geometry absent from `p(x)` (sculpt a manifold
/// to fit the label). The orthogonalization is precisely the boundary between
/// those two — between "orient what's there" and "hallucinate structure from
/// the label." Getting it exactly right is the single most important statistical
/// content of the whole construction.
///
/// # Mechanism (mirrors the survival/BMS install)
///
/// The head's score-influence directions in latent-code space are the rows of
/// the *score-influence Jacobian*
///
/// ```text
///   Z[n, :] = √s_n · ∂η_n/∂t_n = √s_n · w   (per η-channel)
/// ```
///
/// i.e. the realized, per-row, Fisher-weighted directions along which a change
/// in the latent codes moves the label-channel linear predictor. We
/// orthonormalize their span (thin QR) to obtain the *label-channel subspace*
/// `Q` in latent-code space. The dictionary (reconstruction) update is then
/// projected onto the orthogonal complement of `Q`:
///
/// ```text
///   Δt_recon  ←  (I − Q Qᵀ) Δt_recon
/// ```
///
/// so the reconstruction channel can only move the codes in directions the
/// label channel does *not* already explain. Equivalently, the reconstruction
/// design is widened with `Q` as a null-penalized absorbed block, making the
/// dictionary's estimating equation orthogonal to `span(Q)` — the label channel
/// orients the frame, but cannot drag the dictionary toward encoding the label.
#[derive(Debug, Clone)]
pub struct LeakageAbsorber {
    /// Orthonormal basis `Q ∈ ℝ^{d × r}` of the label-channel subspace in
    /// latent-code space (`r ≤ min(d, n_eta)`). The reconstruction update is
    /// projected onto `range(Q)^⊥`.
    q: Array2<f64>,
}

impl LeakageAbsorber {
    /// Build the absorber from the head's score-influence Jacobian.
    ///
    /// `score_influence` is `(n × (d · n_eta))`: for each row, the stacked
    /// per-channel Fisher-weighted direction `√s_{n,c} · w_c` flattened over
    /// channels. We take the *column* span (the directions in latent-code space
    /// the label channel is sensitive to), reduced to an orthonormal basis via
    /// thin SVD with a relative tolerance, so a rank-deficient or vacuous label
    /// channel yields a low-rank (possibly empty) `Q` and the absorber is a
    /// no-op — the honest behavior when the label pins little gauge.
    pub fn from_score_influence(
        score_influence: ArrayView2<'_, f64>,
        latent_dim: usize,
    ) -> Result<Self, String> {
        let (n, cols) = score_influence.dim();
        if latent_dim == 0 {
            return Ok(Self {
                q: Array2::<f64>::zeros((0, 0)),
            });
        }
        if cols % latent_dim != 0 {
            return Err(format!(
                "LeakageAbsorber: score_influence has {cols} columns, not a multiple of \
                 latent_dim {latent_dim}"
            ));
        }
        let n_eta = cols / latent_dim;
        // Accumulate the (d × d) latent-space Gram of the label-channel
        // directions: G = Σ_n Σ_c v_{n,c} v_{n,c}ᵀ where v_{n,c} ∈ ℝ^d is the
        // c-th channel's score-influence direction for row n.
        let mut gram = Array2::<f64>::zeros((latent_dim, latent_dim));
        for row in 0..n {
            for c in 0..n_eta {
                let base = c * latent_dim;
                for i in 0..latent_dim {
                    let vi = score_influence[[row, base + i]];
                    for j in 0..latent_dim {
                        gram[[i, j]] += vi * score_influence[[row, base + j]];
                    }
                }
            }
        }
        // Eigenvectors of G via SVD (G symmetric PSD) give the principal
        // label-channel directions; keep those above a relative tolerance.
        let (u_opt, sv, _vt) = gram
            .svd(true, false)
            .map_err(|e| format!("LeakageAbsorber: SVD of label-channel Gram failed: {e}"))?;
        let u = u_opt.ok_or_else(|| "LeakageAbsorber: SVD did not return U".to_string())?;
        let max_sv = sv.iter().cloned().fold(0.0_f64, f64::max);
        let tol = max_sv * (latent_dim as f64) * f64::EPSILON;
        let rank = sv.iter().filter(|&&s| s > tol).count();
        let mut q = Array2::<f64>::zeros((latent_dim, rank));
        for col in 0..rank {
            for r in 0..latent_dim {
                q[[r, col]] = u[[r, col]];
            }
        }
        Ok(Self { q })
    }

    /// Rank of the absorbed label-channel subspace (`r`). Zero ⇒ the absorber
    /// is a no-op (the label channel pins no direction the dictionary must be
    /// orthogonalized against).
    pub fn rank(&self) -> usize {
        self.q.ncols()
    }

    /// Orthonormal basis `Q` of the absorbed subspace (`d × r`).
    pub fn basis(&self) -> ArrayView2<'_, f64> {
        self.q.view()
    }

    /// Project a reconstruction-channel latent update `Δt` (`n × d`) onto the
    /// orthogonal complement of the label-channel subspace, in-place semantics
    /// returned as a fresh array. This is the operative orthogonalization: the
    /// dictionary update keeps only the component the label channel does not
    /// already explain.
    pub fn orthogonalize_recon_update(&self, delta_t: ArrayView2<'_, f64>) -> Array2<f64> {
        let (_, d) = delta_t.dim();
        if self.q.ncols() == 0 || self.q.nrows() != d {
            return delta_t.to_owned();
        }
        // Δt − (Δt Q) Qᵀ.
        let proj_coords = delta_t.dot(&self.q); // (n × r)
        let proj = proj_coords.dot(&self.q.t()); // (n × d)
        let mut out = delta_t.to_owned();
        out -= &proj;
        out
    }

    /// The absorbed block to *append* to the reconstruction design: per-row
    /// rows `Z_infl[n, :] = Δ`-direction in latent-code space, here the
    /// row's latent code projected onto `Q` — the realized label-channel
    /// leverage. Width `r`. This mirrors the survival/BMS install where the
    /// realized Stage-1 leakage directions are appended as a null-penalized
    /// absorbed block, making the β estimating equation orthogonal to its span.
    pub fn absorbed_design_block(&self, t: ArrayView2<'_, f64>) -> Array2<f64> {
        if self.q.ncols() == 0 {
            return Array2::<f64>::zeros((t.nrows(), 0));
        }
        t.dot(&self.q)
    }
}

/// Per-feature (per-atom) behavioral-significance report for the head.
#[derive(Debug, Clone)]
pub struct HeadFeatureSignificance {
    /// Wald statistic per latent axis (feature).
    pub statistic: Vec<f64>,
    /// Raw p-value per latent axis.
    pub p_value: Vec<f64>,
    /// Indices of features rejected by e-BH at the chosen FDR level — the
    /// features whose behavioral loading is statistically real after the
    /// multiplicity correction. These are the reportable behaviorally-anchored
    /// atoms.
    pub fdr_rejected: Vec<usize>,
    /// The FDR level the rejection set was computed at.
    pub alpha: f64,
}

/// Per-feature significance of the head's behavioral loadings, with FDR control.
///
/// For each latent axis (feature) `k`, the behavioral loading is the head
/// coefficient `w_{c,k}` across η-channels. We test the null `w_{·,k} = 0`
/// (the atom carries no behavioral signal) with [`wood_smooth_test`] on the
/// coefficient block for that axis, using the head's posterior covariance.
/// Raw p-values are converted to e-values (`e = 1/p`, the calibrated
/// p-to-e transform) and fed to [`e_benjamini_hochberg`] for an FDR-controlled
/// rejection set that needs no independence assumption across atoms — exactly
/// the case BH cannot legally handle when atoms share tokens.
///
/// `coeffs` is the fitted head coefficient vector (`n_eta · (1 + d)`),
/// `covariance` its posterior covariance of the same size, `latent_dim` is `d`,
/// `n_eta` the channel count, and `alpha` the target FDR.
pub fn head_feature_significance(
    coeffs: ArrayView1<'_, f64>,
    covariance: &Array2<f64>,
    latent_dim: usize,
    n_eta: usize,
    residual_df: f64,
    alpha: f64,
) -> Result<HeadFeatureSignificance, String> {
    let block = 1 + latent_dim;
    if coeffs.len() != n_eta * block {
        return Err(format!(
            "head_feature_significance: coeffs length {} != n_eta·(1+d) = {}",
            coeffs.len(),
            n_eta * block
        ));
    }
    if covariance.nrows() != coeffs.len() || covariance.ncols() != coeffs.len() {
        return Err(format!(
            "head_feature_significance: covariance must be {0}×{0}, got {1}×{2}",
            coeffs.len(),
            covariance.nrows(),
            covariance.ncols()
        ));
    }
    let beta = coeffs.to_owned();
    let mut statistic = Vec::with_capacity(latent_dim);
    let mut p_value = Vec::with_capacity(latent_dim);
    let mut log_e_values = Vec::with_capacity(latent_dim);
    for axis in 0..latent_dim {
        // Gather the coefficient indices for this axis across all channels:
        // for channel c the loading is at `c·block + 1 + axis`. wood_smooth_test
        // tests a contiguous range, so for the common Binomial (n_eta = 1) case
        // the range is a single coefficient; for multinomial we test each
        // channel's loading and combine by the strongest evidence (min p).
        let mut best_p = 1.0_f64;
        let mut best_stat = 0.0_f64;
        for c in 0..n_eta {
            let idx = c * block + 1 + axis;
            let input = SmoothTestInput {
                beta: beta.view(),
                covariance,
                influence_matrix: None,
                coeff_range: idx..idx + 1,
                edf: 1.0,
                nullspace_dim: 1,
                residual_df,
                scale: SmoothTestScale::Estimated,
            };
            if let Some(res) = wood_smooth_test(input) {
                if res.p_value < best_p {
                    best_p = res.p_value;
                    best_stat = res.statistic;
                }
            }
        }
        statistic.push(best_stat);
        p_value.push(best_p);
        // p-to-e calibration: e = 1/p is a valid (conservative) e-value for a
        // p-value uniform under the null. Clamp to avoid log(0)/∞ on p = 0.
        let p_clamped = best_p.clamp(1e-300, 1.0);
        log_e_values.push(-p_clamped.ln());
    }
    let fdr_rejected = e_benjamini_hochberg(&log_e_values, alpha);
    Ok(HeadFeatureSignificance {
        statistic,
        p_value,
        fdr_rejected,
        alpha,
    })
}

/// Reduce a per-row score-influence Jacobian `(n × d)` to a rank-truncated
/// orthonormal basis of its latent-code column span. Convenience used by the
/// absorber install site when the caller already has the `(n × d)` Jacobian
/// for a single η-channel (the common Binomial case). Returns `Q ∈ ℝ^{d × r}`,
/// `r` = numerical rank of the column span (rank-deficient inputs yield a
/// low-rank `Q`, so a vacuous label channel orthogonalizes against nothing).
pub fn orthonormal_span(jacobian: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
    let (n, d) = jacobian.dim();
    if d == 0 || n == 0 {
        return Ok(Array2::<f64>::zeros((d, 0)));
    }
    // The right singular vectors of `J` span its row space = the column span
    // of `Jᵀ` in ℝ^d. Equivalently the eigenvectors of the d × d Gram `JᵀJ`;
    // route through the SVD bridge for a stable, rank-truncated basis.
    let gram = jacobian.t().dot(&jacobian);
    let (u_opt, sv, _vt) = gram
        .svd(true, false)
        .map_err(|e| format!("orthonormal_span: SVD failed: {e}"))?;
    let u = u_opt.ok_or_else(|| "orthonormal_span: SVD did not return U".to_string())?;
    let max_sv = sv.iter().cloned().fold(0.0_f64, f64::max);
    let tol = max_sv * (d as f64) * f64::EPSILON;
    let rank = sv.iter().filter(|&&s| s > tol).count();
    let mut q = Array2::<f64>::zeros((d, rank));
    for col in 0..rank {
        for r in 0..d {
            q[[r, col]] = u[[r, col]];
        }
    }
    Ok(q)
}