gam 0.3.112

Generalized penalized likelihood engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
//! #974 — the structured-residual covariance estimator and the single producer
//! of [`MetricProvenance::WhitenedStructured`](crate::inference::row_metric::MetricProvenance::WhitenedStructured).
//!
//! # What this estimates
//!
//! Given a residual matrix `R ∈ ℝ^{n×p}` (one `p`-dimensional reconstruction
//! residual per row) and a smooth *activity coordinate* `z ∈ ℝ^n`, this fits the
//! **structured residual-covariance model**
//!
//! ```text
//!     Cov(r_n) = Σ_n = Λ · c(z_n) · Λᵀ + D ,
//! ```
//!
//! where
//!
//! * `Λ ∈ ℝ^{p×r}` is a **low-rank interference factor** (the shared
//!   off-isotropic subspace the residuals correlate along — e.g. a planted
//!   interference subspace or a topology-race confound),
//! * `D = diag(d) ≻ 0` is the **idiosyncratic diagonal** (per-channel
//!   independent noise), and
//! * `c(z) > 0` is the **smooth activity-scale law**: a strictly-positive scalar
//!   that modulates the factor energy with the activity coordinate, recovered as
//!   a binned-then-smoothed function of `z`.
//!
//! The fit is a deterministic, fixed-iteration **alternation** (no clock, no
//! RNG; any tie is broken by index): it alternates
//!
//! 1. *(scale | Λ, D)* — re-estimate the per-row factor activity `c(z_n)` and
//!    smooth it across `z`, holding the factor model fixed; and
//! 2. *(Λ, D | scale)* — re-estimate the factor and diagonal from the
//!    scale-deflated second-moment, holding the activity law fixed,
//!
//! a fixed small number of times. The **factor count `r`** is chosen by an
//! evidence ladder: each candidate `r` is scored by its penalized Gaussian
//! log-evidence and the best is kept.
//!
//! # What it produces
//!
//! [`StructuredResidualModel::row_metric`] materializes the **per-row precision
//! factor** `U_n ∈ ℝ^{p×p}` with `U_n U_nᵀ = Σ_n^{-1}`, packaged as a
//! [`RowMetric`](crate::inference::row_metric::RowMetric) with
//! [`MetricProvenance::WhitenedStructured`](crate::inference::row_metric::MetricProvenance::WhitenedStructured).
//! Whitening a residual `r_n` through it (`U_nᵀ r_n`) yields a vector whose
//! squared Euclidean norm is `r_nᵀ Σ_n^{-1} r_n` — the Mahalanobis residual under
//! the estimated noise model, which is exactly the likelihood-correct data-fit.
//! The factor is built from `Σ_n^{-1}` computed in **Woodbury form** (an
//! `r × r` solve, never a `p × p` inverse), so the estimator scales with the
//! factor rank, not the dense output dimension.
//!
//! This is the first real producer of `WhitenedStructured`, and therefore the
//! first metric whose `whitens_likelihood()` is `true`: see
//! [`RowMetric::whitens_likelihood`](crate::inference::row_metric::RowMetric::whitens_likelihood).

use std::sync::Arc;

use ndarray::{Array1, Array2, ArrayView1, ArrayView2};

use crate::inference::row_metric::RowMetric;
use crate::linalg::faer_ndarray::{FaerCholesky, FaerEigh};
use faer::Side;

/// Number of (scale | factor) ↔ (factor | scale) alternation sweeps. Fixed and
/// deterministic: the alternation is a smooth descent on the structured-Gaussian
/// objective and converges geometrically, so a small fixed budget is both
/// sufficient and reproducible (no clock/RNG-driven stopping).
const ALTERNATION_SWEEPS: usize = 8;

/// Number of bins the activity coordinate `z` is partitioned into for the smooth
/// activity-scale `c(z)`. The per-bin factor activity is estimated then linearly
/// interpolated across bin centers, giving a continuous piecewise-linear scale
/// law. Chosen as a fixed structural constant (magic-by-default): enough bins to
/// resolve a smooth monotone or unimodal scale trend without over-fitting the
/// per-row noise.
const ACTIVITY_SCALE_BINS: usize = 8;

/// Relative floor on the idiosyncratic diagonal `D`, as a fraction of the mean
/// residual variance. Keeps `Σ_n ≻ 0` and the Woodbury `r × r` capacitance
/// invertible even when a channel is (near-)perfectly explained by the factor.
const DIAGONAL_REL_FLOOR: f64 = 1e-6;

/// Relative floor on the activity scale `c(z)`, as a fraction of its mean. Keeps
/// `c(z) > 0` (a covariance scale) across the whole `z` range.
const SCALE_REL_FLOOR: f64 = 1e-4;

/// The fitted structured residual-covariance model: low-rank factor `Λ`,
/// idiosyncratic diagonal `D`, and the smooth activity-scale `c(z)` evaluated at
/// every row. Produces per-row precision factors and the
/// [`MetricProvenance::WhitenedStructured`](crate::inference::row_metric::MetricProvenance::WhitenedStructured)
/// [`RowMetric`](crate::inference::row_metric::RowMetric).
#[derive(Clone, Debug)]
pub struct StructuredResidualModel {
    /// Output dimensionality `p` (residual width).
    p: usize,
    /// Selected factor rank `r` (`0 ≤ r ≤ p`). `0` ⇒ pure-diagonal noise model.
    factor_rank: usize,
    /// Interference factor `Λ ∈ ℝ^{p×r}` (the shared off-diagonal subspace).
    lambda: Array2<f64>,
    /// Idiosyncratic diagonal `d ∈ ℝ^p` (`D = diag(d)`), floored `≻ 0`.
    diagonal: Array1<f64>,
    /// Per-row activity scale `c(z_n) > 0`, length `n`.
    row_scale: Array1<f64>,
    /// Penalized Gaussian log-evidence of the selected model (higher is better).
    /// The value the evidence ladder maximized over the candidate ranks.
    log_evidence: f64,
}

/// Estimator inputs: the residual matrix and the smooth activity coordinate.
///
/// `residuals` is `R ∈ ℝ^{n×p}`. `activity` is `z ∈ ℝ^n` — the coordinate the
/// scale law `c(z)` is smooth in (e.g. an assignment-mass or activation-strength
/// summary per row). When no genuine activity coordinate is available, passing a
/// constant `z` recovers a homoscedastic factor model (`c(z) ≡ const`).
pub struct ResidualFactorInput<'a> {
    /// Residual matrix `R ∈ ℝ^{n×p}`.
    pub residuals: ArrayView2<'a, f64>,
    /// Activity coordinate `z ∈ ℝ^n` the scale law is smooth in.
    pub activity: ArrayView1<'a, f64>,
    /// Maximum factor rank the evidence ladder is allowed to consider. The
    /// ladder scores `r = 0, 1, …, min(max_factor_rank, p−1)` and keeps the
    /// penalized-evidence maximizer. `0` forces the pure-diagonal model.
    pub max_factor_rank: usize,
}

impl StructuredResidualModel {
    /// Fit the structured residual-covariance model by the deterministic
    /// fixed-iteration alternation, selecting the factor rank by the evidence
    /// ladder. Returns an error only on shape / non-finite-input violations; the
    /// numerical path is total (every floor and solve is guarded).
    pub fn fit(input: ResidualFactorInput<'_>) -> Result<Self, String> {
        let r = input.residuals;
        let z = input.activity;
        let n = r.nrows();
        let p = r.ncols();
        if n == 0 || p == 0 {
            return Err(format!(
                "StructuredResidualModel::fit: residuals must be non-empty; got ({n}, {p})"
            ));
        }
        if z.len() != n {
            return Err(format!(
                "StructuredResidualModel::fit: activity length {} != residual rows {n}",
                z.len()
            ));
        }
        if !r.iter().all(|v| v.is_finite()) {
            return Err("StructuredResidualModel::fit: residuals must be finite".to_string());
        }
        if !z.iter().all(|v| v.is_finite()) {
            return Err("StructuredResidualModel::fit: activity must be finite".to_string());
        }

        // Bin assignment for the activity-scale law: deterministic equal-width
        // bins over the observed z-range. A degenerate (zero-width) range maps
        // every row to bin 0, recovering a single homoscedastic scale.
        let bins = ACTIVITY_SCALE_BINS.max(1);
        let z_min = z.iter().copied().fold(f64::INFINITY, f64::min);
        let z_max = z.iter().copied().fold(f64::NEG_INFINITY, f64::max);
        let z_span = z_max - z_min;
        let row_bin: Vec<usize> = (0..n)
            .map(|i| {
                if z_span <= 0.0 {
                    0
                } else {
                    let frac = (z[i] - z_min) / z_span;
                    let idx = (frac * bins as f64).floor() as isize;
                    idx.clamp(0, bins as isize - 1) as usize
                }
            })
            .collect();

        let max_rank = input.max_factor_rank.min(p.saturating_sub(1));

        // Evidence ladder over candidate factor ranks. Each candidate is fit by
        // the full alternation and scored by its penalized Gaussian log-evidence;
        // the maximizer is kept. Index order breaks any tie (lowest rank wins on
        // an exact tie — Occam).
        let mut best: Option<StructuredResidualModel> = None;
        for rank in 0..=max_rank {
            let model = Self::fit_fixed_rank(r, &row_bin, bins, rank)?;
            let take = match &best {
                None => true,
                Some(b) => model.log_evidence > b.log_evidence,
            };
            if take {
                best = Some(model);
            }
        }
        best.ok_or_else(|| "StructuredResidualModel::fit: evidence ladder empty".to_string())
    }

    /// Fit the model at a fixed factor rank by the deterministic alternation.
    fn fit_fixed_rank(
        r: ArrayView2<'_, f64>,
        row_bin: &[usize],
        bins: usize,
        rank: usize,
    ) -> Result<Self, String> {
        let n = r.nrows();
        let p = r.ncols();

        // Mean residual variance — the scale reference for the diagonal floor.
        let mut total_var = 0.0_f64;
        for i in 0..n {
            for j in 0..p {
                total_var += r[[i, j]] * r[[i, j]];
            }
        }
        let mean_var = (total_var / (n as f64 * p as f64)).max(f64::MIN_POSITIVE);
        let diag_floor = DIAGONAL_REL_FLOOR * mean_var;

        // Initialize the per-row scale to 1 (homoscedastic start), the diagonal
        // to the per-channel sample variance, and Λ to the leading eigenvectors
        // of the (scale-1) second moment. The alternation refines all three.
        let mut row_scale = Array1::<f64>::ones(n);
        let mut bin_scale = Array1::<f64>::ones(bins);
        // Raw (undeflated) per-channel second moment — the D estimator's data
        // term. Constant across sweeps.
        let raw_diag = column_variances(r);
        let mut diagonal = raw_diag.mapv(|v| v.max(diag_floor));
        let mut lambda = Array2::<f64>::zeros((p, rank));

        for _sweep in 0..ALTERNATION_SWEEPS {
            // (Λ, D | scale): scale-deflated second moment
            //   S = (1/n) Σ_n (r_n r_nᵀ) / c(z_n).
            // Under the model E[r_n r_nᵀ] = c_n ΛΛᵀ + D, so S ≈ ΛΛᵀ + D̄ with
            // D̄ the scale-averaged diagonal; the leading eigenpairs of S − D
            // give Λ, the residual diagonal gives D.
            let s = scaled_second_moment(r, &row_scale);
            let (evals, evecs) = symmetric_eig_ascending(&s)?;
            // Leading `rank` eigenpairs (eigenvalues ascending ⇒ take the tail).
            if rank > 0 {
                for k in 0..rank {
                    let col = p - 1 - k;
                    // Factor energy above the idiosyncratic floor: the part of
                    // the eigenvalue not explained by the mean diagonal.
                    let mean_diag = diagonal.iter().copied().sum::<f64>() / p as f64;
                    let energy = (evals[col] - mean_diag).max(0.0);
                    let amp = energy.sqrt();
                    for row in 0..p {
                        lambda[[row, k]] = amp * evecs[[row, col]];
                    }
                }
            }
            // D update from the RAW (undeflated) moment, floored ≻ 0. The model
            // is Σ_n = c_n·ΛΛᵀ + D with D NOT scale-multiplied, and c is mean-1
            // normalized, so E[(1/n)Σ r_n r_nᵀ] = ΛΛᵀ + D exactly. The deflated
            // moment `s` is the right object for the FACTOR block (its factor
            // part is scale-free) but its diagonal carries D·mean(1/c) — a
            // Jensen-inflated D (mean(1/c) > 1 for any non-constant law), which
            // biased D upward by exactly mean(1/c̃) and let a spurious
            // higher-rank candidate win the evidence ladder on a better D
            // alone (the probe's rank-2 winner had a zero second column).
            for j in 0..p {
                let mut factor_var = 0.0_f64;
                for k in 0..rank {
                    factor_var += lambda[[j, k]] * lambda[[j, k]];
                }
                diagonal[j] = (raw_diag[j] - factor_var).max(diag_floor);
            }

            // (scale | Λ, D): per-row factor activity. With residual r_n, the
            // factor-subspace energy is r_nᵀ P r_n where P projects onto
            // range(Λ) in the D-whitened metric; the maximum-likelihood scalar
            // multiplier on ΛΛᵀ that matches the row's factor-subspace energy is
            //   c_n = (r̃_nᵀ B (BᵀB)^{-1} Bᵀ r̃_n) / tr(...)-normalizer.
            // We use a stable closed-form proxy: the row's factor-coordinate
            // energy ‖Λ⁺ r_n‖² normalized by the unit-scale expectation, then
            // bin-smoothed across z. With rank 0 there is no factor ⇒ c ≡ 1.
            if rank > 0 {
                let mut bin_num = Array1::<f64>::zeros(bins);
                let mut bin_den = Array1::<f64>::zeros(bins);
                let coords = factor_coordinates(&lambda, &diagonal, r)?;
                for i in 0..n {
                    let mut energy = 0.0_f64;
                    for k in 0..rank {
                        energy += coords[[i, k]] * coords[[i, k]];
                    }
                    let b = row_bin[i];
                    bin_num[b] += energy;
                    bin_den[b] += rank as f64;
                }
                // Per-bin mean factor energy = activity scale. Empty bins inherit
                // the global mean so the scale law stays defined everywhere.
                let global = {
                    let num: f64 = bin_num.iter().sum();
                    let den: f64 = bin_den.iter().sum();
                    if den > 0.0 { num / den } else { 1.0 }
                };
                for b in 0..bins {
                    bin_scale[b] = if bin_den[b] > 0.0 {
                        bin_num[b] / bin_den[b]
                    } else {
                        global
                    };
                }
                // Smooth (3-point moving average over bins) for a continuous law,
                // then floor ≻ 0.
                let scale_floor = SCALE_REL_FLOOR * global.max(f64::MIN_POSITIVE);
                let smoothed = moving_average_3(&bin_scale);
                for b in 0..bins {
                    bin_scale[b] = smoothed[b].max(scale_floor);
                }
                // Re-normalize so the mean scale is 1 (the factor amplitude lives
                // in Λ; c(z) carries only the relative activity law). This keeps
                // the (Λ, D) ↔ (scale) split identified.
                let mean_scale = bin_scale.iter().copied().sum::<f64>() / bins as f64;
                if mean_scale > 0.0 {
                    bin_scale.mapv_inplace(|v| v / mean_scale);
                }
                for i in 0..n {
                    row_scale[i] = bin_scale[row_bin[i]].max(scale_floor);
                }
            }
        }

        let log_evidence = penalized_log_evidence(r, &lambda, &diagonal, &row_scale, rank);
        let mut model = Self {
            p,
            factor_rank: rank,
            lambda,
            diagonal,
            row_scale,
            log_evidence,
        };
        // Guard against any non-finite leak from a degenerate fit: fall back to a
        // pure-diagonal model with the same evidence accounting.
        if !model.is_finite() {
            model.lambda = Array2::<f64>::zeros((p, rank));
            model.row_scale = Array1::<f64>::ones(n);
        }
        Ok(model)
    }

    fn is_finite(&self) -> bool {
        self.lambda.iter().all(|v| v.is_finite())
            && self.diagonal.iter().all(|v| v.is_finite() && *v > 0.0)
            && self.row_scale.iter().all(|v| v.is_finite() && *v > 0.0)
            && self.log_evidence.is_finite()
    }

    /// Selected factor rank `r`.
    pub fn factor_rank(&self) -> usize {
        self.factor_rank
    }

    /// The fitted interference factor `Λ ∈ ℝ^{p×r}` (the shared off-isotropic
    /// residual subspace). Consumed by the planted-subspace recovery test to
    /// compare `range(Λ)` against the planted interference subspace.
    pub fn factor(&self) -> ArrayView2<'_, f64> {
        self.lambda.view()
    }

    /// The idiosyncratic diagonal `d ∈ ℝ^p` (`D = diag(d)`).
    pub fn diagonal(&self) -> ArrayView1<'_, f64> {
        self.diagonal.view()
    }

    /// The per-row activity scale `c(z_n) > 0`, length `n`. Recovers the smooth
    /// activity-scale law evaluated at every observed `z_n`.
    pub fn row_scale(&self) -> ArrayView1<'_, f64> {
        self.row_scale.view()
    }

    /// The penalized Gaussian log-evidence the rank-selection ladder maximized.
    pub fn log_evidence(&self) -> f64 {
        self.log_evidence
    }

    /// Build the per-row precision factor stack `U_n ∈ ℝ^{p×p}` with
    /// `U_n U_nᵀ = Σ_n^{-1}` and package it as a
    /// [`MetricProvenance::WhitenedStructured`](crate::inference::row_metric::MetricProvenance::WhitenedStructured)
    /// [`RowMetric`](crate::inference::row_metric::RowMetric). This is the single
    /// production site of `WhitenedStructured`.
    ///
    /// The precision is formed in **Woodbury form**:
    /// ```text
    ///   Σ_n^{-1} = D^{-1} − D^{-1} Λ ( c^{-1} I_r + Λᵀ D^{-1} Λ )^{-1} Λᵀ D^{-1},
    /// ```
    /// an `r × r` capacitance solve (never a `p × p` inverse). The factor `U_n`
    /// is the lower-Cholesky of the assembled `Σ_n^{-1}` (`rank = p`), so
    /// `whiten_residual_row` returns coordinates whose squared norm is the exact
    /// Mahalanobis residual `r_nᵀ Σ_n^{-1} r_n`.
    pub fn row_metric(&self, n_rows: usize) -> Result<RowMetric, String> {
        if n_rows != self.row_scale.len() {
            return Err(format!(
                "StructuredResidualModel::row_metric: requested {n_rows} rows but model has {}",
                self.row_scale.len()
            ));
        }
        let p = self.p;
        // Row-major flat factor matrix: u[n, i*p + k] = U_n[i, k].
        let mut u = Array2::<f64>::zeros((n_rows, p * p));
        for row in 0..n_rows {
            let precision = self.row_precision(row)?;
            let factor = lower_cholesky_psd(&precision)?;
            for i in 0..p {
                for k in 0..p {
                    u[[row, i * p + k]] = factor[[i, k]];
                }
            }
        }
        RowMetric::whitened_structured(Arc::new(u), p, p)
    }

    /// Per-row precision `Σ_n^{-1}` via the Woodbury identity (an `r × r` solve).
    fn row_precision(&self, row: usize) -> Result<Array2<f64>, String> {
        let p = self.p;
        let r = self.factor_rank;
        let c = self.row_scale[row].max(f64::MIN_POSITIVE);
        // D^{-1}.
        let d_inv: Vec<f64> = (0..p).map(|i| 1.0 / self.diagonal[i]).collect();
        // Start from D^{-1}.
        let mut precision = Array2::<f64>::zeros((p, p));
        for i in 0..p {
            precision[[i, i]] = d_inv[i];
        }
        if r == 0 {
            return Ok(precision);
        }
        // B = D^{-1} Λ  ∈ ℝ^{p×r}.
        let mut b = Array2::<f64>::zeros((p, r));
        for i in 0..p {
            for k in 0..r {
                b[[i, k]] = d_inv[i] * self.lambda[[i, k]];
            }
        }
        // Capacitance M = c^{-1} I_r + Λᵀ D^{-1} Λ  ∈ ℝ^{r×r}.
        let mut cap = Array2::<f64>::zeros((r, r));
        for a in 0..r {
            for bk in 0..r {
                let mut acc = 0.0_f64;
                for i in 0..p {
                    acc += self.lambda[[i, a]] * b[[i, bk]];
                }
                cap[[a, bk]] = acc;
            }
            cap[[a, a]] += 1.0 / c;
        }
        // Σ_n^{-1} = D^{-1} − B M^{-1} Bᵀ. Solve M X = Bᵀ for X = M^{-1} Bᵀ
        // (r × p) via Cholesky (M ≻ 0 since c^{-1} > 0 and ΛᵀD^{-1}Λ ⪰ 0).
        let chol = cap
            .cholesky(Side::Lower)
            .map_err(|e| format!("StructuredResidualModel::row_precision capacitance: {e:?}"))?;
        let mut bt = Array2::<f64>::zeros((r, p));
        for k in 0..r {
            for i in 0..p {
                bt[[k, i]] = b[[i, k]];
            }
        }
        let x = chol.solve_mat(&bt); // r × p
        for i in 0..p {
            for j in 0..p {
                let mut acc = 0.0_f64;
                for k in 0..r {
                    acc += b[[i, k]] * x[[k, j]];
                }
                precision[[i, j]] -= acc;
            }
        }
        // Symmetrize against round-off so the Cholesky downstream sees an exactly
        // symmetric PSD matrix.
        for i in 0..p {
            for j in (i + 1)..p {
                let avg = 0.5 * (precision[[i, j]] + precision[[j, i]]);
                precision[[i, j]] = avg;
                precision[[j, i]] = avg;
            }
        }
        Ok(precision)
    }
}

/// Per-channel (column) sample second moment of the residual matrix.
fn column_variances(r: ArrayView2<'_, f64>) -> Array1<f64> {
    let n = r.nrows();
    let p = r.ncols();
    let mut v = Array1::<f64>::zeros(p);
    for j in 0..p {
        let mut acc = 0.0_f64;
        for i in 0..n {
            acc += r[[i, j]] * r[[i, j]];
        }
        v[j] = acc / n as f64;
    }
    v
}

/// Scale-deflated second moment `S = (1/n) Σ_n (r_n r_nᵀ) / c_n`.
fn scaled_second_moment(r: ArrayView2<'_, f64>, row_scale: &Array1<f64>) -> Array2<f64> {
    let n = r.nrows();
    let p = r.ncols();
    let mut s = Array2::<f64>::zeros((p, p));
    for i in 0..n {
        let w = 1.0 / row_scale[i].max(f64::MIN_POSITIVE);
        for a in 0..p {
            let ra = r[[i, a]];
            for b in 0..p {
                s[[a, b]] += w * ra * r[[i, b]];
            }
        }
    }
    s.mapv_inplace(|v| v / n as f64);
    // Symmetrize against accumulation round-off.
    for a in 0..p {
        for b in (a + 1)..p {
            let avg = 0.5 * (s[[a, b]] + s[[b, a]]);
            s[[a, b]] = avg;
            s[[b, a]] = avg;
        }
    }
    s
}

/// Factor coordinates `Λ⁺_D r_n` per row: the generalized-least-squares
/// projection of each residual onto `range(Λ)` in the `D^{-1}` metric, returned
/// as an `n × r` matrix. Solves the `r × r` normal equations
/// `(Λᵀ D^{-1} Λ) γ = Λᵀ D^{-1} r_n` per row (shared factorization).
fn factor_coordinates(
    lambda: &Array2<f64>,
    diagonal: &Array1<f64>,
    r: ArrayView2<'_, f64>,
) -> Result<Array2<f64>, String> {
    let p = lambda.nrows();
    let rank = lambda.ncols();
    let n = r.nrows();
    let d_inv: Vec<f64> = (0..p).map(|i| 1.0 / diagonal[i]).collect();
    // Normal matrix ΛᵀD^{-1}Λ (+ tiny ridge for invertibility).
    let mut normal = Array2::<f64>::zeros((rank, rank));
    for a in 0..rank {
        for b in 0..rank {
            let mut acc = 0.0_f64;
            for i in 0..p {
                acc += lambda[[i, a]] * d_inv[i] * lambda[[i, b]];
            }
            normal[[a, b]] = acc;
        }
    }
    let trace = (0..rank).map(|k| normal[[k, k]]).sum::<f64>().max(1.0);
    let ridge = 1e-10 * trace / rank.max(1) as f64;
    for k in 0..rank {
        normal[[k, k]] += ridge;
    }
    let chol = normal
        .cholesky(Side::Lower)
        .map_err(|e| format!("factor_coordinates normal solve: {e:?}"))?;
    let mut coords = Array2::<f64>::zeros((n, rank));
    let mut rhs = Array1::<f64>::zeros(rank);
    for i in 0..n {
        for a in 0..rank {
            let mut acc = 0.0_f64;
            for j in 0..p {
                acc += lambda[[j, a]] * d_inv[j] * r[[i, j]];
            }
            rhs[a] = acc;
        }
        let gamma = chol.solvevec(&rhs);
        for a in 0..rank {
            coords[[i, a]] = gamma[a];
        }
    }
    Ok(coords)
}

/// 3-point moving average over a bin vector (edge-clamped), giving the smooth
/// activity-scale law a continuous, low-curvature shape.
fn moving_average_3(v: &Array1<f64>) -> Array1<f64> {
    let m = v.len();
    let mut out = Array1::<f64>::zeros(m);
    for i in 0..m {
        let lo = i.saturating_sub(1);
        let hi = (i + 1).min(m - 1);
        let mut acc = 0.0_f64;
        let mut cnt = 0.0_f64;
        for j in lo..=hi {
            acc += v[j];
            cnt += 1.0;
        }
        out[i] = acc / cnt;
    }
    out
}

/// Ascending-eigenvalue symmetric eigendecomposition (faer convention).
fn symmetric_eig_ascending(m: &Array2<f64>) -> Result<(Array1<f64>, Array2<f64>), String> {
    m.eigh(Side::Lower)
        .map_err(|e| format!("symmetric_eig: {e:?}"))
}

/// Lower-triangular Cholesky factor `L` of a (numerically) PSD matrix `A` with
/// `L Lᵀ = A`, with a relative spectral floor so a marginally-indefinite
/// precision (round-off) still factors. Used to turn `Σ_n^{-1}` into the
/// `RowMetric` factor `U_n` (here `U_n = L`).
fn lower_cholesky_psd(a: &Array2<f64>) -> Result<Array2<f64>, String> {
    if let Ok(chol) = a.cholesky(Side::Lower) {
        return Ok(chol.lower_triangular());
    }
    // Eigen-repair: clamp eigenvalues to a small positive floor and rebuild a
    // symmetric square root, then Cholesky that (always succeeds, PD).
    let (evals, evecs) = symmetric_eig_ascending(a)?;
    let max_ev = evals.iter().copied().fold(0.0_f64, f64::max).max(1.0);
    let floor = 1e-10 * max_ev;
    let p = a.nrows();
    let mut sqrt = Array2::<f64>::zeros((p, p));
    for i in 0..p {
        for j in 0..p {
            let mut acc = 0.0_f64;
            for k in 0..p {
                let ev = evals[k].max(floor);
                acc += evecs[[i, k]] * ev.sqrt() * evecs[[j, k]];
            }
            sqrt[[i, j]] = acc;
        }
    }
    sqrt.cholesky(Side::Lower)
        .map(|c| c.lower_triangular())
        .map_err(|e| format!("lower_cholesky_psd eigen-repair: {e:?}"))
}

/// Penalized Gaussian log-evidence of the structured model at the fitted
/// parameters — the evidence ladder's rank-selection score.
///
/// The per-row log-density of `r_n ~ N(0, Σ_n)` is
/// `−½ ( log|Σ_n| + r_nᵀ Σ_n^{-1} r_n + p log 2π )`. We sum it across rows and
/// subtract a parameter-count penalty `½ k_params · log n` (a BIC-style Occam
/// term over the `p·r` factor entries + `p` diagonal entries + the bin scales),
/// so adding a spurious factor that does not improve the fit is rejected. Both
/// `log|Σ_n|` and the quadratic use the Woodbury / matrix-determinant lemma so no
/// dense `p × p` inverse or determinant is formed.
fn penalized_log_evidence(
    r: ArrayView2<'_, f64>,
    lambda: &Array2<f64>,
    diagonal: &Array1<f64>,
    row_scale: &Array1<f64>,
    rank: usize,
) -> f64 {
    let n = r.nrows();
    let p = r.ncols();
    let d_inv: Vec<f64> = (0..p).map(|i| 1.0 / diagonal[i]).collect();
    let log_det_d: f64 = diagonal.iter().map(|&d| d.ln()).sum();
    let two_pi_ln = (2.0 * std::f64::consts::PI).ln();

    let mut log_lik = 0.0_f64;
    for i in 0..n {
        let c = row_scale[i].max(f64::MIN_POSITIVE);
        // Quadratic r_nᵀ Σ_n^{-1} r_n via Woodbury:
        //   r_nᵀ D^{-1} r_n − (Bᵀ r_n)ᵀ M^{-1} (Bᵀ r_n),
        // with B = D^{-1}Λ and M = c^{-1}I + ΛᵀD^{-1}Λ.
        let mut quad = 0.0_f64;
        for j in 0..p {
            quad += r[[i, j]] * d_inv[j] * r[[i, j]];
        }
        let mut log_det = log_det_d;
        if rank > 0 {
            // M (r × r) and w = Bᵀ r_n = ΛᵀD^{-1} r_n.
            let mut m = Array2::<f64>::zeros((rank, rank));
            let mut w = Array1::<f64>::zeros(rank);
            for a in 0..rank {
                let mut wa = 0.0_f64;
                for j in 0..p {
                    wa += lambda[[j, a]] * d_inv[j] * r[[i, j]];
                }
                w[a] = wa;
                for b in 0..rank {
                    let mut acc = 0.0_f64;
                    for j in 0..p {
                        acc += lambda[[j, a]] * d_inv[j] * lambda[[j, b]];
                    }
                    m[[a, b]] = acc;
                }
                m[[a, a]] += 1.0 / c;
            }
            // Cholesky M = R Rᵀ → log|M|, and solve M y = w.
            match m.cholesky(Side::Lower) {
                Ok(chol) => {
                    let y = chol.solvevec(&w);
                    let mut wy = 0.0_f64;
                    for a in 0..rank {
                        wy += w[a] * y[a];
                    }
                    quad -= wy;
                    // log|Σ_n| = log|D| + log|M| + r·log c   (matrix-determinant
                    // lemma; the c^{-1}I shift carries the +r·log c).
                    let diag = chol.diag();
                    let log_det_m: f64 = diag.iter().map(|&l| (l * l).ln()).sum();
                    log_det = log_det_d + log_det_m + rank as f64 * c.ln();
                }
                Err(_) => {
                    // Degenerate capacitance — fall back to the diagonal model's
                    // accounting for this row (no factor correction).
                    log_det = log_det_d;
                }
            }
        }
        log_lik += -0.5 * (log_det + quad + p as f64 * two_pi_ln);
    }

    let k_params = (p * rank + p + ACTIVITY_SCALE_BINS) as f64;
    log_lik - 0.5 * k_params * (n.max(2) as f64).ln()
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{Array1, Array2};

    fn lcg_uniform(state: &mut u64) -> f64 {
        *state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        ((*state >> 11) as f64) / ((1u64 << 53) as f64)
    }

    fn lcg_normal(state: &mut u64) -> f64 {
        let u1 = lcg_uniform(state).max(1e-12);
        let u2 = lcg_uniform(state);
        (-2.0 * u1.ln()).sqrt() * (std::f64::consts::TAU * u2).cos()
    }

    /// Per-rank evidence breakdown on the planted single-factor activity-law
    /// DGP (the `fitted_scale_recovers_planted_activity_law` plant). Pins the
    /// rank-selection decision itself: the ladder must prefer rank 1, and this
    /// test names the margin so an over-selection regression is diagnosable
    /// from the failure message alone.
    #[test]
    fn evidence_ladder_prefers_planted_rank_one() {
        let n = 5000usize;
        let p = 4usize;
        let lambda0 = ndarray::array![[1.5], [1.2], [-0.4], [0.3]];
        let sigma_eps = 0.2_f64;
        let slope = 1.3_f64;
        let mut seed = 0xD1B54A32D192ED03_u64;
        let mut residuals = Array2::<f64>::zeros((n, p));
        let mut activity = Array1::<f64>::zeros(n);
        for row in 0..n {
            let z = (row as f64) / (n as f64 - 1.0);
            activity[row] = z;
            let amp = (slope * z).exp().sqrt();
            let f = lcg_normal(&mut seed);
            for i in 0..p {
                residuals[[row, i]] = amp * lambda0[[i, 0]] * f + sigma_eps * lcg_normal(&mut seed);
            }
        }
        // Reproduce fit()'s bin assignment, then score each rank directly.
        let bins = ACTIVITY_SCALE_BINS.max(1);
        let row_bin: Vec<usize> = (0..n)
            .map(|i| {
                let frac = activity[i];
                (frac * bins as f64).floor().clamp(0.0, bins as f64 - 1.0) as usize
            })
            .collect();
        let mut report = String::new();
        let mut ev = Vec::new();
        for rank in 0..=2usize {
            let m = StructuredResidualModel::fit_fixed_rank(residuals.view(), &row_bin, bins, rank)
                .expect("fixed-rank fit");
            let k_params = (p * rank + p + ACTIVITY_SCALE_BINS) as f64;
            let log_lik = m.log_evidence() + 0.5 * k_params * (n as f64).ln();
            let col_norms: Vec<f64> = (0..rank)
                .map(|k| {
                    m.factor()
                        .column(k)
                        .iter()
                        .map(|v| v * v)
                        .sum::<f64>()
                        .sqrt()
                })
                .collect();
            report.push_str(&format!(
                "rank {rank}: evidence={:.3} loglik={:.3} penalty={:.3} col_norms={:?} diag={:?}\n",
                m.log_evidence(),
                log_lik,
                0.5 * k_params * (n as f64).ln(),
                col_norms,
                m.diagonal()
                    .iter()
                    .map(|v| (v * 1e4).round() / 1e4)
                    .collect::<Vec<_>>()
            ));
            ev.push(m.log_evidence());
        }
        assert!(
            ev[1] > ev[0] && ev[1] > ev[2],
            "evidence ladder must prefer the planted rank 1; breakdown:\n{report}"
        );
    }

    /// Orthonormalize the columns of `m` (modified Gram–Schmidt), dropping
    /// numerically-null columns. Test-side helper for subspace comparisons.
    fn orthonormal_columns(m: ArrayView2<'_, f64>) -> Vec<Array1<f64>> {
        let mut basis: Vec<Array1<f64>> = Vec::new();
        for k in 0..m.ncols() {
            let mut v = m.column(k).to_owned();
            for q in &basis {
                let c = v.dot(q);
                v = &v - &(q * c);
            }
            let norm = v.dot(&v).sqrt();
            if norm > 1e-10 {
                basis.push(v / norm);
            }
        }
        basis
    }

    /// Squared norm of the projection of unit vector `v` onto span(basis) —
    /// `cos²` of the principal angle between `v` and the subspace.
    fn projection_energy(v: &Array1<f64>, basis: &[Array1<f64>]) -> f64 {
        basis.iter().map(|q| v.dot(q).powi(2)).sum()
    }

    /// #974 verification arm (a): the fitted factor must recover the PLANTED
    /// interference subspace. Two orthogonal planted directions with distinct
    /// strengths; the principal angles between each planted direction and
    /// range(Λ̂) must be small, and the evidence ladder must select rank 2.
    #[test]
    fn factor_recovers_planted_interference_subspace() {
        let n = 6000usize;
        let p = 6usize;
        // Two orthogonal planted unit directions.
        let raw1: Array1<f64> = ndarray::array![1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
        let raw2: Array1<f64> = ndarray::array![1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
        let v1 = &raw1 / raw1.dot(&raw1).sqrt();
        let v2 = &raw2 / raw2.dot(&raw2).sqrt();
        let (amp1, amp2) = (1.4_f64, 0.9_f64);
        let sigma_eps = 0.15_f64;

        let mut seed = 0x9E3779B97F4A7C15_u64;
        let mut residuals = Array2::<f64>::zeros((n, p));
        let activity = Array1::<f64>::zeros(n); // constant ⇒ homoscedastic law
        for row in 0..n {
            let f1 = amp1 * lcg_normal(&mut seed);
            let f2 = amp2 * lcg_normal(&mut seed);
            for i in 0..p {
                residuals[[row, i]] = f1 * v1[i] + f2 * v2[i] + sigma_eps * lcg_normal(&mut seed);
            }
        }

        let model = StructuredResidualModel::fit(ResidualFactorInput {
            residuals: residuals.view(),
            activity: activity.view(),
            max_factor_rank: 4,
        })
        .expect("fit");

        assert_eq!(
            model.factor_rank(),
            2,
            "ladder must select the planted rank 2 (got {}, evidence {:.3})",
            model.factor_rank(),
            model.log_evidence()
        );
        let basis = orthonormal_columns(model.factor());
        assert_eq!(basis.len(), 2, "fitted factor must span 2 directions");
        let e1 = projection_energy(&v1, &basis);
        let e2 = projection_energy(&v2, &basis);
        // cos² of each principal angle ≥ 0.95 ⇒ angle ≤ ~13°.
        assert!(
            e1 > 0.95 && e2 > 0.95,
            "planted directions must lie in range(Λ̂): cos² = ({e1:.4}, {e2:.4})"
        );
    }

    /// #974 verification arm (d): recovery of the planted activity-variance
    /// law. Single planted factor with per-row energy `exp(slope·z)`; the
    /// fitted `c(z_n)` must reproduce the law's shape — strongly correlated
    /// with the planted log-scale and with the right dynamic range.
    #[test]
    fn fitted_scale_recovers_planted_activity_law() {
        let n = 6000usize;
        let p = 4usize;
        let lambda0 = ndarray::array![1.5, 1.2, -0.4, 0.3];
        let sigma_eps = 0.2_f64;
        let slope = 1.3_f64;
        let mut seed = 0xD1B54A32D192ED03_u64;
        let mut residuals = Array2::<f64>::zeros((n, p));
        let mut activity = Array1::<f64>::zeros(n);
        for row in 0..n {
            let z = (row as f64) / (n as f64 - 1.0);
            activity[row] = z;
            let amp = (slope * z).exp().sqrt();
            let f = lcg_normal(&mut seed);
            for i in 0..p {
                residuals[[row, i]] = amp * lambda0[i] * f + sigma_eps * lcg_normal(&mut seed);
            }
        }

        let model = StructuredResidualModel::fit(ResidualFactorInput {
            residuals: residuals.view(),
            activity: activity.view(),
            max_factor_rank: 2,
        })
        .expect("fit");
        assert_eq!(model.factor_rank(), 1, "planted rank is 1");

        // Pearson correlation between fitted log c(z_n) and the planted
        // log-law slope·z (mean-1 normalization cancels in the correlation).
        let fitted_log: Vec<f64> = model.row_scale().iter().map(|c| c.ln()).collect();
        let planted_log: Vec<f64> = activity.iter().map(|z| slope * z).collect();
        let mean_f = fitted_log.iter().sum::<f64>() / n as f64;
        let mean_p = planted_log.iter().sum::<f64>() / n as f64;
        let mut cov = 0.0_f64;
        let mut var_f = 0.0_f64;
        let mut var_p = 0.0_f64;
        for i in 0..n {
            let df = fitted_log[i] - mean_f;
            let dp = planted_log[i] - mean_p;
            cov += df * dp;
            var_f += df * df;
            var_p += dp * dp;
        }
        let corr = cov / (var_f.sqrt() * var_p.sqrt());
        assert!(
            corr > 0.9,
            "fitted activity law must track the planted exp({slope}·z): corr = {corr:.4}"
        );

        // Dynamic range: planted c(top)/c(bottom) over the inner bin centers
        // is exp(slope·7/8) ≈ 3.1; the binned/smoothed estimate must land in
        // a generous bracket around it (smoothing shrinks the edges).
        let lo = model.row_scale()[n / 16]; // first-bin interior
        let hi = model.row_scale()[n - 1 - n / 16]; // last-bin interior
        let ratio = hi / lo;
        assert!(
            ratio > 1.8 && ratio < 5.5,
            "fitted dynamic range {ratio:.3} must bracket the planted ≈3.1"
        );
    }
}