gam-sae 0.3.139

Sparse-autoencoder latent-manifold terms for the gam penalized-likelihood engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
use super::*;

pub(crate) const SAE_MANIFOLD_ARMIJO_C1: f64 = 1.0e-4;

pub(crate) const SAE_MANIFOLD_MAX_LINESEARCH_HALVINGS: usize = 12;

/// Relative Cholesky-pivot floor for the analytic SAE outer-rho gradient.
///
/// The evidence value can still be honest below this threshold because it only
/// sums `log(diag(L))`. The analytic gradient is different: selected-inverse
/// traces and `ArrowFactorCache::full_inverse_apply` divide by those pivots.
/// Once `min_pivot / max_pivot` is below this floor, the gradient lane must
/// either identify a closed-form gauge orbit and stiffen only that quotient
/// direction, or reject the trial rho as numerically singular.
pub(crate) const SAE_OUTER_GRADIENT_PIVOT_RATIO_FLOOR: f64 = 1.0e-12;

pub(crate) const SAE_OUTER_GRADIENT_GAUGE_RAYLEIGH_FACTOR: f64 = 1.0e-8;

/// Relative spectral cutoff below which a penalised decoder β-curvature
/// eigenvalue (`G_k + λ_smooth·S_k`) is treated as a genuine flat direction of
/// the joint inner Hessian — the rank-deficient-decoder null quotiented out of
/// the inner convergence measure and deflated in the outer gradient (#1051).
/// Matches the `1e-9` relative rank cutoff used across the codebase.
pub(crate) const SAE_DECODER_BETA_NULL_RELATIVE_FLOOR: f64 = 1.0e-9;

/// Nominal curvature-homotopy `η` step (#1007): the tracker covers `η ∈ [0, 1]`
/// in this many equal predictor-corrector waypoints when the branch is clean.
/// Five waypoints is a few corrector solves — far cheaper than the multi-seed
/// cascade it replaces — and the step is halved adaptively when the arrow-factor
/// min pivot shrinks, so a near-bifurcation stretch is resolved at finer
/// granularity without a separate knob.
pub(crate) const CURVATURE_WALK_INITIAL_ETA_STEP: f64 = 0.2;

/// Smallest curvature-homotopy `η` step (#1007). A pivot collapse (or corrector
/// failure) that persists at this step is a DETECTED branch bifurcation, not a
/// step-size artifact: the walk records it and defers to the seed cascade.
pub(crate) const CURVATURE_WALK_MIN_ETA_STEP: f64 = 1.0 / 256.0;

/// Hard ceiling on accepted corrector solves in one curvature-homotopy walk
/// (#1007). Bounds the walk's cost under repeated halving; reaching it is a
/// structural-termination signal (the branch is not cleanly trackable) that
/// defers to the cascade, never a spin.
pub(crate) const CURVATURE_WALK_MAX_CORRECTORS: usize = 32;

/// Joint Newton iteration budget for the curvature-walk degenerate-basin
/// recovery (#1117). When the walk lands on a sub-arrival-floor (see the
/// `arrival_floor` in `run_curvature_homotopy_entry_at_rho`: the achievable
/// linear ceiling `anchor_ev` minus one atom's share) reconstruction, the
/// recovery runs a REAL joint fit from the pristine
/// (circle-aware) seed with at least this many inner iterations, independent of
/// the outer objective's possibly-frozen `inner_max_iter = 0` — otherwise the
/// recovery (and the fallback cascade) would re-freeze at the cold seed and
/// never escape the linear basin. Matches the cold-start budget that recovers
/// EV ≈ 0.94 on the K = 1 periodic circle.
pub(crate) const CURVATURE_WALK_RECOVERY_INNER_ITERS: usize = 25;

/// Relative floor on the Newton directional decrease, expressed as a tiny
/// multiple of `‖g‖·‖Δ‖`. A predicted decrease below this is at the level of
/// f64 round-off in the quadratic model and is treated as no progress (the step
/// is rejected). Scaling by the gradient/step norms makes the floor invariant
/// to the problem's overall magnitude.
pub(crate) const SAE_MANIFOLD_DIRECTIONAL_DECREASE_REL_FLOOR: f64 = 1.0e-14;

/// Row count at or above which the fused SAE reconstruction data-fit
/// (`loss_scaled`) fans its per-row decode + residual reduction out over
/// rayon. Below this the single-threaded fused pass is cheaper than the
/// fan-out; matched in spirit to the arrow-Schur `SCHUR_MATVEC_PARALLEL_ROW_MIN`
/// gate so short batches inside an outer fan-out stay sequential (#1017).
pub(crate) const SAE_LOSS_PARALLEL_ROW_MIN: usize = 64;

/// Relative tolerance on the undamped Newton step norm (scaled by the iterate
/// scale) for accepting inner-solve convergence.
pub(crate) const SAE_MANIFOLD_INNER_STEP_REL_TOL: f64 = 1.0e-4;

/// Relative tolerance on the KKT gradient norm (scaled by the iterate scale) for
/// accepting inner-solve convergence.
pub(crate) const SAE_MANIFOLD_INNER_GRAD_REL_TOL: f64 = 1.0e-5;

/// Relative per-refine-round penalised-objective decrease below which the inner
/// solve is treated as having reached its numerical fixed point (#1051). On an
/// ill-conditioned penalised bilinear fit the KKT gradient and undamped step
/// stay above tolerance while the objective stops moving; this `√εmach`-scale
/// floor recognises that stalled iterate as the converged inner optimum instead
/// of grinding the refine budget to the `1e12` infeasible sentinel.
pub(crate) const SAE_MANIFOLD_INNER_OBJECTIVE_STALL_REL_TOL: f64 = 1.0e-8;

/// Fraction of the total since-entry objective reduction below which a refine
/// round's contribution is treated as cosmetic flat-valley crawl (#1051), so the
/// inner solve is accepted as numerically converged. At `1e-4` the inner fit has
/// captured ≥ 99.99% of the achievable penalised-objective reduction before the
/// criterion is ranked — far past the point where further crawl can change the
/// Laplace evidence, yet strict enough that a materially-improving fit refines on.
pub(crate) const SAE_MANIFOLD_INNER_OBJECTIVE_STALL_FRACTION: f64 = 1.0e-4;

/// Minimum completed refine rounds before the objective-stagnation fixed point
/// may be accepted (#1051). Enough rounds to establish a meaningful
/// total-improvement baseline for the fraction test, but far below the full
/// refine budget — terminating the ill-conditioned crawl early is the goal.
pub(crate) const SAE_MANIFOLD_INNER_OBJECTIVE_STALL_MIN_ROUNDS: usize = 3;

/// Above this full-`B` β width, dense beta-penalty curvature is never
/// materialized when Grassmann frames are engaged; exact curvature is probed
/// directly in the factored coordinate space instead.
pub(crate) const SAE_DENSE_BETA_PENALTY_PROBE_MAX_DIM: usize = 4096;

/// Relative spectral cutoff for counting the numerical rank / nullity of a
/// symmetric penalty Gram: eigenvalues at or below `cutoff · λ_max` are treated
/// as zero. Shared by [`SaeManifoldTerm::symmetric_rank`] and
/// [`smooth_penalty_nullity`] so the two stay in lockstep.
pub(crate) const SAE_MANIFOLD_SPECTRAL_RANK_CUTOFF: f64 = 1.0e-9;

/// Floor on the Levenberg-Marquardt ridge added to a per-row Hessian before
/// Cholesky, so the first attempt is always strictly positive even when the
/// caller passes a zero base ridge.
pub(crate) const SAE_MANIFOLD_ROW_RIDGE_FLOOR: f64 = 1.0e-12;

/// Multiplicative factor by which the LM ridge is escalated after a failed
/// Cholesky factorisation of a per-row Hessian.
pub(crate) const SAE_MANIFOLD_ROW_RIDGE_GROWTH: f64 = 10.0;

#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct SaeBetaPenaltyAssembly {
    pub(crate) dense_written: bool,
    pub(crate) deferred_factored: bool,
}

impl SaeBetaPenaltyAssembly {
    pub(crate) fn record_curvature(&mut self, dense_beta_curvature: bool) {
        if dense_beta_curvature {
            self.dense_written = true;
        } else {
            self.deferred_factored = true;
        }
    }
}

/// ABSOLUTE FALLBACK explained-variance floor for the reconstruction-collapse
/// guard (#1023), used ONLY when the data-derived bar is un-computable.
///
/// #1522 retired this as the primary collapse threshold: the live bar is
/// [`SAE_COLLAPSE_PCA_EV_FRACTION`]`·pca_ev_ceiling(target, K)` (see
/// `collapse_ev_bar`), which tracks the data's own achievable EV instead of a
/// corpus-tuned constant. This number is reached only on a degenerate target
/// (constant columns / SVD failure → non-finite ceiling), where any positive
/// floor is arbitrary; `0.10` is a deliberately conservative "the fitted matrix
/// must explain at least a tenth of the variance or it is a structural collapse"
/// fallback so the guard still keys on *something* finite in that corner. It is
/// NOT a tuned operating point — a fit on real data is judged against the PCA
/// ceiling, never this constant.
pub(crate) const SAE_FIT_DATA_COLLAPSE_EV_FLOOR: f64 = 0.10;

/// #1522/#1610 — fraction of the REACHABLE-rank PCA / Eckart-Young EV ceiling
/// below which a fit counts as a structural co-collapse. The collapse bar is
/// `SAE_COLLAPSE_PCA_EV_FRACTION · pca_ev_ceiling(target, dictionary_rank)`,
/// where `dictionary_rank` is the dictionary's GEOMETRICALLY REACHABLE rank
/// ([`super::outer_objective::reachable_dictionary_rank`] = `Σ_k rank(Φ_k)`),
/// not the nominal coefficient count `Σ_k basis_size_k`. The ceiling is the BEST
/// EV any rank-`dictionary_rank` LINEAR dictionary could reach on THIS centered
/// target, so the bar is data-derived (scales with what the data actually
/// admits) rather than an absolute corpus-tuned number.
///
/// #1610 — the previous rank `Σ_k basis_size_k` was biased HIGH for a NONLINEAR
/// dictionary (the owner's audit table: "nonlinear dict vs linear PCA ceiling
/// (biased high)"): a curved `latent_dim = d` atom decoded through a smooth
/// chart spans only `rank(Φ_k) ≤ basis_size_k` linear output directions, so
/// summing the nominal coefficient counts over-stated the linearly-achievable
/// ceiling the fraction is taken against. Keying the rank on each chart's
/// REALIZED image rank `rank(Φ_k)` (read from the chart design alone, so a
/// co-collapsed decoder still reports full geometric reach) calibrates the bar
/// against what the dictionary geometry can actually reach.
///
/// `0.5` encodes the decision "a fit that explains less than HALF of the variance
/// a reachable-rank dictionary could has structurally collapsed, whatever its
/// absolute EV" — a dimensionless ratio with that single, explicit meaning, not
/// a magnitude tuned to any one corpus. It is the sole source for the ratio used
/// at every collapse-guard site.
pub(crate) const SAE_COLLAPSE_PCA_EV_FRACTION: f64 = 0.5;

pub(crate) const SAE_FIT_DATA_COLLAPSE_COST: f64 = 1.0e12;

pub(crate) const SAE_FINAL_EV_DEGRADATION_TOL: f64 = 1.0e-3;

/// #1026 — per-row coordinate re-projection grid resolution for the final
/// reconstruction polish (matches the resolution the curvature-homotopy arrival
/// polish uses in `outer_objective.rs`). The projection snaps each row's latent
/// coordinate to the nearest point of a `resolution`-point decode grid, so a
/// finer grid is a tighter chart re-fit; `256` is the established cost/precision
/// point for the chart re-projection (a circle/sphere/Euclidean grid sweep).
pub(crate) const SAE_FINAL_DECODER_POLISH_PROJECTION_RESOLUTION: usize = 256;

pub(crate) const SAE_SEED_DISPERSION_FLOOR: f64 = 1.0e-12;

/// #1026/#1610 decoder-repulsion conditioner strength as a DIMENSIONLESS ratio of
/// the primary separation-barrier strength. The collinearity-gated cross-decoder
/// repulsion injects POSITIVE curvature in the inter-atom co-collapse direction.
/// It is a conditioner/separator, not a primary objective term: it is identically
/// zero (value, gradient, curvature) unless two atom decoders exceed
/// [`SAE_DECODER_REPULSION_COLLINEARITY_GATE`], so it is a strict no-op for
/// well-separated atoms and for `K = 1`. Without it the SAE joint inner Newton
/// solve has NO inter-atom repulsion (data-fit is independent per atom given the
/// soft assignment), so at `K >= 4` on real residual geometry two atoms drift
/// onto the same decoder direction, the per-row `H_tt` block goes near-singular,
/// the reduced β-Schur over-subtracts and goes indefinite, and the inner solve
/// never converges (#1026).
///
/// #1610 — the previous value was an absolute `1e-3` whose magnitude was NOT
/// derived: it was four orders below the separation barrier (`10.0`) with no
/// stated relationship, and it weighted the un-normalized cross-Gram energy
/// `‖B_jB_kᵀ‖²_F = c_jk²·‖B_j‖²_F·‖B_k‖²_F`, so the repulsion's force on the
/// collinearity `c_jk²` scaled with the decoder energy `‖B_j‖²_F·‖B_k‖²_F` while
/// the separation barrier's force on the SAME `c_jk²` was scale-free. The `1e-3`
/// was therefore implicitly absorbing the decoder-energy units at one assumed
/// corpus scale — exactly the unprincipled, non-generalizing hand-pick #1610
/// flags. The fix (see [`super::penalties::SaeManifoldTerm::refresh_decoder_repulsion_gate`])
/// normalizes the per-pair weight by `‖B_j‖²_F·‖B_k‖²_F`, so the repulsion now
/// penalizes the SAME dimensionless collinearity `c_jk² ∈ [0,1]` the separation
/// barrier does. With both terms acting on the dimensionless `c_jk²`, the
/// repulsion strength is a pure dimensionless ratio of the barrier strength:
///
///   `μ_rep = SAE_DECODER_REPULSION_BARRIER_RATIO · μ_sep`.
///
/// The repulsion is a SUBDOMINANT conditioner — the separation barrier is the
/// primary anti-collapse cure (it DIVERGES at `c²→1`; the repulsion is a finite
/// PSD quadratic). The ratio `1e-4` keeps the repulsion four orders below the
/// barrier BY DESIGN (so it conditions the indefinite directions without fighting
/// the barrier's restoring force), and at the canonical unit decoder scale
/// (`‖B_k‖²_F ≈ 1`) the effective per-pair weight `μ_sep·1e-4 = 10·1e-4 = 1e-3`
/// reduces EXACTLY to the historical absolute constant, so unit-scale fits are
/// byte-unchanged. Unlike the old absolute `1e-3` this engages identically at any
/// corpus scale.
pub(crate) const SAE_DECODER_REPULSION_BARRIER_RATIO: f64 = 1.0e-4;

// The derived decoder-repulsion strength is a dimensionless fraction
// [`SAE_DECODER_REPULSION_BARRIER_RATIO`] of the data-derived (possibly
// runtime-overridden) separation-barrier strength μ_C; it is computed on the
// term itself (it needs the dictionary's overcompleteness, not a global
// constant) — see [`super::penalties::SaeManifoldTerm::decoder_repulsion_strength`].

/// #1026 normalized collinearity score
/// `s_jk = ‖B_jB_kᵀ‖²_F / (‖B_j‖²_F·‖B_k‖²_F)` at/above which the decoder
/// repulsion engages. Below this the smoothstep gate is exactly 0 (zero penalty
/// / gradient / curvature), so healthy fits whose atoms point in distinct
/// directions never activate the term. `s_jk` is scale-free in each decoder's
/// magnitude (it measures ANGLE, not norm) so it cannot fight the data-fit's
/// choice of decoder scale.
pub(crate) const SAE_DECODER_REPULSION_COLLINEARITY_GATE: f64 = 0.5;

/// Relative-mass floor defining when two atoms genuinely CO-FIRE on a row, used
/// by the anti-collapse co-activation scan (`barrier_coactive_pairs`). An atom
/// whose assignment mass on a row is below this fraction of that row's peak mass
/// is treated as NOT firing there: it contributes `≤ floor·peak` reconstruction
/// mass, so the co-activation it would register and the anti-collapse repulsion /
/// separation it would receive are negligible.
///
/// For structurally sparse assignments (JumpReLU hard gate, IBP-MAP) the surviving
/// active atoms sit far above this floor and the hard zeros are excluded anyway,
/// so the co-active support is unchanged. It is load-bearing for SOFTMAX, whose
/// normalization gives EVERY atom a tiny but strictly nonzero tail mass: a plain
/// `a ≠ 0` test would mark all `K` atoms co-active on every row and collapse the
/// scan to the dense `O(N·K²)` all-pairs cost (minutes at `K = 10⁴`). Matches the
/// `1e-3` relative cutoff the compact row layout uses (`from_dense_weights`), so
/// the penalty support and the assembled Newton support stay consistent.
pub(crate) const SAE_COACTIVE_RELATIVE_MASS_FLOOR: f64 = 1.0e-3;

// ── #1026 / #1522 interior-point COLLAPSE-PREVENTION barriers ────────────────
//
// The collinearity-gated `SAE_DECODER_REPULSION_*` term above is a SEPARATOR for
// already-large, already-collinear decoders; it is a strict no-op at the
// principal failure state (a decoder with norm → 0) because a zero decoder has
// no direction to be collinear with. The two barriers below are the actual
// anti-collapse core: they are interior-point log-barriers that DIVERGE at the
// collapse boundary, so the inner Newton can never reach it.

// #1026/#1522 — RUNTIME separation-barrier-strength override. Read through the
// accessor below so a SINGLE compiled wheel can sweep μ_sep from Python
// (`set_sae_barrier_overrides`) without recompiling. A quiet-NaN sentinel means
// "unset → derive μ_C from the problem" (the data-derived overcompleteness
// ratio), so 0.0 remains a legitimate swept value (barrier disabled). The amplitude
// (keep-alive) barrier was removed: an over-complete dictionary's surplus
// features SHOULD die, and a dead atom's decoder block is parked into a
// well-conditioned state by the inner per-row Tikhonov ridge — forcing the
// norm away from zero only over-inflated healthy atoms and fought that natural
// death. So there is no amplitude strength or active-atom gate to override.
static SAE_SEP_STRENGTH_OVERRIDE_BITS: std::sync::atomic::AtomicU64 =
    std::sync::atomic::AtomicU64::new(0x7ff8_0000_0000_0000);
/// #1026/#1522/#1610 — read the process-global separation-barrier-strength
/// override, or `None` when unset. `None` ⇒ the data-derived μ_C (the
/// overcompleteness ratio from
/// [`super::penalties::SaeManifoldTerm::separation_barrier_strength`]) is used.
/// A quiet-NaN sentinel means "unset → derive from the problem"; `0.0` stays a
/// legitimate swept value (barrier disabled).
pub(crate) fn sae_separation_barrier_override() -> Option<f64> {
    let v =
        f64::from_bits(SAE_SEP_STRENGTH_OVERRIDE_BITS.load(std::sync::atomic::Ordering::Relaxed));
    if v.is_nan() { None } else { Some(v) }
}

/// Set the process-global SAE separation-barrier strength override (one wheel,
/// many configs). `sep_strength` is NaN to clear the override back to the
/// data-derived μ_C (`K / reachable_rank`); there is no compiled-constant
/// default any more.
/// The amplitude (keep-alive) barrier and its active-atom gate were removed
/// (surplus features are allowed to die into a ridge-parked state), so this
/// takes only the separation strength. Called from the gamfit Python FFI.
pub fn set_sae_barrier_overrides(sep_strength: f64) {
    SAE_SEP_STRENGTH_OVERRIDE_BITS
        .store(sep_strength.to_bits(), std::sync::atomic::Ordering::Relaxed);
}

// #1026/#1522/#1610 — the SEPARATION barrier strength `μ_C` is NO LONGER a
// hand-picked absolute constant (it was `10.0`, matched to no problem scale).
// It is now DATA-DERIVED from the dictionary's overcompleteness — see the full
// derivation on [`super::penalties::SaeManifoldTerm::separation_barrier_strength`]
// (the penalty form `P_sep = -μ_C Σ q_jk log(1-c²+ε)` on the normalized decoder
// shapes is documented there). The runtime override
// (`set_sae_barrier_overrides`) still takes precedence over the derived value.

/// #1026/#1522 SEPARATION barrier softening `ε` in `log(1 - c_jk² + ε)`. Bounds
/// the barrier (and its PSD majorizer) at the exact-alignment limit `c_jk² = 1`.
pub(crate) const SAE_SEPARATION_BARRIER_EPS: f64 = 1.0e-6;

/// #1625 — normalized collinearity `c_jk² = ‖U_jU_kᵀ‖²_F ∈ [0,1]` at/above which
/// the SEPARATION barrier engages, the analog of
/// [`SAE_DECODER_REPULSION_COLLINEARITY_GATE`] for the primary anti-collapse
/// barrier. A C1 smoothstep ramps the barrier from exactly 0 at this threshold to
/// full strength (and the interior-point divergence) as `c_jk² → 1`, so the
/// barrier is a genuine COLLAPSE-prevention force — active only once two coactive
/// atoms are materially aligned — rather than a global orthogonality prior that
/// taxes every distinct-but-correlated pair.
///
/// WHY a gate at all (the ungated `−log(1−c²+ε)` was the #1625 stall): the
/// interior-point shape has force `∂P/∂c² = 1/(1−c²+ε) ≈ 1` even at MODERATE
/// collinearity, so two genuinely-distinct atoms (e.g. `c² = 0.36`, a 53° angle —
/// nowhere near the `c² → 1` collapse) feel an O(1) separating force that, on a
/// well-specified fit whose data residual is near zero, DOMINATES the objective and
/// drags the decoders off the data optimum. The inner (t, β) Newton then has to
/// chase a barrier-shifted optimum it converges to only slowly, and
/// `reml_criterion`'s undamped-PD inner solve never reaches KKT stationarity —
/// the #1625 "inner solve did not converge" refusal. Gating the barrier to the
/// near-collapse regime (`c² ≳ 0.5`) leaves well-separated dictionaries at their
/// data optimum (so they converge) while preserving the divergent restoring force
/// exactly where collapse happens (`c² → 1`).
///
/// Chosen equal to the repulsion gate (`0.5`): the two anti-collapse terms then
/// engage on the same near-collinear pair set, the barrier as the divergent
/// interior-point core and the repulsion as its subdominant conditioner. Pairs
/// below `0.5` (≥ 45° apart) are not collapsing and need no anti-collapse force.
pub(crate) const SAE_SEPARATION_BARRIER_COLLINEARITY_GATE: f64 = 0.5;

/// #1026/#1522/#1610 RELATIVE decoder-norm floor below which an atom is treated
/// as inactive / shape-undefined for the SEPARATION barrier (its `U_k` is
/// ill-conditioned). The separation barrier abstains for a pair until the
/// decoder norm is lifted above the floor.
///
/// #1610 — this is a RELATIVE fraction of the live dictionary's largest decoder
/// energy (`max_k ‖B_k‖_F`), NOT an absolute magnitude. The previous absolute
/// `1e-6` floor was NOT scale-invariant: the SAE decoders inherit the scale of
/// the activations being modeled, so under a global rescaling of the corpus by a
/// factor `s` every decoder norm scales by `s` while the absolute floor stayed
/// fixed — on a corpus whose natural decoder scale is below `1e-6` the barrier
/// spuriously abstained on EVERY pair (collapse prevention silently disabled),
/// and on a corpus scaled far above it the floor was inert. Keying the floor to
/// `max_k ‖B_k‖²_F` makes the abstain set scale-free: it depends only on the
/// RATIO of an atom's energy to the dictionary's, so collapse prevention engages
/// identically regardless of the corpus scale. At the canonical unit decoder
/// scale (`max_k ‖B_k‖²_F ≈ 1`) it reduces to the historical `1e-6` floor.
/// Consumed via [`super::penalties::barrier_norm_floor_sq`], the single source
/// for both the barrier value and its gradient/curvature, so the line-search
/// value never desyncs from the step.
pub(crate) const SAE_BARRIER_ACTIVE_NORM_REL_FLOOR: f64 = 1.0e-6;

/// Full SAE-manifold term.
#[derive(Debug)]
pub struct SaeManifoldTerm {
    pub atoms: Vec<SaeManifoldAtom>,
    pub assignment: SaeAssignment,
    pub(crate) temperature_schedule: Option<GumbelTemperatureSchedule>,
    /// Active-set row layout from the most recent `assemble_arrow_schur` call.
    /// `None` for dense modes (Softmax / IBPMap) or when not yet assembled.
    pub(crate) last_row_layout: Option<SaeRowLayout>,
    /// The single provenance-carrying per-row inner product (Object 2). The
    /// reconstruction likelihood whitens residuals through it and the isometry
    /// gauge's [`gam_terms::analytic_penalties::WeightField`] is constructed
    /// from the same object, so a likelihood-metric ≠ gauge-metric state is
    /// unrepresentable. `None` ⇒ Euclidean / isotropic (magic-by-default: the
    /// metric is selected by whether per-row Fisher factors were installed, not
    /// by a flag), which is bit-for-bit the historical isotropic `φ̂` path.
    pub(crate) row_metric: Option<gam_problem::RowMetric>,
    /// #976 Layer-1 guard ledger for the most recent joint fit: every
    /// active-mass breach with the action taken (re-seed / terminal). Cleared
    /// at the start of each `run_joint_fit_arrow_schur`; read post-fit via
    /// [`SaeManifoldTerm::collapse_events`] and carried onto the
    /// structure-search [`gam_solve::structure_search::SearchLedger`].
    pub(crate) collapse_events: Vec<CollapseEvent>,
    /// Per-row **design honesty weights** (#991): Horvitz–Thompson inclusion
    /// corrections from a designed corpus subsample
    /// ([`gam_solve::row_sampling_measure::RowSamplingMeasure::designed_subsample`] /
    /// [`crate::corpus::designed_target`]), self-normalized to
    /// mean `1.0` over the term's rows so dispersion, dof, and the
    /// data-vs-penalty balance stay consistent at the fitted sample size while
    /// the design's selection bias is removed (oversampled loud rows are
    /// downweighted back).
    ///
    /// The weights enter the objective as a per-row scalar metric `w_i · I_p`
    /// on the reconstruction channel ONLY, realized as a `√w_i` scaling of the
    /// per-row residual, latent Jacobian, and β basis load at their single
    /// construction sites in the assembly — so the data-fit value, the t-block
    /// Gauss-Newton, the β gradient/Gram, and the cross blocks all carry
    /// exactly one factor of `w_i` and cannot desync (the same discipline as
    /// the #974 whitening seam). Per-row latent priors (assignment prior, ARD
    /// coordinate prior) are deliberately NOT weighted: included rows' latent
    /// states are genuine model components of the subsampled model,
    /// conditional on inclusion; the HT correction applies to the row
    /// *evidence* about shared structure (decoder β, ρ), not to the latent
    /// priors. `None` ⇒ the exact unweighted path, bit-for-bit.
    pub(crate) row_loss_weights: Option<Vec<f64>>,
    /// #972 / #977 T1: whether the MOST RECENT `assemble_arrow_schur` built the
    /// β-tier in the *factored* Grassmann-coordinate layout (border width
    /// [`Self::factored_border_dim`], the per-atom `C_k` blocks) rather than the
    /// full-`B` layout ([`Self::beta_dim`]). When `true`, the `delta_beta` the
    /// arrow solver returns is a `ΔC` (factored coordinates) that
    /// [`Self::apply_newton_step_impl`] must LIFT through each active frame
    /// (`ΔB_k = ΔC_k U_kᵀ`) before applying it to the decoder. `false` ⇒ the
    /// historical full-`B` path, where `delta_beta` is `ΔB` directly. Set in
    /// lock-step with the assembled system so the step interpretation cannot
    /// drift from the layout the system was built in.
    pub(crate) last_frames_active: bool,
    /// #1033 test seam: force the large-`n` assembly fold to use this row chunk
    /// width instead of the streaming-plan's `chunk_size`. `None` ⇒ production
    /// behavior (the admission plan picks the window). Tests set a tiny value to
    /// drive the multi-chunk fold path on a small problem and assert it is
    /// bit-identical to the single-pass (`chunk_size == n`) fold. Never set in
    /// production; it only re-partitions the fold, never changes per-row math.
    pub(crate) assembly_chunk_override: Option<usize>,
    /// #1407: when set, `assemble_arrow_schur` emits ONLY the per-row block-
    /// diagonal `htt`/`gt` (the data-fit Gauss-Newton + assignment/ARD prior
    /// curvature and gradient), skipping the entire β decoder tier — the β Gram
    /// `G`, the `gb` gradient, the matrix-free `H_tβ` Kronecker operator, the
    /// dense `H_ββ`, and all β-tier analytic penalties. The fixed-decoder encode
    /// (`run_fixed_decoder_arrow_schur`) freezes the decoder and its per-row step
    /// reads only `htt`/`gt` (`fixed_decoder_step_from_rows`), so the entire
    /// `K`-dependent decoder assembly it otherwise computes-and-discards is
    /// avoided. A transient mode set in lock-step by that driver, not persisted.
    pub(crate) fixed_decoder_assembly: bool,
    /// #1408/#1409 — optional hard per-row active-atom cap for Softmax mode,
    /// threaded from the fit/encode `top_k` argument. When set (and `< K`), the
    /// Softmax assignment engages the COMPACT top-`k` row layout
    /// ([`SaeRowLayout::from_dense_weights`]) inside the optimization itself, so
    /// the Newton assembly/solve and the criterion only ever touch each row's
    /// top-`k` softmax atoms instead of all `K`. The full-`K` softmax
    /// normalization is still used to FORM each row's assignment vector `a`
    /// (the gate map); only the dropped tail logits — carrying negligible
    /// `O(a)` reconstruction mass and `O(a²)` curvature — are excluded from the
    /// per-row block. This folds `top_k` into the optimization rather than
    /// applying it as a post-fit projection (the #1409 defect), and makes the
    /// FFI's after-the-fact top-`k` projection a no-op at the optimum. `None`
    /// (the default) keeps the budget-driven engagement only.
    pub(crate) softmax_active_cap: Option<usize>,
    /// Reusable dense β-tier workspace for analytic penalty assembly. SAE
    /// immediately lowers the dense block into a `BetaPenaltyOp`, so the returned
    /// `ArrowSchurSystem` does not need to keep owning the allocation.
    pub(crate) border_hbb_workspace: Array2<f64>,
    /// Fitted Gaussian reconstruction dispersion used only by the empirical
    /// incoherence/curvature certificate-input report. `None` for synthetic terms
    /// or legacy internal callers that have not computed post-fit dispersion.
    pub(crate) certificate_dispersion: Option<f64>,
    /// Outcome of the most recent curvature-homotopy entry walk (#1007), or
    /// `None` when no walk has run (the seed cascade entry, or any consumer that
    /// never invokes the tracker). Recorded on the fit payload so the bifurcation
    /// / collapse outcome is observable — never a silent fallback. Cleared by
    /// the objective's `reset` so each seed's walk reports only its own run.
    pub(crate) curvature_walk_report: Option<CurvatureWalkReport>,
    /// Deflated row-gauge direction count established by the first undamped
    /// evidence factorization in the current optimization. A later change means
    /// the quotient dimension changed mid-solve, which is a structural event and
    /// must not be hidden inside the Laplace normalizer.
    pub(crate) expected_evidence_gauge_deflated_directions: Option<usize>,
    /// #1037 re-anchor counter: how many times the quotient (gauge-deflation)
    /// dimension has been re-anchored within the current optimization. A
    /// legitimate quotient-dimension change (an atom born / reseeded /
    /// rank-reduced) re-anchors the comparison once; an unbounded churn that
    /// never settles is the genuine pathology the guard must still catch. Reset
    /// to `0` alongside `expected_evidence_gauge_deflated_directions`.
    pub(crate) evidence_gauge_deflation_reanchors: usize,
    /// #1217 oscillation detector: the sign of the most recent change in the
    /// gauge-deflation count (`+1` when it last increased, `−1` when it last
    /// decreased, `0` before the first change). The deflation count is a
    /// per-ROW-summed count of near-null evidence directions, so on real K≥2
    /// data it drifts smoothly (and monotonically) as the conditioning improves
    /// across the ρ-walk — a benign O(N) quantity, NOT a discrete dictionary
    /// event. The genuine pathology the #1037 guard must catch is an
    /// OSCILLATING count (repeated direction reversals that never settle), so
    /// the re-anchor budget is charged only on a direction REVERSAL, not on
    /// every monotone drift step. Reset to `0` alongside the re-anchor counter.
    pub(crate) evidence_gauge_deflation_last_delta_sign: i8,
    /// #976 / #1117 K>1 robustness: how many full-dictionary co-collapse
    /// multi-starts the decoder-norm guard has already spent in the current
    /// optimization. Bounded by [`SAE_DICTIONARY_COCOLLAPSE_RESEED_BUDGET`];
    /// reset to `0` alongside [`Self::evidence_gauge_deflation_reanchors`] at the
    /// start of each outer optimization. Distinct from the per-atom reseed
    /// ledger in [`Self::collapse_events`] because a co-collapse reseed is a
    /// whole-dictionary multi-start, not a per-atom second chance.
    pub(crate) dictionary_cocollapse_reseeds: usize,
    /// #1026 co-collapse multi-start incumbent: the best (highest reconstruction
    /// EV) dictionary state seen across the full-dictionary co-collapse reseeds in
    /// the current optimization, with that EV. A blind reseed sequence can land in
    /// a strictly WORSE basin than the seed it replaced (real OLMo K=4: EV 0.127 →
    /// −1.01 across reseeds), so the multi-start must RETAIN the best basin and
    /// restore it when the reseed budget is spent, instead of keeping the last
    /// (often catastrophic) attempt. `None` until the first co-collapse reseed;
    /// reset to `None` alongside [`Self::dictionary_cocollapse_reseeds`] at the
    /// start of each outer optimization.
    pub(crate) best_cocollapse_incumbent: Option<(f64, SaeManifoldMutableState)>,
    /// #1026 decoder-repulsion gate, frozen per assembly (lagged-diffusivity
    /// discipline, exactly like [`SaeManifoldAtom::smooth_penalty`]): the
    /// symmetric `(K, K)` matrix of collinearity gate weights
    /// `gate(s_jk)·decoder_repulsion_strength()/(‖B_j‖²_F·‖B_k‖²_F)` computed
    /// from the decoder state at assembly entry (#1610: energy-normalized, so the
    /// realized penalty is a function of the dimensionless collinearity `c_jk²`
    /// alone). Both the assembly (gradient into `gb`, PSD
    /// curvature into `hbb`) and the line-search value path
    /// ([`Self::penalized_objective_total`]) read THIS frozen gate, so the
    /// repulsion's value, gradient, and curvature stay mutually consistent
    /// across a Newton step even as the trial decoders move (the cross-Gram
    /// quadratic moves with the trial; only the gate weight is frozen). `None`
    /// when no repulsion is active (`K < 2`, or every pair below the gate
    /// threshold — the strict no-op case). Refreshed at the same chokepoint as
    /// the smoothness Gram; not part of the persisted term identity (Clone
    /// starts `None`).
    /// SPARSE near-collinear pair gate (#1026): the list of `(j, k, w)` with
    /// `j < k` whose frozen repulsion weight `w > 0`. Only near-collinear pairs
    /// (above the collinearity gate) ever carry a nonzero weight, so this list is
    /// tiny even at large `K` — never the dense `K×K` matrix (8 GiB at K=32768).
    pub(crate) decoder_repulsion_gate: Option<Vec<(usize, usize, f64)>>,
    /// #1625 — the SEPARATION barrier's frozen normalized-coactivation weights
    /// `q_jk`, the analog of [`Self::decoder_repulsion_gate`] for the #1522
    /// collapse-prevention barrier. The barrier energy
    /// `P_sep = μ_C·Σ_{j<k} −q_jk·log(1−c_jk²+ε)` is weighted by the normalized
    /// coactivation `q_jk = (Σ_i a_ij a_ik)/√(Σa_ij²·Σa_ik²)`, which is a function
    /// of the assignment masses `a_ik` (hence the logits). The barrier's GRADIENT
    /// assembly ([`Self::add_sae_separation_barrier`]) treats `q_jk` as a constant
    /// multiplicative weight (it differentiates only the decoder shape `c_jk²`), so
    /// for value/gradient consistency the VALUE path
    /// ([`Self::separation_barrier_value`], read by the line-search
    /// [`Self::penalized_objective_total`]) MUST read the SAME `q_jk` the gradient
    /// used — not recompute it from the trial logits the line search moves.
    /// Freezing it here at assembly entry (lagged-diffusivity, exactly like the
    /// smoothness Gram and the repulsion gate) makes the barrier a pure function of
    /// the decoder shapes within a Newton step, so it exerts NO phantom force on
    /// the routing and the inner solve reaches true KKT stationarity. `None` when
    /// no pair co-fires (`K < 2`, or a fully-disjoint routing — the strict no-op);
    /// callers fall back to the live coactivation in that case. Transient: not part
    /// of the persisted term identity (Clone starts `None`, rebuilt next assembly).
    pub(crate) barrier_coactivation_gate: Option<Vec<(usize, usize, f64)>>,
    /// #1026: the load-bearing curved-vs-linear hybrid-split verdict, computed
    /// once in [`Self::canonicalize_charts_post_fit`] after the joint fit
    /// converges. Each eligible `d = 1` atom's fitted curved image is adjudicated
    /// against its straight (linear special-case) sub-model on the common
    /// rank-aware Laplace evidence scale. `None` until the post-fit pass runs (or
    /// when no atom is eligible). Surfaced in the Python model output so a user
    /// sees which atoms genuinely earn their curvature and which collapse to the
    /// linear tail. Read via [`Self::hybrid_split_report`].
    pub(crate) hybrid_split_report: Option<crate::hybrid_split::SaeHybridSplitReport>,
    /// Per-atom inner-decoder-smooth byproducts harvested post-fit (#1097 /
    /// #1103), one entry per atom in [`Self::atoms`] order. Each is the fixed
    /// fitted snapshot the residual-gauge certificate's three post-PIRLS atom
    /// inference reports consume
    /// ([`crate::identifiability::AtomInnerFit`]). `None` until
    /// [`Self::set_atom_inner_fits`] runs (it needs the reconstruction target
    /// `Z`, available only at the post-fit harness seam where the dispersion is
    /// also profiled); a per-atom `None` means that atom had no active rows or a
    /// degenerate inner design. Read by [`Self::to_residual_gauge_model`], which
    /// attaches each onto its [`crate::identifiability::FittedAtom`].
    pub(crate) atom_inner_fits:
        Option<Vec<Option<crate::identifiability::AtomInnerFit>>>,
    /// #1228 — the trained dictionary's hybrid-collapsed linear images, attached
    /// to an OOS term so held-out reconstruction decodes verdict-linear `d = 1`
    /// slots by the SAME straight sub-model the training reconstruction used.
    /// `None` on a fitted term (its collapse policy lives in
    /// [`Self::hybrid_split_report`]); `Some` only on the fresh OOS term built in
    /// `sae_manifold_predict_oos`, set from the serialized payload via
    /// [`Self::set_hybrid_linear_images`]. Consulted by
    /// [`Self::hybrid_linear_image_map`] so train and OOS share one collapse map.
    pub(crate) oos_linear_images: Option<Vec<crate::hybrid_split::AtomLinearImage>>,
}

impl Clone for SaeManifoldTerm {
    fn clone(&self) -> Self {
        Self {
            atoms: self.atoms.clone(),
            assignment: self.assignment.clone(),
            temperature_schedule: self.temperature_schedule.clone(),
            last_row_layout: self.last_row_layout.clone(),
            row_metric: self.row_metric.clone(),
            collapse_events: self.collapse_events.clone(),
            row_loss_weights: self.row_loss_weights.clone(),
            last_frames_active: self.last_frames_active,
            assembly_chunk_override: self.assembly_chunk_override,
            fixed_decoder_assembly: false,
            // Persisted configuration (like the assignment mode), carried across
            // clones so a cloned term optimizes the same compact top-`k` problem.
            softmax_active_cap: self.softmax_active_cap,
            border_hbb_workspace: Array2::<f64>::zeros((0, 0)),
            certificate_dispersion: self.certificate_dispersion,
            curvature_walk_report: self.curvature_walk_report.clone(),
            expected_evidence_gauge_deflated_directions: self
                .expected_evidence_gauge_deflated_directions,
            evidence_gauge_deflation_reanchors: self.evidence_gauge_deflation_reanchors,
            evidence_gauge_deflation_last_delta_sign: self.evidence_gauge_deflation_last_delta_sign,
            dictionary_cocollapse_reseeds: self.dictionary_cocollapse_reseeds,
            // Transient in-fit multi-start incumbent — not part of the persisted
            // term identity (like `border_hbb_workspace`); a fresh clone starts
            // with no incumbent and rebuilds it if it re-enters co-collapse.
            best_cocollapse_incumbent: None,
            // Transient per-assembly frozen gate — rebuilt at the next assembly.
            decoder_repulsion_gate: None,
            // #1625 — transient per-assembly frozen barrier coactivation; rebuilt
            // at the next assembly, exactly like the repulsion gate above.
            barrier_coactivation_gate: None,
            hybrid_split_report: self.hybrid_split_report.clone(),
            atom_inner_fits: self.atom_inner_fits.clone(),
            oos_linear_images: self.oos_linear_images.clone(),
        }
    }
}

/// Snapshot of exactly the mutable term state that an `apply_newton_step` +
/// `loss` line-search trial perturbs: per-atom decoder coefficients, the
/// `refresh_basis`-rebuilt basis evaluations (`basis_values`, `basis_jacobian`),
/// and the live intrinsic smoothness Gram read by the objective, plus the
/// assignment logits and latent coordinates.
///
/// Static fields (atom names, basis kinds, basis-evaluator `Arc`s, assignment
/// mode, temperature schedule) are *not* snapshotted: they are invariant across
/// an inner Newton line search, so the previous `self.clone()` per halving
/// re-copied them needlessly. Cloning only the line-search state keeps the
/// `O(N·M·d)` `basis_jacobian` copy off the per-halving hot path (one snapshot
/// before the search, one restore per rejected trial) instead of firing it on
/// every Armijo backtrack.
///
/// The canonical `smooth_penalty_raw` / `smooth_penalty_order` are static, but
/// the live intrinsic roughness Gram `smooth_penalty` is mutable state: it is
/// refreshed by assembly from the current decoder and basis Jacobian, and the
/// line-search objective reads it directly. Restoring it with the decoder and
/// basis caches keeps every rejected trial's baseline and nonlinear objective
/// on the same lagged-diffusivity quadratic.
#[derive(Debug)]
pub(crate) struct SaeManifoldMutableState {
    /// Per-atom `(basis_values, basis_jacobian, decoder_coefficients, smooth_penalty)`.
    pub(crate) atoms: Vec<(Array2<f64>, Array3<f64>, Array2<f64>, Array2<f64>)>,
    pub(crate) logits: Array2<f64>,
    pub(crate) coords: Vec<LatentCoordValues>,
    pub(crate) last_row_layout: Option<SaeRowLayout>,
}