gam_solve/reml/reml_outer_engine/penalty_coordinate.rs
1use super::*;
2pub use gam_problem::PenaltyCoordinate;
3
4/// Exact pseudo-logdeterminant log|S|₊ and its derivatives with respect to ρ.
5///
6/// # Exact pseudo-logdet on the positive eigenspace
7///
8/// For S(ρ) = Σ exp(ρ_k) S_k with S_k ⪰ 0, the nullspace
9/// N(S) = ∩_k N(S_k) is structurally fixed (independent of ρ).
10/// No eigenvalue of S crosses zero during optimization, so the
11/// pseudo-logdet L = Σ_{σ_i > 0} log σ_i is C∞ in ρ.
12///
13/// ## Computation
14///
15/// Eigendecompose S, identify positive eigenvalues σ_i > ε (where ε is a
16/// relative threshold for numerical zero detection), then:
17///
18/// L(S) = Σ_{positive} log σ_i
19/// ∂_k L = tr(S⁺ A_k) where A_k = λ_k S_k
20/// ∂²_kl L = δ_{kl} ∂_k L − tr(S⁺ A_l S⁺ A_k)
21///
22/// S⁺ is the Moore-Penrose pseudoinverse restricted to the positive
23/// eigenspace. These are the exact derivatives of L — no δ-regularization,
24/// no nullity metadata, no chain-rule inconsistencies.
25#[derive(Clone, Debug)]
26pub struct PenaltyLogdetDerivs {
27 /// L(S) = log|S|₊ — the exact pseudo-logdeterminant on the positive eigenspace.
28 ///
29 /// L(S) = Σ_{σ_i > ε} log σ_i, where ε is a relative threshold that
30 /// identifies the structural nullspace directly from the eigenspectrum.
31 pub value: f64,
32 /// ∂/∂ρₖ L(S) — first derivatives (one per smoothing parameter).
33 ///
34 /// ∂_k L = tr(S⁺ Aₖ) where Aₖ = λₖ Sₖ and S⁺ is the pseudoinverse
35 /// restricted to the positive eigenspace.
36 pub first: Array1<f64>,
37 /// ∂²/(∂ρₖ∂ρₗ) L(S) — second derivatives (for outer Hessian).
38 ///
39 /// ∂²_kl L = δ_{kl} ∂_k L − λₖ λₗ tr(S⁺ Sₖ S⁺ Sₗ).
40 pub second: Option<Array2<f64>>,
41}
42
43/// Unified representation of a single smoothing-parameter penalty coordinate.
44///
45
46// PenaltyLogdetEigenspace, build_penalty_logdet_eigenspace,
47// scaled_penalty_logdet_nullspace_leakage, and frobenius_inner_same_shape
48// have been replaced by the canonical PenaltyPseudologdet in
49// super::super::penalty_logdet. All callers now use that module directly.
50
51/// Reduced trace kernel `K = U · M · Uᵀ` for pseudo-logdet REML/LAML
52/// criteria: an orthonormal column basis `u_s` (p × r) plus the r × r
53/// symmetric reduced kernel `h_proj_inverse`, with `tr(K · A)` evaluated as
54/// `tr(M · Uᵀ A U)` so contractions run on the r-dimensional subspace.
55///
56/// Two producers install it, with different (documented) exactness domains:
57///
58/// 1. **Intrinsic spectral form (#901, the GLM dense paths in runtime.rs —
59/// `intrinsic_hessian_pseudo_logdet_parts`):** `u_s = U_H`, the kept
60/// eigenvectors of the penalized Hessian `H_pen`, and `h_proj_inverse =
61/// diag(1/σ_a)`. Then `K = H_pen⁺` exactly, and `tr(K · Ḣ)` is the exact
62/// first derivative of the cost's `log|H_pen|₊` along **every** drift
63/// direction — penalty-supported or not, moving-subspace ψ drifts
64/// included — because on a constant-rank stratum first-order eigenvector
65/// motion cancels out of the pseudo-logdet derivative. This object can be
66/// traced against the GLM IFT correction `D_β H[v] = X' diag(c ⊙ X v) X`
67/// (which leaks onto `null(S)` via the intercept column) without error.
68///
69/// 2. **Range(Sλ) Schur block (#752, `joint_penalty_subspace_trace_parts`
70/// in custom_family.rs):** `u_s` spans `range(Sλ)` and `h_proj_inverse =
71/// U_Sᵀ (H+Sλ)⁺ U_S`. For penalty-supported `A` (`A = ∂Sλ/∂ρ`), the
72/// identity `U_S U_Sᵀ A U_S U_Sᵀ = A` gives `tr(K · A) = tr((H+Sλ)⁺ A) =
73/// d log|H+Sλ|₊/dρ` — exact for the ρ family. It is **not** exact for
74/// drifts with `null(Sλ)` support (GLM cubic corrections, ψ basis
75/// drifts); paths that carry such drifts must install form 1.
76///
77/// Historically this struct carried a third reading — `(U_Sᵀ H U_S)⁻¹`, the
78/// plain projected inverse paired with the projected cost `log|U_Sᵀ H U_S|₊`.
79/// That object is WRONG as a REML determinant term: splitting `H` over
80/// `range(S) ⊕ ker(S)` as `[[A,B],[Bᵀ,C]]`, the projected logdet is
81/// `log det A`, dropping the θ-dependent Schur curvature
82/// `log det(C − BᵀA⁻¹B)` of the likelihood-identified, penalty-null block
83/// (sign-flipped ρ-gradients, ~1e5 ψ blow-ups vs FD — #901). No producer
84/// builds it anymore.
85#[derive(Clone, Debug)]
86pub struct PenaltySubspaceTrace {
87 pub u_s: Array2<f64>,
88 pub h_proj_inverse: Array2<f64>,
89}
90
91impl PenaltySubspaceTrace {
92 /// Compute `tr(K · A)` where `K = U_S · h_proj_inverse · U_Sᵀ` — the
93 /// pseudo-logdet trace kernel (see the struct doc for the two producer
94 /// forms and their exactness domains).
95 ///
96 /// Uses the identity `tr(K · A) = tr(h_proj_inverse · U_Sᵀ A U_S)` so the
97 /// reduction runs on the r × r subspace rather than materializing K.
98 pub fn trace_projected_logdet(&self, a: &Array2<f64>) -> f64 {
99 gam_terms::construction::trace_penalty_covariance_in_orthogonal_basis(
100 a,
101 &self.u_s,
102 &self.h_proj_inverse,
103 )
104 }
105
106 /// Reduce a p × p matrix `A` to its r × r projection `U_Sᵀ · A · U_S`.
107 ///
108 /// Exposed so callers that need the same reduced matrix for both the
109 /// single-trace `tr(K · A)` and the cross-trace `tr(K · A · K · B)`
110 /// can avoid repeating the p × p · p × r matmuls. Routes through
111 /// faer's parallel SIMD GEMM (`fast_atb` / `fast_ab`) so the p-large
112 /// contraction axis amortizes across all cores.
113 pub fn reduce(&self, a: &Array2<f64>) -> Array2<f64> {
114 let u_s_t_a = gam_linalg::faer_ndarray::fast_atb(&self.u_s, a);
115 gam_linalg::faer_ndarray::fast_ab(&u_s_t_a, &self.u_s)
116 }
117
118 /// Compute `tr(H_proj⁻¹ · R)` given an already-reduced `R = U_Sᵀ A U_S`.
119 pub fn trace_projected_logdet_reduced(&self, r_mat: &Array2<f64>) -> f64 {
120 gam_terms::construction::trace_reduced_penalty_covariance(r_mat, &self.h_proj_inverse)
121 }
122
123 /// Cross-trace given pre-reduced blocks `R_A = U_Sᵀ A U_S`, `R_B = U_Sᵀ B U_S`.
124 pub fn trace_projected_logdet_cross_reduced(&self, ra: &Array2<f64>, rb: &Array2<f64>) -> f64 {
125 // left = H_proj⁻¹ · R_A ; right = H_proj⁻¹ · R_B ; tr(left · right).
126 let left = self.h_proj_inverse.dot(ra);
127 let right = self.h_proj_inverse.dot(rb);
128 trace_matrix_product(&left, &right)
129 }
130
131 /// Reduce a `HyperOperator` `A` to its `r × r` projection
132 /// `U_Sᵀ · A · U_S` without materializing the dense `p × p` block.
133 /// Uses `A.mul_mat(U_S)` so an Hv-only operator is probed in `r` matvecs
134 /// (each `O(work_of_A)`), then a single `r × p × r` reduction routed
135 /// through faer's parallel SIMD GEMM (`fast_atb`).
136 pub fn reduce_operator<O>(&self, a: &O) -> Array2<f64>
137 where
138 O: HyperOperator + ?Sized,
139 {
140 let au = a.mul_mat(&self.u_s);
141 gam_linalg::faer_ndarray::fast_atb(&self.u_s, &au)
142 }
143
144 /// `tr(K · A)` for `A` exposed only as a `HyperOperator`. Mirrors
145 /// [`Self::trace_projected_logdet`] without forcing dense materialization
146 /// of `A`.
147 pub fn trace_operator<O>(&self, a: &O) -> f64
148 where
149 O: HyperOperator + ?Sized,
150 {
151 self.trace_projected_logdet_reduced(&self.reduce_operator(a))
152 }
153
154 /// Projected leverage `h^{G,proj}_i = Xᵢᵀ · K · Xᵢ` for every row of `x`.
155 ///
156 /// Computed in bulk as `Z = X · U_S` (`n × r`) then
157 /// `h^{G,proj}_i = (Z H_proj⁻¹ Zᵀ)_{ii} = Σ_{a,b} Z_{ia} (H_proj⁻¹)_{ab} Z_{ib}`,
158 /// total cost `O(n · p · r + n · r²)` — strictly cheaper than `n` calls
159 /// to [`Self::apply`] because the `n × p · p × r` GEMM streams the
160 /// `p`-axis once. Streams `X` through `try_row_chunk` so operator-backed
161 /// (Lazy) designs at large scale never densify the full `(n × p)` block.
162 pub fn xt_projected_kernel_x_diagonal(&self, x: &DesignMatrix) -> Array1<f64> {
163 let n = x.nrows();
164 let p = x.ncols();
165 let r = self.u_s.ncols();
166 assert_eq!(self.u_s.nrows(), p);
167 assert_eq!(self.h_proj_inverse.nrows(), r);
168 assert_eq!(self.h_proj_inverse.ncols(), r);
169
170 let block = {
171 const TARGET_CHUNK_FLOATS: usize = 1 << 16;
172 (TARGET_CHUNK_FLOATS / p.max(1)).clamp(1, n.max(1))
173 };
174
175 let mut h = Array1::<f64>::zeros(n);
176 let mut start = 0usize;
177 while start < n {
178 let end = (start + block).min(n);
179 let rows = x.try_row_chunk(start..end).unwrap_or_else(|err| {
180 // SAFETY: `start..end` is constructed from
181 // `0..n = 0..x.nrows()` with `end = (start+block).min(n)`,
182 // so it is always a valid sub-range of `x`. Failure means
183 // the operator broke its row-chunk contract.
184 // SAFETY: row range built from 0..x.nrows(); failure means operator broke its contract.
185 reml_contract_panic(format!(
186 "xt_projected_kernel_x_diagonal: row chunk failed: {err}"
187 ))
188 });
189 // Z_chunk = rows · U_S ((end-start) × r).
190 let z_chunk = gam_linalg::faer_ndarray::fast_ab(&rows, &self.u_s);
191 // h_i = Σ_{a,b} Z_{ia} (H_proj⁻¹)_{ab} Z_{ib}.
192 for (i, row_z) in z_chunk.outer_iter().enumerate() {
193 let mut acc = 0.0;
194 for (z_a, h_row) in row_z
195 .iter()
196 .copied()
197 .zip(self.h_proj_inverse.rows().into_iter())
198 {
199 let mut inner = 0.0;
200 for (h_value, z_b) in h_row.iter().copied().zip(row_z.iter().copied()) {
201 inner += h_value * z_b;
202 }
203 acc += z_a * inner;
204 }
205 h[start + i] = acc;
206 }
207 start = end;
208 }
209 h
210 }
211
212 /// Projected bilinear pseudo-inverse `aᵀ · K⁺ · b` where
213 /// `K⁺ = U_S · H_proj⁻¹ · U_Sᵀ`.
214 ///
215 /// Used by the rank-deficient LAML IFT correction path: when `b ∈
216 /// col(S_k) ⊂ range(S_+)`, applying the projected pseudo-inverse
217 /// instead of the full `H⁻¹` strips spurious null-space noise from
218 /// `a` (≈ the outer-stationarity residual `r`) before the inverse,
219 /// without biasing the numerator. Costs `O(p·r + r²)` versus the
220 /// `O(p²·r)` full solve.
221 pub fn bilinear_pseudo_inverse(&self, a: &Array1<f64>, b: &Array1<f64>) -> f64 {
222 let proj_a = gam_linalg::faer_ndarray::fast_atv(&self.u_s, a);
223 let proj_b = gam_linalg::faer_ndarray::fast_atv(&self.u_s, b);
224 let h_proj_inv_b = self.h_proj_inverse.dot(&proj_b);
225 proj_a.dot(&h_proj_inv_b)
226 }
227
228 /// Euclidean projection onto the retained penalty/Hessian range used by
229 /// this projected kernel: `P_S a = U_S U_Sᵀ a`.
230 pub fn project_onto_subspace(&self, a: &Array1<f64>) -> Array1<f64> {
231 let proj_a = gam_linalg::faer_ndarray::fast_atv(&self.u_s, a);
232 gam_linalg::faer_ndarray::fast_av(&self.u_s, &proj_a)
233 }
234
235 /// Apply the projected pseudo-inverse `K = U_S · H_proj⁻¹ · U_Sᵀ` to a
236 /// vector `a`, returning the minimum-norm solution `v = K · a` of the
237 /// system `H v = a` restricted to `range(S₊)`.
238 ///
239 /// This is the correct stand-in for `H⁻¹ · a` in all per-coordinate
240 /// outer-gradient/Hessian formulas when the rank-deficient LAML fix is
241 /// active (`penalty_subspace_trace = Some`). The full `H⁻¹ · a` solve
242 /// amplifies any component of `a` outside `range(H_free)` by
243 /// `1/σ_min(H_active_normal)` — which on large-scale survival
244 /// marginal-slope is ~10¹² and propagates into outer gradients of
245 /// magnitude 10¹⁴, suppressed by the envelope tripwire downstream and
246 /// killing every seed before the fit can take a step. This operator may
247 /// only drop components that the inner KKT certificate has already made
248 /// negligible; `ProjectedKktResidual::projected_into_reduced_range` enforces
249 /// that contract before the IFT correction uses this pseudo-inverse. With
250 /// that guard, the returned gradient lives on the constrained manifold,
251 /// matching the projected `log|U_Sᵀ H U_S|` term.
252 ///
253 /// Costs `O(p·r + r²)` for the two `U_S`-contractions plus the `r × r`
254 /// solve — strictly cheaper than the `O(p²)` full `hop.solve_multi`
255 /// when `r ≪ p`, and bounded regardless of `σ_min(H)`.
256 pub fn apply_pseudo_inverse(&self, a: &Array1<f64>) -> Array1<f64> {
257 // The one sensitivity operator (#935): the projected inverse action
258 // `U_S · H_proj⁻¹ · U_Sᵀ · a` has a single spelling, shared with every
259 // other consumer of `FittedInverse::Projected`.
260 self.sensitivity().apply(a)
261 }
262
263 /// View this projected trace kernel as the unified [`FitSensitivity`]
264 /// (#935) over the rank-deficient LAML convention `K = U_S · H_proj⁻¹ ·
265 /// U_Sᵀ`. The trace machinery stays here; the *inverse action* is the
266 /// shared operator, so no site can disagree about what `H⁻¹` means.
267 pub fn sensitivity(&self) -> crate::sensitivity::FitSensitivity<'_> {
268 crate::sensitivity::FitSensitivity::from_projected(&self.u_s, &self.h_proj_inverse)
269 }
270
271 /// Build the **constrained pseudo-inverse kernel**
272 /// `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`
273 /// from this penalty-projected kernel `K_S` and the *active* row block
274 /// `A_act` of the joint linear inequality constraint matrix.
275 ///
276 /// `K_T` is the **Moore-Penrose pseudo-inverse of `H` restricted to
277 /// `T = range(S₊) ∩ ker(A_act)`** — the smooth manifold the inner
278 /// solver actually moves on at a constrained-stationary point. It is
279 /// exactly the kernel that solves the per-coordinate saddle-point
280 /// IFT system
281 ///
282 /// ```text
283 /// [ H Aᵀ_act ] [ ∂β/∂ρ_k ] [ −a_k ]
284 /// [ A_act 0 ] [ ∂λ/∂ρ_k ] = [ 0 ]
285 /// ```
286 ///
287 /// with `∂β/∂ρ_k = −K_T · a_k`. Using `K_T` for the per-coordinate
288 /// mode response `v_k` makes the outer gradient the *exact* derivative
289 /// of the projected Laplace cost `log|U_Tᵀ H U_T|`, where `U_T` is an
290 /// orthonormal basis of `T` — the marginal-likelihood determinant the
291 /// inner is actually drawing on.
292 ///
293 /// Returns a [`ConstrainedSubspaceKernel`] handle that caches the
294 /// small `k_active × k_active` Schur complement so subsequent
295 /// `apply_pseudo_inverse` calls for different RHS reuse it. When the
296 /// active set is empty the handle degrades to a pass-through over
297 /// `self` (no extra work).
298 ///
299 /// Total precompute cost: `k_active` calls to
300 /// [`Self::apply_pseudo_inverse`] (one per active row) plus a
301 /// `k_active × k_active` Cholesky/QR. Per-vector `apply` cost: one
302 /// `K_S` apply + one `k_active × p` matvec + one small triangular
303 /// solve + one `p × k_active` matvec.
304 pub fn with_active_constraints<'a>(
305 &'a self,
306 a_act: ndarray::ArrayView2<'a, f64>,
307 ) -> ConstrainedSubspaceKernel<'a> {
308 let k_active = a_act.nrows();
309 if k_active == 0 {
310 return ConstrainedSubspaceKernel {
311 kernel: self,
312 z: Array2::zeros((0, self.u_s.nrows())),
313 a_act,
314 m_inv: Array2::zeros((0, 0)),
315 k_active: 0,
316 };
317 }
318 // Z = K_S · Aᵀ_act, shape (p × k_active).
319 let p = self.u_s.nrows();
320 let mut z = Array2::<f64>::zeros((p, k_active));
321 for j in 0..k_active {
322 let a_row = a_act.row(j).to_owned();
323 let k_s_a_row = self.apply_pseudo_inverse(&a_row);
324 z.column_mut(j).assign(&k_s_a_row);
325 }
326 // M = A_act · Z (shape k_active × k_active, symmetric PSD on
327 // range(K_S) ∩ image(A_actᵀ); on a rank-deficient overlap we
328 // add a tiny diagonal regulariser so the inversion remains
329 // bounded — same noise-floor strategy as elsewhere in this
330 // module).
331 let mut m = a_act.dot(&z);
332 // Symmetrise (numerical noise from the matmul leaves small skew).
333 for i in 0..k_active {
334 for j in 0..i {
335 let avg = 0.5 * (m[[i, j]] + m[[j, i]]);
336 m[[i, j]] = avg;
337 m[[j, i]] = avg;
338 }
339 }
340 // Eigendecomposition-based Moore-Penrose pseudo-inverse with a
341 // relative spectral cutoff. This is the principled treatment of
342 // rank deficiency in `A_act` when restricted to `range(S₊)`:
343 // some active constraint rows may be linearly dependent after
344 // projection (e.g. several monotonicity rows pinning the same
345 // flat region all reduce to the same row in `range(S₊)`).
346 // A plain `M⁻¹` then amplifies near-null directions; the
347 // pseudo-inverse drops them at a relative threshold
348 // `tol = eps · k_active · σ_max(M)`, which is the standard
349 // NumPy/LAPACK convention and exactly what Codex flagged as
350 // necessary in the math review.
351 let (evals, evecs) = m
352 .eigh(faer::Side::Lower)
353 .unwrap_or_else(|_| (Array1::<f64>::zeros(k_active), Array2::<f64>::eye(k_active)));
354 let sigma_max = evals.iter().copied().fold(0.0_f64, f64::max).max(0.0);
355 let tol = f64::EPSILON * (k_active as f64) * sigma_max.max(1.0);
356 let mut m_inv = Array2::<f64>::zeros((k_active, k_active));
357 let mut dropped = 0usize;
358 for q in 0..k_active {
359 if evals[q] > tol {
360 let inv_sigma = 1.0 / evals[q];
361 // Outer product u_q u_qᵀ scaled by 1/σ_q.
362 for i in 0..k_active {
363 for j in 0..k_active {
364 m_inv[[i, j]] += inv_sigma * evecs[[i, q]] * evecs[[j, q]];
365 }
366 }
367 } else {
368 dropped += 1;
369 }
370 }
371 if dropped > 0 {
372 log::debug!(
373 "[constrained-subspace kernel] dropped {} of {} active-constraint directions \
374 (rank-deficient on range(S₊)); pseudo-inverse threshold = {:.3e}",
375 dropped,
376 k_active,
377 tol,
378 );
379 }
380 ConstrainedSubspaceKernel {
381 kernel: self,
382 z,
383 a_act,
384 m_inv,
385 k_active,
386 }
387 }
388}
389
390/// Per-evaluation handle that combines a penalty-projected
391/// [`PenaltySubspaceTrace`] with an active inequality-constraint block,
392/// producing the constraint-aware pseudo-inverse
393/// `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. See
394/// [`PenaltySubspaceTrace::with_active_constraints`] for the math.
395///
396/// Caches the small `k_active × k_active` Schur inverse so subsequent
397/// per-coordinate `apply` calls only do `O(p · k_active)` work each.
398pub struct ConstrainedSubspaceKernel<'a> {
399 pub(crate) kernel: &'a PenaltySubspaceTrace,
400 /// `Z = K_S · Aᵀ_act`, shape `(p × k_active)`.
401 pub(crate) z: Array2<f64>,
402 /// Active-row block of the joint constraint matrix.
403 pub(crate) a_act: ndarray::ArrayView2<'a, f64>,
404 /// `(A_act · K_S · Aᵀ_act)⁻¹`, shape `(k_active × k_active)`.
405 pub(crate) m_inv: Array2<f64>,
406 pub(crate) k_active: usize,
407}
408
409impl<'a> ConstrainedSubspaceKernel<'a> {
410 /// Apply `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S` to `a`. The result
411 /// lies in `range(S₊) ∩ ker(A_act)` — the smooth manifold the inner
412 /// solver actually moves on at a constrained-stationary point.
413 pub fn apply_pseudo_inverse(&self, a: &Array1<f64>) -> Array1<f64> {
414 let v_s = self.kernel.apply_pseudo_inverse(a);
415 if self.k_active == 0 {
416 return v_s;
417 }
418 // mu = M_inv · (A_act · v_s)
419 let t = self.a_act.dot(&v_s);
420 let mu = self.m_inv.dot(&t);
421 // v = v_s - Z · mu
422 let correction = self.z.dot(&mu);
423 v_s - &correction
424 }
425
426 /// Whether any active constraints contribute (when false this kernel
427 /// is identical to the bare [`PenaltySubspaceTrace::apply_pseudo_inverse`]).
428 pub fn has_active_constraints(&self) -> bool {
429 self.k_active > 0
430 }
431}
432
433/// Tangency self-audit gate for the constrained mode-response arm: the
434/// emitted `v = K_T · rhs` must lie in `ker(A_act)` by construction, so
435/// `|A_act · v|` is compared against this fraction of the cancellation
436/// scale `|A_act| · |v|` (per active row). Generous enough that legitimate
437/// rank-deficient active sets (whose dropped Schur directions leave
438/// ε-level residue, see [`PenaltySubspaceTrace::with_active_constraints`])
439/// never trip it; the historical failure mode it guards (the d6b17a7f
440/// `1/σ_min ≈ 10¹²` null-space amplification) exceeds it by six orders.
441pub(crate) const THETA_MODE_RESPONSE_TANGENCY_GATE: f64 = 1e-6;
442
443/// #931 migration pass 2 — the ThetaDirection shared-drift pass: the ONE
444/// per-evaluation selection of the IFT mode-response kernel behind every
445/// `dβ̂/dθ = −K · ∂g/∂θ` solve in the outer gradient/Hessian assembly.
446///
447/// Before this object existed, four sites (the gradient solve stack in
448/// `reml_laml_evaluate`, the ρ- and ext-coordinate standalone fallbacks in
449/// `compute_outer_hessian`, and the standalone fallback in
450/// `build_outer_hessian_operator`) each re-implemented the same selection
451/// rule by hand, with comments warning each other to "mirror the
452/// selection exactly, otherwise the operator-form Hessian and dense
453/// materialization disagree on every entry". A hand-copied convention every
454/// caller must remember is precisely the objective↔gradient desync surface
455/// (#748/#752/#901 class) the criterion-as-atoms architecture (#931)
456/// removes. Now the rule is DECIDED in exactly one constructor and every
457/// consumer is a contraction of the same kernel object — the gradient and
458/// both Hessian representations structurally cannot pick different
459/// inverses for the same evaluation point:
460///
461/// * Active inequality constraints recorded on the inner solution → the
462/// lifted constrained kernel
463/// `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. The inner SCOP solver
464/// clamps β̂(θ) onto `T = range(S₊) ∩ ker(A_act)`, so the true IFT
465/// derivative lives in T and the lifted kernel gives the minimum-norm
466/// solution there; the full solve would amplify any RHS component
467/// outside `range(H_free)` by `1/σ_min(H_active_normal)` — ~10¹² on
468/// large-scale survival marginal-slope (commit d6b17a7f).
469/// * Otherwise → the FULL Hessian solve `v = H⁻¹ · rhs`, even when the
470/// LAML cost surface uses the projected logdet `½ log|U_Sᵀ H U_S|`:
471/// the inner solver converges β̂ ∈ R^p in the unconstrained full
472/// space, so the IFT identity demands the full inverse, and the
473/// penalty-subspace projection acts on the TRACE contraction side
474/// only. Routing through bare `K_S` here would discard the
475/// `null(S₊)` component of dβ̂/dθ — the near-separable ψ-gradient
476/// blow-up pinned by `duchon_probit_per_row_dnu_dpsi_fd_vs_analytic`.
477///
478/// The two emission shapes (`respond_one` per-vector, `respond_stack`
479/// batched) exist because the call sites have different RHS layouts and
480/// their solve shapes must stay bit-identical to the pre-port assembly
481/// (per-column GEMV vs blocked GEMM sum in different orders) — NOT because
482/// a site may choose a different kernel. Both shapes dispatch on the same
483/// stored decision.
484///
485/// This is the `Sensitivity`-operator half of the `ThetaDirection`
486/// calculus sketched in `atoms.rs`: the direction's `β̇` channel is a
487/// contraction of this kernel, so atoms borrowing the shared drift can no
488/// longer see a different chain rule than their neighbors.
489pub(crate) struct ThetaModeResponseKernel<'s> {
490 pub(crate) hop: &'s dyn HessianOperator,
491 /// `Some` exactly when the selection rule chose the lifted constrained
492 /// kernel. Built once per evaluation point (one Schur-complement
493 /// factorization), shared by every gradient/Hessian consumer — the
494 /// pre-port code rebuilt it per consumer site.
495 pub(crate) constrained: Option<ConstrainedSubspaceKernel<'s>>,
496}
497
498impl<'s> ThetaModeResponseKernel<'s> {
499 /// The ONE place the mode-response kernel selection rule lives.
500 pub(crate) fn select(
501 subspace: Option<&'s PenaltySubspaceTrace>,
502 active_constraints: Option<&'s ActiveLinearConstraintBlock>,
503 hop: &'s dyn HessianOperator,
504 ) -> Self {
505 let constrained = match (subspace, active_constraints) {
506 (Some(kernel), Some(block)) => {
507 let ck = kernel.with_active_constraints(block.a.view());
508 ck.has_active_constraints().then_some(ck)
509 }
510 _ => None,
511 };
512 Self { hop, constrained }
513 }
514
515 /// Mode response for one right-hand side: `K_T · rhs` under active
516 /// constraints, `H⁻¹ · rhs` (single-RHS `solve`) otherwise. Used by the
517 /// per-coordinate fallbacks whose pre-port assembly solved one vector at
518 /// a time — the single-RHS shape is preserved bit-identically.
519 pub(crate) fn respond_one(&self, rhs: &Array1<f64>) -> Array1<f64> {
520 match self.constrained.as_ref() {
521 Some(ck) => {
522 let v = ck.apply_pseudo_inverse(rhs);
523 self.certify_tangency(ck, &v);
524 v
525 }
526 None => self.hop.solve(rhs),
527 }
528 }
529
530 /// Mode responses for a column-stacked RHS block: per-column `K_T`
531 /// applies under active constraints (the lifted kernel has no blocked
532 /// form), one batched `solve_multi` otherwise (BLAS-3 / GPU batched
533 /// route) — exactly the shapes the stacked call sites used pre-port.
534 /// Zero RHS columns (box-masked ρ coordinates) emit exact zeros through
535 /// either arm, since both kernels are linear.
536 pub(crate) fn respond_stack(&self, rhs_stack: &Array2<f64>) -> Array2<f64> {
537 match self.constrained.as_ref() {
538 Some(ck) => {
539 let mut out = Array2::<f64>::zeros(rhs_stack.raw_dim());
540 for (j, col) in rhs_stack.columns().into_iter().enumerate() {
541 let v = ck.apply_pseudo_inverse(&col.to_owned());
542 self.certify_tangency(ck, &v);
543 out.column_mut(j).assign(&v);
544 }
545 out
546 }
547 None => self.hop.solve_multi(rhs_stack),
548 }
549 }
550
551 /// Per-atom certify body (#934 FD-self-audit pattern, applied as an
552 /// exact structural invariant): every constrained emission must lie in
553 /// `ker(A_act)` — `A_act · v = 0` is the defining property of `K_T`'s
554 /// range, so a violation can only mean the kernel object and the
555 /// emission desynced. Checked on every constrained response (cost
556 /// `O(k_active · p)`, negligible next to the apply itself) against the
557 /// row-wise cancellation scale `|A_act| · |v|`; a violation does not
558 /// fail the fit — it names the atom loudly in the `[CERTIFICATE]`
559 /// stream, exactly like the outer-optimum criterion audit. The
560 /// unconstrained arm carries no separate certify: its coherence with
561 /// the criterion VALUE is audited end-to-end by the #934
562 /// `CriterionCertificate` at every returned optimum.
563 pub(crate) fn certify_tangency(&self, ck: &ConstrainedSubspaceKernel<'_>, v: &Array1<f64>) {
564 let residual = ck.a_act.dot(v);
565 for (row, r) in residual.iter().enumerate() {
566 let scale: f64 = ck
567 .a_act
568 .row(row)
569 .iter()
570 .zip(v.iter())
571 .map(|(a, x)| (a * x).abs())
572 .sum();
573 if r.abs() > THETA_MODE_RESPONSE_TANGENCY_GATE * (scale + f64::EPSILON) {
574 log::warn!(
575 "[CERTIFICATE warning] atom \"theta_mode_response\": constrained IFT \
576 mode response left ker(A_act) — active row {row} residual {:.3e} \
577 exceeds gate {:.1e}·{:.3e}; the lifted kernel K_T and its emission \
578 have desynced (#931 pass-2 invariant)",
579 r.abs(),
580 THETA_MODE_RESPONSE_TANGENCY_GATE,
581 scale,
582 );
583 }
584 }
585 }
586}
587
588impl ProjectedKktResidual {
589 pub(crate) fn projected_into_reduced_range(
590 &self,
591 kernel: &PenaltySubspaceTrace,
592 ) -> Result<Self, String> {
593 match self.subspace {
594 KktResidualSubspace::ReducedRange => Ok(self.clone()),
595 KktResidualSubspace::ActiveProjected => {
596 let reduced_residual = kernel.project_onto_subspace(&self.residual);
597 let dropped_inf = self
598 .residual
599 .iter()
600 .zip(reduced_residual.iter())
601 .map(|(full, reduced)| (full - reduced).abs())
602 .fold(0.0_f64, f64::max);
603 let residual_inf = self
604 .residual
605 .iter()
606 .map(|value| value.abs())
607 .fold(0.0_f64, f64::max);
608 // Default mixed absolute/relative tolerance for the dropped-mass
609 // gate when the caller supplies no explicit `residual_tol`:
610 // ~1e-10 scaled by `1 + ‖r‖∞` so it degrades gracefully with the
611 // residual magnitude.
612 const DEFAULT_KKT_RESIDUAL_REL_TOL: f64 = 1e-10;
613 let tol = self
614 .residual_tol
615 .unwrap_or_else(|| DEFAULT_KKT_RESIDUAL_REL_TOL * (1.0 + residual_inf));
616 let gate = tol;
617 if dropped_inf > gate {
618 return Err(format!(
619 "projected KKT residual contains unresolved mass outside the reduced \
620 Hessian/penalty range: |r_A - r_R|∞={dropped_inf:.3e} > tol={gate:.3e}; \
621 range-projected IFT correction is valid only after the null direction is \
622 explicitly removed/fixed or after the active-projected residual is small"
623 ));
624 }
625 let mut reduced = Self::from_reduced_range(reduced_residual);
626 reduced.residual_tol = self.residual_tol;
627 reduced.free_rank = self.free_rank;
628 Ok(reduced)
629 }
630 }
631 }
632
633 /// The KKT-stationarity tolerance the inner solver applied at the
634 /// producing iterate. Returns `None` when the residual was built
635 /// from a legacy site that hasn't been threaded yet; downstream
636 /// consumers should substitute `f64::NAN` in that case.
637 pub fn residual_tol(&self) -> Option<f64> {
638 self.residual_tol
639 }
640
641 /// Dimensionality of the free subspace: `total_p - active_set_size`
642 /// at the producing iterate. `None` from legacy construction sites.
643 pub fn free_rank(&self) -> Option<usize> {
644 self.free_rank
645 }
646}