Skip to main content

gam_solve/reml/reml_outer_engine/
penalty_coordinate.rs

1use super::*;
2pub use gam_problem::PenaltyCoordinate;
3
4/// Exact pseudo-logdeterminant log|S|₊ and its derivatives with respect to ρ.
5///
6/// # Exact pseudo-logdet on the positive eigenspace
7///
8/// For S(ρ) = Σ exp(ρ_k) S_k with S_k ⪰ 0, the nullspace
9/// N(S) = ∩_k N(S_k) is structurally fixed (independent of ρ).
10/// No eigenvalue of S crosses zero during optimization, so the
11/// pseudo-logdet L = Σ_{σ_i > 0} log σ_i is C∞ in ρ.
12///
13/// ## Computation
14///
15/// Eigendecompose S, identify positive eigenvalues σ_i > ε (where ε is a
16/// relative threshold for numerical zero detection), then:
17///
18///   L(S)     = Σ_{positive} log σ_i
19///   ∂_k L    = tr(S⁺ A_k)            where A_k = λ_k S_k
20///   ∂²_kl L  = δ_{kl} ∂_k L − tr(S⁺ A_l S⁺ A_k)
21///
22/// S⁺ is the Moore-Penrose pseudoinverse restricted to the positive
23/// eigenspace. These are the exact derivatives of L — no δ-regularization,
24/// no nullity metadata, no chain-rule inconsistencies.
25#[derive(Clone, Debug)]
26pub struct PenaltyLogdetDerivs {
27    /// L(S) = log|S|₊ — the exact pseudo-logdeterminant on the positive eigenspace.
28    ///
29    /// L(S) = Σ_{σ_i > ε} log σ_i, where ε is a relative threshold that
30    /// identifies the structural nullspace directly from the eigenspectrum.
31    pub value: f64,
32    /// ∂/∂ρₖ L(S) — first derivatives (one per smoothing parameter).
33    ///
34    /// ∂_k L = tr(S⁺ Aₖ) where Aₖ = λₖ Sₖ and S⁺ is the pseudoinverse
35    /// restricted to the positive eigenspace.
36    pub first: Array1<f64>,
37    /// ∂²/(∂ρₖ∂ρₗ) L(S) — second derivatives (for outer Hessian).
38    ///
39    /// ∂²_kl L = δ_{kl} ∂_k L − λₖ λₗ tr(S⁺ Sₖ S⁺ Sₗ).
40    pub second: Option<Array2<f64>>,
41}
42
43/// Unified representation of a single smoothing-parameter penalty coordinate.
44///
45
46// PenaltyLogdetEigenspace, build_penalty_logdet_eigenspace,
47// scaled_penalty_logdet_nullspace_leakage, and frobenius_inner_same_shape
48// have been replaced by the canonical PenaltyPseudologdet in
49// super::super::penalty_logdet. All callers now use that module directly.
50
51/// Reduced trace kernel `K = U · M · Uᵀ` for pseudo-logdet REML/LAML
52/// criteria: an orthonormal column basis `u_s` (p × r) plus the r × r
53/// symmetric reduced kernel `h_proj_inverse`, with `tr(K · A)` evaluated as
54/// `tr(M · Uᵀ A U)` so contractions run on the r-dimensional subspace.
55///
56/// Two producers install it, with different (documented) exactness domains:
57///
58/// 1. **Intrinsic spectral form (#901, the GLM dense paths in runtime.rs —
59///    `intrinsic_hessian_pseudo_logdet_parts`):** `u_s = U_H`, the kept
60///    eigenvectors of the penalized Hessian `H_pen`, and `h_proj_inverse =
61///    diag(1/σ_a)`. Then `K = H_pen⁺` exactly, and `tr(K · Ḣ)` is the exact
62///    first derivative of the cost's `log|H_pen|₊` along **every** drift
63///    direction — penalty-supported or not, moving-subspace ψ drifts
64///    included — because on a constant-rank stratum first-order eigenvector
65///    motion cancels out of the pseudo-logdet derivative. This object can be
66///    traced against the GLM IFT correction `D_β H[v] = X' diag(c ⊙ X v) X`
67///    (which leaks onto `null(S)` via the intercept column) without error.
68///
69/// 2. **Range(Sλ) Schur block (#752, `joint_penalty_subspace_trace_parts`
70///    in custom_family.rs):** `u_s` spans `range(Sλ)` and `h_proj_inverse =
71///    U_Sᵀ (H+Sλ)⁺ U_S`. For penalty-supported `A` (`A = ∂Sλ/∂ρ`), the
72///    identity `U_S U_Sᵀ A U_S U_Sᵀ = A` gives `tr(K · A) = tr((H+Sλ)⁺ A) =
73///    d log|H+Sλ|₊/dρ` — exact for the ρ family. It is **not** exact for
74///    drifts with `null(Sλ)` support (GLM cubic corrections, ψ basis
75///    drifts); paths that carry such drifts must install form 1.
76///
77/// Historically this struct carried a third reading — `(U_Sᵀ H U_S)⁻¹`, the
78/// plain projected inverse paired with the projected cost `log|U_Sᵀ H U_S|₊`.
79/// That object is WRONG as a REML determinant term: splitting `H` over
80/// `range(S) ⊕ ker(S)` as `[[A,B],[Bᵀ,C]]`, the projected logdet is
81/// `log det A`, dropping the θ-dependent Schur curvature
82/// `log det(C − BᵀA⁻¹B)` of the likelihood-identified, penalty-null block
83/// (sign-flipped ρ-gradients, ~1e5 ψ blow-ups vs FD — #901). No producer
84/// builds it anymore.
85#[derive(Clone, Debug)]
86pub struct PenaltySubspaceTrace {
87    pub u_s: Array2<f64>,
88    pub h_proj_inverse: Array2<f64>,
89}
90
91impl PenaltySubspaceTrace {
92    /// Compute `tr(K · A)` where `K = U_S · h_proj_inverse · U_Sᵀ` — the
93    /// pseudo-logdet trace kernel (see the struct doc for the two producer
94    /// forms and their exactness domains).
95    ///
96    /// Uses the identity `tr(K · A) = tr(h_proj_inverse · U_Sᵀ A U_S)` so the
97    /// reduction runs on the r × r subspace rather than materializing K.
98    pub fn trace_projected_logdet(&self, a: &Array2<f64>) -> f64 {
99        gam_terms::construction::trace_penalty_covariance_in_orthogonal_basis(
100            a,
101            &self.u_s,
102            &self.h_proj_inverse,
103        )
104    }
105
106    /// Reduce a p × p matrix `A` to its r × r projection `U_Sᵀ · A · U_S`.
107    ///
108    /// Exposed so callers that need the same reduced matrix for both the
109    /// single-trace `tr(K · A)` and the cross-trace `tr(K · A · K · B)`
110    /// can avoid repeating the p × p · p × r matmuls.  Routes through
111    /// faer's parallel SIMD GEMM (`fast_atb` / `fast_ab`) so the p-large
112    /// contraction axis amortizes across all cores.
113    pub fn reduce(&self, a: &Array2<f64>) -> Array2<f64> {
114        let u_s_t_a = gam_linalg::faer_ndarray::fast_atb(&self.u_s, a);
115        gam_linalg::faer_ndarray::fast_ab(&u_s_t_a, &self.u_s)
116    }
117
118    /// Compute `tr(H_proj⁻¹ · R)` given an already-reduced `R = U_Sᵀ A U_S`.
119    pub fn trace_projected_logdet_reduced(&self, r_mat: &Array2<f64>) -> f64 {
120        gam_terms::construction::trace_reduced_penalty_covariance(r_mat, &self.h_proj_inverse)
121    }
122
123    /// Cross-trace given pre-reduced blocks `R_A = U_Sᵀ A U_S`, `R_B = U_Sᵀ B U_S`.
124    pub fn trace_projected_logdet_cross_reduced(&self, ra: &Array2<f64>, rb: &Array2<f64>) -> f64 {
125        // left = H_proj⁻¹ · R_A ;  right = H_proj⁻¹ · R_B ;  tr(left · right).
126        let left = self.h_proj_inverse.dot(ra);
127        let right = self.h_proj_inverse.dot(rb);
128        trace_matrix_product(&left, &right)
129    }
130
131    /// Reduce a `HyperOperator` `A` to its `r × r` projection
132    /// `U_Sᵀ · A · U_S` without materializing the dense `p × p` block.
133    /// Uses `A.mul_mat(U_S)` so an Hv-only operator is probed in `r` matvecs
134    /// (each `O(work_of_A)`), then a single `r × p × r` reduction routed
135    /// through faer's parallel SIMD GEMM (`fast_atb`).
136    pub fn reduce_operator<O>(&self, a: &O) -> Array2<f64>
137    where
138        O: HyperOperator + ?Sized,
139    {
140        let au = a.mul_mat(&self.u_s);
141        gam_linalg::faer_ndarray::fast_atb(&self.u_s, &au)
142    }
143
144    /// `tr(K · A)` for `A` exposed only as a `HyperOperator`.  Mirrors
145    /// [`Self::trace_projected_logdet`] without forcing dense materialization
146    /// of `A`.
147    pub fn trace_operator<O>(&self, a: &O) -> f64
148    where
149        O: HyperOperator + ?Sized,
150    {
151        self.trace_projected_logdet_reduced(&self.reduce_operator(a))
152    }
153
154    /// Projected leverage `h^{G,proj}_i = Xᵢᵀ · K · Xᵢ` for every row of `x`.
155    ///
156    /// Computed in bulk as `Z = X · U_S` (`n × r`) then
157    /// `h^{G,proj}_i = (Z H_proj⁻¹ Zᵀ)_{ii} = Σ_{a,b} Z_{ia} (H_proj⁻¹)_{ab} Z_{ib}`,
158    /// total cost `O(n · p · r + n · r²)` — strictly cheaper than `n` calls
159    /// to [`Self::apply`] because the `n × p · p × r` GEMM streams the
160    /// `p`-axis once.  Streams `X` through `try_row_chunk` so operator-backed
161    /// (Lazy) designs at large scale never densify the full `(n × p)` block.
162    pub fn xt_projected_kernel_x_diagonal(&self, x: &DesignMatrix) -> Array1<f64> {
163        let n = x.nrows();
164        let p = x.ncols();
165        let r = self.u_s.ncols();
166        assert_eq!(self.u_s.nrows(), p);
167        assert_eq!(self.h_proj_inverse.nrows(), r);
168        assert_eq!(self.h_proj_inverse.ncols(), r);
169
170        let block = {
171            const TARGET_CHUNK_FLOATS: usize = 1 << 16;
172            (TARGET_CHUNK_FLOATS / p.max(1)).clamp(1, n.max(1))
173        };
174
175        let mut h = Array1::<f64>::zeros(n);
176        let mut start = 0usize;
177        while start < n {
178            let end = (start + block).min(n);
179            let rows = x.try_row_chunk(start..end).unwrap_or_else(|err| {
180                // SAFETY: `start..end` is constructed from
181                // `0..n = 0..x.nrows()` with `end = (start+block).min(n)`,
182                // so it is always a valid sub-range of `x`. Failure means
183                // the operator broke its row-chunk contract.
184                // SAFETY: row range built from 0..x.nrows(); failure means operator broke its contract.
185                reml_contract_panic(format!(
186                    "xt_projected_kernel_x_diagonal: row chunk failed: {err}"
187                ))
188            });
189            // Z_chunk = rows · U_S  ((end-start) × r).
190            let z_chunk = gam_linalg::faer_ndarray::fast_ab(&rows, &self.u_s);
191            // h_i = Σ_{a,b} Z_{ia} (H_proj⁻¹)_{ab} Z_{ib}.
192            for (i, row_z) in z_chunk.outer_iter().enumerate() {
193                let mut acc = 0.0;
194                for (z_a, h_row) in row_z
195                    .iter()
196                    .copied()
197                    .zip(self.h_proj_inverse.rows().into_iter())
198                {
199                    let mut inner = 0.0;
200                    for (h_value, z_b) in h_row.iter().copied().zip(row_z.iter().copied()) {
201                        inner += h_value * z_b;
202                    }
203                    acc += z_a * inner;
204                }
205                h[start + i] = acc;
206            }
207            start = end;
208        }
209        h
210    }
211
212    /// Projected bilinear pseudo-inverse `aᵀ · K⁺ · b` where
213    /// `K⁺ = U_S · H_proj⁻¹ · U_Sᵀ`.
214    ///
215    /// Used by the rank-deficient LAML IFT correction path: when `b ∈
216    /// col(S_k) ⊂ range(S_+)`, applying the projected pseudo-inverse
217    /// instead of the full `H⁻¹` strips spurious null-space noise from
218    /// `a` (≈ the outer-stationarity residual `r`) before the inverse,
219    /// without biasing the numerator. Costs `O(p·r + r²)` versus the
220    /// `O(p²·r)` full solve.
221    pub fn bilinear_pseudo_inverse(&self, a: &Array1<f64>, b: &Array1<f64>) -> f64 {
222        let proj_a = gam_linalg::faer_ndarray::fast_atv(&self.u_s, a);
223        let proj_b = gam_linalg::faer_ndarray::fast_atv(&self.u_s, b);
224        let h_proj_inv_b = self.h_proj_inverse.dot(&proj_b);
225        proj_a.dot(&h_proj_inv_b)
226    }
227
228    /// Euclidean projection onto the retained penalty/Hessian range used by
229    /// this projected kernel: `P_S a = U_S U_Sᵀ a`.
230    pub fn project_onto_subspace(&self, a: &Array1<f64>) -> Array1<f64> {
231        let proj_a = gam_linalg::faer_ndarray::fast_atv(&self.u_s, a);
232        gam_linalg::faer_ndarray::fast_av(&self.u_s, &proj_a)
233    }
234
235    /// Apply the projected pseudo-inverse `K = U_S · H_proj⁻¹ · U_Sᵀ` to a
236    /// vector `a`, returning the minimum-norm solution `v = K · a` of the
237    /// system `H v = a` restricted to `range(S₊)`.
238    ///
239    /// This is the correct stand-in for `H⁻¹ · a` in all per-coordinate
240    /// outer-gradient/Hessian formulas when the rank-deficient LAML fix is
241    /// active (`penalty_subspace_trace = Some`). The full `H⁻¹ · a` solve
242    /// amplifies any component of `a` outside `range(H_free)` by
243    /// `1/σ_min(H_active_normal)` — which on large-scale survival
244    /// marginal-slope is ~10¹² and propagates into outer gradients of
245    /// magnitude 10¹⁴, suppressed by the envelope tripwire downstream and
246    /// killing every seed before the fit can take a step. This operator may
247    /// only drop components that the inner KKT certificate has already made
248    /// negligible; `ProjectedKktResidual::projected_into_reduced_range` enforces
249    /// that contract before the IFT correction uses this pseudo-inverse. With
250    /// that guard, the returned gradient lives on the constrained manifold,
251    /// matching the projected `log|U_Sᵀ H U_S|` term.
252    ///
253    /// Costs `O(p·r + r²)` for the two `U_S`-contractions plus the `r × r`
254    /// solve — strictly cheaper than the `O(p²)` full `hop.solve_multi`
255    /// when `r ≪ p`, and bounded regardless of `σ_min(H)`.
256    pub fn apply_pseudo_inverse(&self, a: &Array1<f64>) -> Array1<f64> {
257        // The one sensitivity operator (#935): the projected inverse action
258        // `U_S · H_proj⁻¹ · U_Sᵀ · a` has a single spelling, shared with every
259        // other consumer of `FittedInverse::Projected`.
260        self.sensitivity().apply(a)
261    }
262
263    /// View this projected trace kernel as the unified [`FitSensitivity`]
264    /// (#935) over the rank-deficient LAML convention `K = U_S · H_proj⁻¹ ·
265    /// U_Sᵀ`. The trace machinery stays here; the *inverse action* is the
266    /// shared operator, so no site can disagree about what `H⁻¹` means.
267    pub fn sensitivity(&self) -> crate::sensitivity::FitSensitivity<'_> {
268        crate::sensitivity::FitSensitivity::from_projected(&self.u_s, &self.h_proj_inverse)
269    }
270
271    /// Build the **constrained pseudo-inverse kernel**
272    /// `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`
273    /// from this penalty-projected kernel `K_S` and the *active* row block
274    /// `A_act` of the joint linear inequality constraint matrix.
275    ///
276    /// `K_T` is the **Moore-Penrose pseudo-inverse of `H` restricted to
277    /// `T = range(S₊) ∩ ker(A_act)`** — the smooth manifold the inner
278    /// solver actually moves on at a constrained-stationary point. It is
279    /// exactly the kernel that solves the per-coordinate saddle-point
280    /// IFT system
281    ///
282    /// ```text
283    ///   [ H   Aᵀ_act ] [ ∂β/∂ρ_k ]   [ −a_k ]
284    ///   [ A_act  0   ] [ ∂λ/∂ρ_k ] = [   0  ]
285    /// ```
286    ///
287    /// with `∂β/∂ρ_k = −K_T · a_k`. Using `K_T` for the per-coordinate
288    /// mode response `v_k` makes the outer gradient the *exact* derivative
289    /// of the projected Laplace cost `log|U_Tᵀ H U_T|`, where `U_T` is an
290    /// orthonormal basis of `T` — the marginal-likelihood determinant the
291    /// inner is actually drawing on.
292    ///
293    /// Returns a [`ConstrainedSubspaceKernel`] handle that caches the
294    /// small `k_active × k_active` Schur complement so subsequent
295    /// `apply_pseudo_inverse` calls for different RHS reuse it. When the
296    /// active set is empty the handle degrades to a pass-through over
297    /// `self` (no extra work).
298    ///
299    /// Total precompute cost: `k_active` calls to
300    /// [`Self::apply_pseudo_inverse`] (one per active row) plus a
301    /// `k_active × k_active` Cholesky/QR. Per-vector `apply` cost: one
302    /// `K_S` apply + one `k_active × p` matvec + one small triangular
303    /// solve + one `p × k_active` matvec.
304    pub fn with_active_constraints<'a>(
305        &'a self,
306        a_act: ndarray::ArrayView2<'a, f64>,
307    ) -> ConstrainedSubspaceKernel<'a> {
308        let k_active = a_act.nrows();
309        if k_active == 0 {
310            return ConstrainedSubspaceKernel {
311                kernel: self,
312                z: Array2::zeros((0, self.u_s.nrows())),
313                a_act,
314                m_inv: Array2::zeros((0, 0)),
315                k_active: 0,
316            };
317        }
318        // Z = K_S · Aᵀ_act,  shape (p × k_active).
319        let p = self.u_s.nrows();
320        let mut z = Array2::<f64>::zeros((p, k_active));
321        for j in 0..k_active {
322            let a_row = a_act.row(j).to_owned();
323            let k_s_a_row = self.apply_pseudo_inverse(&a_row);
324            z.column_mut(j).assign(&k_s_a_row);
325        }
326        // M = A_act · Z   (shape k_active × k_active, symmetric PSD on
327        // range(K_S) ∩ image(A_actᵀ); on a rank-deficient overlap we
328        // add a tiny diagonal regulariser so the inversion remains
329        // bounded — same noise-floor strategy as elsewhere in this
330        // module).
331        let mut m = a_act.dot(&z);
332        // Symmetrise (numerical noise from the matmul leaves small skew).
333        for i in 0..k_active {
334            for j in 0..i {
335                let avg = 0.5 * (m[[i, j]] + m[[j, i]]);
336                m[[i, j]] = avg;
337                m[[j, i]] = avg;
338            }
339        }
340        // Eigendecomposition-based Moore-Penrose pseudo-inverse with a
341        // relative spectral cutoff. This is the principled treatment of
342        // rank deficiency in `A_act` when restricted to `range(S₊)`:
343        // some active constraint rows may be linearly dependent after
344        // projection (e.g. several monotonicity rows pinning the same
345        // flat region all reduce to the same row in `range(S₊)`).
346        // A plain `M⁻¹` then amplifies near-null directions; the
347        // pseudo-inverse drops them at a relative threshold
348        // `tol = eps · k_active · σ_max(M)`, which is the standard
349        // NumPy/LAPACK convention and exactly what Codex flagged as
350        // necessary in the math review.
351        let (evals, evecs) = m
352            .eigh(faer::Side::Lower)
353            .unwrap_or_else(|_| (Array1::<f64>::zeros(k_active), Array2::<f64>::eye(k_active)));
354        let sigma_max = evals.iter().copied().fold(0.0_f64, f64::max).max(0.0);
355        let tol = f64::EPSILON * (k_active as f64) * sigma_max.max(1.0);
356        let mut m_inv = Array2::<f64>::zeros((k_active, k_active));
357        let mut dropped = 0usize;
358        for q in 0..k_active {
359            if evals[q] > tol {
360                let inv_sigma = 1.0 / evals[q];
361                // Outer product u_q u_qᵀ scaled by 1/σ_q.
362                for i in 0..k_active {
363                    for j in 0..k_active {
364                        m_inv[[i, j]] += inv_sigma * evecs[[i, q]] * evecs[[j, q]];
365                    }
366                }
367            } else {
368                dropped += 1;
369            }
370        }
371        if dropped > 0 {
372            log::debug!(
373                "[constrained-subspace kernel] dropped {} of {} active-constraint directions \
374                 (rank-deficient on range(S₊)); pseudo-inverse threshold = {:.3e}",
375                dropped,
376                k_active,
377                tol,
378            );
379        }
380        ConstrainedSubspaceKernel {
381            kernel: self,
382            z,
383            a_act,
384            m_inv,
385            k_active,
386        }
387    }
388}
389
390/// Per-evaluation handle that combines a penalty-projected
391/// [`PenaltySubspaceTrace`] with an active inequality-constraint block,
392/// producing the constraint-aware pseudo-inverse
393/// `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. See
394/// [`PenaltySubspaceTrace::with_active_constraints`] for the math.
395///
396/// Caches the small `k_active × k_active` Schur inverse so subsequent
397/// per-coordinate `apply` calls only do `O(p · k_active)` work each.
398pub struct ConstrainedSubspaceKernel<'a> {
399    pub(crate) kernel: &'a PenaltySubspaceTrace,
400    /// `Z = K_S · Aᵀ_act`, shape `(p × k_active)`.
401    pub(crate) z: Array2<f64>,
402    /// Active-row block of the joint constraint matrix.
403    pub(crate) a_act: ndarray::ArrayView2<'a, f64>,
404    /// `(A_act · K_S · Aᵀ_act)⁻¹`, shape `(k_active × k_active)`.
405    pub(crate) m_inv: Array2<f64>,
406    pub(crate) k_active: usize,
407}
408
409impl<'a> ConstrainedSubspaceKernel<'a> {
410    /// Apply `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S` to `a`. The result
411    /// lies in `range(S₊) ∩ ker(A_act)` — the smooth manifold the inner
412    /// solver actually moves on at a constrained-stationary point.
413    pub fn apply_pseudo_inverse(&self, a: &Array1<f64>) -> Array1<f64> {
414        let v_s = self.kernel.apply_pseudo_inverse(a);
415        if self.k_active == 0 {
416            return v_s;
417        }
418        // mu = M_inv · (A_act · v_s)
419        let t = self.a_act.dot(&v_s);
420        let mu = self.m_inv.dot(&t);
421        // v = v_s - Z · mu
422        let correction = self.z.dot(&mu);
423        v_s - &correction
424    }
425
426    /// Whether any active constraints contribute (when false this kernel
427    /// is identical to the bare [`PenaltySubspaceTrace::apply_pseudo_inverse`]).
428    pub fn has_active_constraints(&self) -> bool {
429        self.k_active > 0
430    }
431}
432
433/// Tangency self-audit gate for the constrained mode-response arm: the
434/// emitted `v = K_T · rhs` must lie in `ker(A_act)` by construction, so
435/// `|A_act · v|` is compared against this fraction of the cancellation
436/// scale `|A_act| · |v|` (per active row). Generous enough that legitimate
437/// rank-deficient active sets (whose dropped Schur directions leave
438/// ε-level residue, see [`PenaltySubspaceTrace::with_active_constraints`])
439/// never trip it; the historical failure mode it guards (the d6b17a7f
440/// `1/σ_min ≈ 10¹²` null-space amplification) exceeds it by six orders.
441pub(crate) const THETA_MODE_RESPONSE_TANGENCY_GATE: f64 = 1e-6;
442
443/// #931 migration pass 2 — the ThetaDirection shared-drift pass: the ONE
444/// per-evaluation selection of the IFT mode-response kernel behind every
445/// `dβ̂/dθ = −K · ∂g/∂θ` solve in the outer gradient/Hessian assembly.
446///
447/// Before this object existed, four sites (the gradient solve stack in
448/// `reml_laml_evaluate`, the ρ- and ext-coordinate standalone fallbacks in
449/// `compute_outer_hessian`, and the standalone fallback in
450/// `build_outer_hessian_operator`) each re-implemented the same selection
451/// rule by hand, with comments warning each other to "mirror the
452/// selection exactly, otherwise the operator-form Hessian and dense
453/// materialization disagree on every entry". A hand-copied convention every
454/// caller must remember is precisely the objective↔gradient desync surface
455/// (#748/#752/#901 class) the criterion-as-atoms architecture (#931)
456/// removes. Now the rule is DECIDED in exactly one constructor and every
457/// consumer is a contraction of the same kernel object — the gradient and
458/// both Hessian representations structurally cannot pick different
459/// inverses for the same evaluation point:
460///
461///   * Active inequality constraints recorded on the inner solution → the
462///     lifted constrained kernel
463///     `K_T = K_S − K_S Aᵀ (A K_S Aᵀ)⁻¹ A K_S`. The inner SCOP solver
464///     clamps β̂(θ) onto `T = range(S₊) ∩ ker(A_act)`, so the true IFT
465///     derivative lives in T and the lifted kernel gives the minimum-norm
466///     solution there; the full solve would amplify any RHS component
467///     outside `range(H_free)` by `1/σ_min(H_active_normal)` — ~10¹² on
468///     large-scale survival marginal-slope (commit d6b17a7f).
469///   * Otherwise → the FULL Hessian solve `v = H⁻¹ · rhs`, even when the
470///     LAML cost surface uses the projected logdet `½ log|U_Sᵀ H U_S|`:
471///     the inner solver converges β̂ ∈ R^p in the unconstrained full
472///     space, so the IFT identity demands the full inverse, and the
473///     penalty-subspace projection acts on the TRACE contraction side
474///     only. Routing through bare `K_S` here would discard the
475///     `null(S₊)` component of dβ̂/dθ — the near-separable ψ-gradient
476///     blow-up pinned by `duchon_probit_per_row_dnu_dpsi_fd_vs_analytic`.
477///
478/// The two emission shapes (`respond_one` per-vector, `respond_stack`
479/// batched) exist because the call sites have different RHS layouts and
480/// their solve shapes must stay bit-identical to the pre-port assembly
481/// (per-column GEMV vs blocked GEMM sum in different orders) — NOT because
482/// a site may choose a different kernel. Both shapes dispatch on the same
483/// stored decision.
484///
485/// This is the `Sensitivity`-operator half of the `ThetaDirection`
486/// calculus sketched in `atoms.rs`: the direction's `β̇` channel is a
487/// contraction of this kernel, so atoms borrowing the shared drift can no
488/// longer see a different chain rule than their neighbors.
489pub(crate) struct ThetaModeResponseKernel<'s> {
490    pub(crate) hop: &'s dyn HessianOperator,
491    /// `Some` exactly when the selection rule chose the lifted constrained
492    /// kernel. Built once per evaluation point (one Schur-complement
493    /// factorization), shared by every gradient/Hessian consumer — the
494    /// pre-port code rebuilt it per consumer site.
495    pub(crate) constrained: Option<ConstrainedSubspaceKernel<'s>>,
496}
497
498impl<'s> ThetaModeResponseKernel<'s> {
499    /// The ONE place the mode-response kernel selection rule lives.
500    pub(crate) fn select(
501        subspace: Option<&'s PenaltySubspaceTrace>,
502        active_constraints: Option<&'s ActiveLinearConstraintBlock>,
503        hop: &'s dyn HessianOperator,
504    ) -> Self {
505        let constrained = match (subspace, active_constraints) {
506            (Some(kernel), Some(block)) => {
507                let ck = kernel.with_active_constraints(block.a.view());
508                ck.has_active_constraints().then_some(ck)
509            }
510            _ => None,
511        };
512        Self { hop, constrained }
513    }
514
515    /// Mode response for one right-hand side: `K_T · rhs` under active
516    /// constraints, `H⁻¹ · rhs` (single-RHS `solve`) otherwise. Used by the
517    /// per-coordinate fallbacks whose pre-port assembly solved one vector at
518    /// a time — the single-RHS shape is preserved bit-identically.
519    pub(crate) fn respond_one(&self, rhs: &Array1<f64>) -> Array1<f64> {
520        match self.constrained.as_ref() {
521            Some(ck) => {
522                let v = ck.apply_pseudo_inverse(rhs);
523                self.certify_tangency(ck, &v);
524                v
525            }
526            None => self.hop.solve(rhs),
527        }
528    }
529
530    /// Mode responses for a column-stacked RHS block: per-column `K_T`
531    /// applies under active constraints (the lifted kernel has no blocked
532    /// form), one batched `solve_multi` otherwise (BLAS-3 / GPU batched
533    /// route) — exactly the shapes the stacked call sites used pre-port.
534    /// Zero RHS columns (box-masked ρ coordinates) emit exact zeros through
535    /// either arm, since both kernels are linear.
536    pub(crate) fn respond_stack(&self, rhs_stack: &Array2<f64>) -> Array2<f64> {
537        match self.constrained.as_ref() {
538            Some(ck) => {
539                let mut out = Array2::<f64>::zeros(rhs_stack.raw_dim());
540                for (j, col) in rhs_stack.columns().into_iter().enumerate() {
541                    let v = ck.apply_pseudo_inverse(&col.to_owned());
542                    self.certify_tangency(ck, &v);
543                    out.column_mut(j).assign(&v);
544                }
545                out
546            }
547            None => self.hop.solve_multi(rhs_stack),
548        }
549    }
550
551    /// Per-atom certify body (#934 FD-self-audit pattern, applied as an
552    /// exact structural invariant): every constrained emission must lie in
553    /// `ker(A_act)` — `A_act · v = 0` is the defining property of `K_T`'s
554    /// range, so a violation can only mean the kernel object and the
555    /// emission desynced. Checked on every constrained response (cost
556    /// `O(k_active · p)`, negligible next to the apply itself) against the
557    /// row-wise cancellation scale `|A_act| · |v|`; a violation does not
558    /// fail the fit — it names the atom loudly in the `[CERTIFICATE]`
559    /// stream, exactly like the outer-optimum criterion audit. The
560    /// unconstrained arm carries no separate certify: its coherence with
561    /// the criterion VALUE is audited end-to-end by the #934
562    /// `CriterionCertificate` at every returned optimum.
563    pub(crate) fn certify_tangency(&self, ck: &ConstrainedSubspaceKernel<'_>, v: &Array1<f64>) {
564        let residual = ck.a_act.dot(v);
565        for (row, r) in residual.iter().enumerate() {
566            let scale: f64 = ck
567                .a_act
568                .row(row)
569                .iter()
570                .zip(v.iter())
571                .map(|(a, x)| (a * x).abs())
572                .sum();
573            if r.abs() > THETA_MODE_RESPONSE_TANGENCY_GATE * (scale + f64::EPSILON) {
574                log::warn!(
575                    "[CERTIFICATE warning] atom \"theta_mode_response\": constrained IFT \
576                     mode response left ker(A_act) — active row {row} residual {:.3e} \
577                     exceeds gate {:.1e}·{:.3e}; the lifted kernel K_T and its emission \
578                     have desynced (#931 pass-2 invariant)",
579                    r.abs(),
580                    THETA_MODE_RESPONSE_TANGENCY_GATE,
581                    scale,
582                );
583            }
584        }
585    }
586}
587
588impl ProjectedKktResidual {
589    pub(crate) fn projected_into_reduced_range(
590        &self,
591        kernel: &PenaltySubspaceTrace,
592    ) -> Result<Self, String> {
593        match self.subspace {
594            KktResidualSubspace::ReducedRange => Ok(self.clone()),
595            KktResidualSubspace::ActiveProjected => {
596                let reduced_residual = kernel.project_onto_subspace(&self.residual);
597                let dropped_inf = self
598                    .residual
599                    .iter()
600                    .zip(reduced_residual.iter())
601                    .map(|(full, reduced)| (full - reduced).abs())
602                    .fold(0.0_f64, f64::max);
603                let residual_inf = self
604                    .residual
605                    .iter()
606                    .map(|value| value.abs())
607                    .fold(0.0_f64, f64::max);
608                // Default mixed absolute/relative tolerance for the dropped-mass
609                // gate when the caller supplies no explicit `residual_tol`:
610                // ~1e-10 scaled by `1 + ‖r‖∞` so it degrades gracefully with the
611                // residual magnitude.
612                const DEFAULT_KKT_RESIDUAL_REL_TOL: f64 = 1e-10;
613                let tol = self
614                    .residual_tol
615                    .unwrap_or_else(|| DEFAULT_KKT_RESIDUAL_REL_TOL * (1.0 + residual_inf));
616                let gate = tol;
617                if dropped_inf > gate {
618                    return Err(format!(
619                        "projected KKT residual contains unresolved mass outside the reduced \
620                         Hessian/penalty range: |r_A - r_R|∞={dropped_inf:.3e} > tol={gate:.3e}; \
621                         range-projected IFT correction is valid only after the null direction is \
622                         explicitly removed/fixed or after the active-projected residual is small"
623                    ));
624                }
625                let mut reduced = Self::from_reduced_range(reduced_residual);
626                reduced.residual_tol = self.residual_tol;
627                reduced.free_rank = self.free_rank;
628                Ok(reduced)
629            }
630        }
631    }
632
633    /// The KKT-stationarity tolerance the inner solver applied at the
634    /// producing iterate. Returns `None` when the residual was built
635    /// from a legacy site that hasn't been threaded yet; downstream
636    /// consumers should substitute `f64::NAN` in that case.
637    pub fn residual_tol(&self) -> Option<f64> {
638        self.residual_tol
639    }
640
641    /// Dimensionality of the free subspace: `total_p - active_set_size`
642    /// at the producing iterate. `None` from legacy construction sites.
643    pub fn free_rank(&self) -> Option<usize> {
644        self.free_rank
645    }
646}